End of training
Browse files- README.md +3 -1
- all_results.json +13 -13
- train_results.json +13 -13
- trainer_state.json +131 -131
- training_loss.png +0 -0
README.md
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
---
|
| 2 |
library_name: transformers
|
|
|
|
|
|
|
| 3 |
tags:
|
| 4 |
- llama-factory
|
| 5 |
- full
|
|
@@ -14,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
|
|
| 14 |
|
| 15 |
# swesmith-stack-undr7030
|
| 16 |
|
| 17 |
-
This model
|
| 18 |
|
| 19 |
## Model description
|
| 20 |
|
|
|
|
| 1 |
---
|
| 2 |
library_name: transformers
|
| 3 |
+
license: apache-2.0
|
| 4 |
+
base_model: Qwen/Qwen3-8B
|
| 5 |
tags:
|
| 6 |
- llama-factory
|
| 7 |
- full
|
|
|
|
| 16 |
|
| 17 |
# swesmith-stack-undr7030
|
| 18 |
|
| 19 |
+
This model is a fine-tuned version of [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) on the penfever/GLM-4.6-swesmith-32ep-131k-nosumm-reasoning and the penfever/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning datasets.
|
| 20 |
|
| 21 |
## Model description
|
| 22 |
|
all_results.json
CHANGED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
-
"achieved_tflops_per_gpu":
|
| 3 |
-
"achieved_tflops_per_gpu_theoretical":
|
| 4 |
-
"epoch":
|
| 5 |
"loss_nan_ranks": 0,
|
| 6 |
-
"loss_rank_avg": 0.
|
| 7 |
-
"mfu_percent":
|
| 8 |
-
"mfu_percent_theoretical":
|
| 9 |
-
"total_flos": 2.
|
| 10 |
-
"train_loss": 0.
|
| 11 |
-
"train_runtime":
|
| 12 |
-
"train_samples_per_second":
|
| 13 |
-
"train_steps_per_second":
|
| 14 |
-
"valid_targets_mean":
|
| 15 |
-
"valid_targets_min":
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"achieved_tflops_per_gpu": 78.2183808177642,
|
| 3 |
+
"achieved_tflops_per_gpu_theoretical": 3442.1110358283386,
|
| 4 |
+
"epoch": 6.999053926206244,
|
| 5 |
"loss_nan_ranks": 0,
|
| 6 |
+
"loss_rank_avg": 0.14962854981422424,
|
| 7 |
+
"mfu_percent": 25.0699938518475,
|
| 8 |
+
"mfu_percent_theoretical": 1103.2407166116468,
|
| 9 |
+
"total_flos": 2.2812121694270915e+18,
|
| 10 |
+
"train_loss": 0.004161321395542221,
|
| 11 |
+
"train_runtime": 3645.582,
|
| 12 |
+
"train_samples_per_second": 16.237,
|
| 13 |
+
"train_steps_per_second": 1.016,
|
| 14 |
+
"valid_targets_mean": 6778.8,
|
| 15 |
+
"valid_targets_min": 4131
|
| 16 |
}
|
train_results.json
CHANGED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
{
|
| 2 |
-
"achieved_tflops_per_gpu":
|
| 3 |
-
"achieved_tflops_per_gpu_theoretical":
|
| 4 |
-
"epoch":
|
| 5 |
"loss_nan_ranks": 0,
|
| 6 |
-
"loss_rank_avg": 0.
|
| 7 |
-
"mfu_percent":
|
| 8 |
-
"mfu_percent_theoretical":
|
| 9 |
-
"total_flos": 2.
|
| 10 |
-
"train_loss": 0.
|
| 11 |
-
"train_runtime":
|
| 12 |
-
"train_samples_per_second":
|
| 13 |
-
"train_steps_per_second":
|
| 14 |
-
"valid_targets_mean":
|
| 15 |
-
"valid_targets_min":
|
| 16 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"achieved_tflops_per_gpu": 78.2183808177642,
|
| 3 |
+
"achieved_tflops_per_gpu_theoretical": 3442.1110358283386,
|
| 4 |
+
"epoch": 6.999053926206244,
|
| 5 |
"loss_nan_ranks": 0,
|
| 6 |
+
"loss_rank_avg": 0.14962854981422424,
|
| 7 |
+
"mfu_percent": 25.0699938518475,
|
| 8 |
+
"mfu_percent_theoretical": 1103.2407166116468,
|
| 9 |
+
"total_flos": 2.2812121694270915e+18,
|
| 10 |
+
"train_loss": 0.004161321395542221,
|
| 11 |
+
"train_runtime": 3645.582,
|
| 12 |
+
"train_samples_per_second": 16.237,
|
| 13 |
+
"train_steps_per_second": 1.016,
|
| 14 |
+
"valid_targets_mean": 6778.8,
|
| 15 |
+
"valid_targets_min": 4131
|
| 16 |
}
|
trainer_state.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 500,
|
| 7 |
"global_step": 3702,
|
| 8 |
"is_hyper_param_search": false,
|
|
@@ -7930,237 +7930,237 @@
|
|
| 7930 |
"valid_targets_min": 1293
|
| 7931 |
},
|
| 7932 |
{
|
| 7933 |
-
"epoch": 6.
|
| 7934 |
-
"grad_norm": 0.
|
| 7935 |
"learning_rate": 8.706523924000066e-08,
|
| 7936 |
-
"loss": 0.
|
| 7937 |
"loss_nan_ranks": 0,
|
| 7938 |
-
"loss_rank_avg": 0.
|
| 7939 |
"step": 3605,
|
| 7940 |
-
"valid_targets_mean":
|
| 7941 |
-
"valid_targets_min":
|
| 7942 |
},
|
| 7943 |
{
|
| 7944 |
-
"epoch": 6.
|
| 7945 |
-
"grad_norm": 0.
|
| 7946 |
"learning_rate": 7.849846547778983e-08,
|
| 7947 |
-
"loss": 0.
|
| 7948 |
"loss_nan_ranks": 0,
|
| 7949 |
-
"loss_rank_avg": 0.
|
| 7950 |
"step": 3610,
|
| 7951 |
-
"valid_targets_mean":
|
| 7952 |
-
"valid_targets_min":
|
| 7953 |
},
|
| 7954 |
{
|
| 7955 |
-
"epoch": 6.
|
| 7956 |
-
"grad_norm": 0.
|
| 7957 |
"learning_rate": 7.03744340325252e-08,
|
| 7958 |
-
"loss": 0.
|
| 7959 |
"loss_nan_ranks": 0,
|
| 7960 |
-
"loss_rank_avg": 0.
|
| 7961 |
"step": 3615,
|
| 7962 |
-
"valid_targets_mean":
|
| 7963 |
-
"valid_targets_min":
|
| 7964 |
},
|
| 7965 |
{
|
| 7966 |
-
"epoch": 6.
|
| 7967 |
-
"grad_norm": 0.
|
| 7968 |
"learning_rate": 6.269332545548068e-08,
|
| 7969 |
-
"loss": 0.
|
| 7970 |
"loss_nan_ranks": 0,
|
| 7971 |
-
"loss_rank_avg": 0.
|
| 7972 |
"step": 3620,
|
| 7973 |
-
"valid_targets_mean":
|
| 7974 |
-
"valid_targets_min":
|
| 7975 |
},
|
| 7976 |
{
|
| 7977 |
-
"epoch": 6.
|
| 7978 |
-
"grad_norm": 0.
|
| 7979 |
"learning_rate": 5.5455310454259894e-08,
|
| 7980 |
-
"loss": 0.
|
| 7981 |
"loss_nan_ranks": 0,
|
| 7982 |
-
"loss_rank_avg": 0.
|
| 7983 |
"step": 3625,
|
| 7984 |
-
"valid_targets_mean":
|
| 7985 |
-
"valid_targets_min":
|
| 7986 |
},
|
| 7987 |
{
|
| 7988 |
-
"epoch": 6.
|
| 7989 |
-
"grad_norm": 0.
|
| 7990 |
"learning_rate": 4.866054988900581e-08,
|
| 7991 |
-
"loss": 0.
|
| 7992 |
"loss_nan_ranks": 0,
|
| 7993 |
-
"loss_rank_avg": 0.
|
| 7994 |
"step": 3630,
|
| 7995 |
-
"valid_targets_mean":
|
| 7996 |
-
"valid_targets_min":
|
| 7997 |
},
|
| 7998 |
{
|
| 7999 |
-
"epoch": 6.
|
| 8000 |
-
"grad_norm": 0.
|
| 8001 |
"learning_rate": 4.230919476881479e-08,
|
| 8002 |
-
"loss": 0.
|
| 8003 |
"loss_nan_ranks": 0,
|
| 8004 |
-
"loss_rank_avg": 0.
|
| 8005 |
"step": 3635,
|
| 8006 |
-
"valid_targets_mean":
|
| 8007 |
-
"valid_targets_min":
|
| 8008 |
},
|
| 8009 |
{
|
| 8010 |
-
"epoch": 6.
|
| 8011 |
-
"grad_norm": 0.
|
| 8012 |
"learning_rate": 3.640138624839695e-08,
|
| 8013 |
-
"loss": 0.
|
| 8014 |
"loss_nan_ranks": 0,
|
| 8015 |
-
"loss_rank_avg": 0.
|
| 8016 |
"step": 3640,
|
| 8017 |
-
"valid_targets_mean":
|
| 8018 |
-
"valid_targets_min":
|
| 8019 |
},
|
| 8020 |
{
|
| 8021 |
-
"epoch": 6.
|
| 8022 |
-
"grad_norm": 0.
|
| 8023 |
"learning_rate": 3.093725562492544e-08,
|
| 8024 |
-
"loss": 0.
|
| 8025 |
"loss_nan_ranks": 0,
|
| 8026 |
-
"loss_rank_avg": 0.
|
| 8027 |
"step": 3645,
|
| 8028 |
-
"valid_targets_mean":
|
| 8029 |
-
"valid_targets_min":
|
| 8030 |
},
|
| 8031 |
{
|
| 8032 |
-
"epoch": 6.
|
| 8033 |
-
"grad_norm": 0.
|
| 8034 |
"learning_rate": 2.591692433511872e-08,
|
| 8035 |
-
"loss": 0.
|
| 8036 |
"loss_nan_ranks": 0,
|
| 8037 |
-
"loss_rank_avg": 0.
|
| 8038 |
"step": 3650,
|
| 8039 |
-
"valid_targets_mean":
|
| 8040 |
-
"valid_targets_min":
|
| 8041 |
},
|
| 8042 |
{
|
| 8043 |
-
"epoch": 6.
|
| 8044 |
-
"grad_norm": 0.
|
| 8045 |
"learning_rate": 2.1340503952551606e-08,
|
| 8046 |
-
"loss": 0.
|
| 8047 |
"loss_nan_ranks": 0,
|
| 8048 |
-
"loss_rank_avg": 0.
|
| 8049 |
"step": 3655,
|
| 8050 |
-
"valid_targets_mean":
|
| 8051 |
-
"valid_targets_min":
|
| 8052 |
},
|
| 8053 |
{
|
| 8054 |
-
"epoch": 6.
|
| 8055 |
-
"grad_norm": 0.
|
| 8056 |
"learning_rate": 1.720809618516839e-08,
|
| 8057 |
-
"loss": 0.
|
| 8058 |
"loss_nan_ranks": 0,
|
| 8059 |
-
"loss_rank_avg": 0.
|
| 8060 |
"step": 3660,
|
| 8061 |
-
"valid_targets_mean":
|
| 8062 |
-
"valid_targets_min":
|
| 8063 |
},
|
| 8064 |
{
|
| 8065 |
-
"epoch": 6.
|
| 8066 |
-
"grad_norm": 0.
|
| 8067 |
"learning_rate": 1.351979287302463e-08,
|
| 8068 |
-
"loss": 0.
|
| 8069 |
"loss_nan_ranks": 0,
|
| 8070 |
-
"loss_rank_avg": 0.
|
| 8071 |
"step": 3665,
|
| 8072 |
-
"valid_targets_mean":
|
| 8073 |
-
"valid_targets_min":
|
| 8074 |
},
|
| 8075 |
{
|
| 8076 |
-
"epoch": 6.
|
| 8077 |
-
"grad_norm": 0.
|
| 8078 |
"learning_rate": 1.0275675986242128e-08,
|
| 8079 |
-
"loss": 0.
|
| 8080 |
"loss_nan_ranks": 0,
|
| 8081 |
-
"loss_rank_avg": 0.
|
| 8082 |
"step": 3670,
|
| 8083 |
-
"valid_targets_mean":
|
| 8084 |
-
"valid_targets_min":
|
| 8085 |
},
|
| 8086 |
{
|
| 8087 |
-
"epoch": 6.
|
| 8088 |
-
"grad_norm": 0.
|
| 8089 |
"learning_rate": 7.475817623194826e-09,
|
| 8090 |
-
"loss": 0.
|
| 8091 |
"loss_nan_ranks": 0,
|
| 8092 |
-
"loss_rank_avg": 0.
|
| 8093 |
"step": 3675,
|
| 8094 |
-
"valid_targets_mean":
|
| 8095 |
-
"valid_targets_min":
|
| 8096 |
},
|
| 8097 |
{
|
| 8098 |
-
"epoch": 6.
|
| 8099 |
-
"grad_norm": 0.
|
| 8100 |
"learning_rate": 5.120280008901191e-09,
|
| 8101 |
-
"loss": 0.
|
| 8102 |
"loss_nan_ranks": 0,
|
| 8103 |
-
"loss_rank_avg": 0.
|
| 8104 |
"step": 3680,
|
| 8105 |
-
"valid_targets_mean":
|
| 8106 |
-
"valid_targets_min":
|
| 8107 |
},
|
| 8108 |
{
|
| 8109 |
-
"epoch": 6.
|
| 8110 |
-
"grad_norm": 0.
|
| 8111 |
"learning_rate": 3.2091154936386705e-09,
|
| 8112 |
-
"loss": 0.
|
| 8113 |
"loss_nan_ranks": 0,
|
| 8114 |
-
"loss_rank_avg": 0.
|
| 8115 |
"step": 3685,
|
| 8116 |
-
"valid_targets_mean":
|
| 8117 |
-
"valid_targets_min":
|
| 8118 |
},
|
| 8119 |
{
|
| 8120 |
-
"epoch": 6.
|
| 8121 |
-
"grad_norm": 0.
|
| 8122 |
"learning_rate": 1.7423665517868338e-09,
|
| 8123 |
-
"loss": 0.
|
| 8124 |
"loss_nan_ranks": 0,
|
| 8125 |
-
"loss_rank_avg": 0.
|
| 8126 |
"step": 3690,
|
| 8127 |
-
"valid_targets_mean":
|
| 8128 |
-
"valid_targets_min":
|
| 8129 |
},
|
| 8130 |
{
|
| 8131 |
-
"epoch": 6.
|
| 8132 |
-
"grad_norm": 0.
|
| 8133 |
"learning_rate": 7.200657808792422e-10,
|
| 8134 |
-
"loss": 0.
|
| 8135 |
"loss_nan_ranks": 0,
|
| 8136 |
-
"loss_rank_avg": 0.
|
| 8137 |
"step": 3695,
|
| 8138 |
-
"valid_targets_mean":
|
| 8139 |
-
"valid_targets_min":
|
| 8140 |
},
|
| 8141 |
{
|
| 8142 |
-
"epoch": 6.
|
| 8143 |
-
"grad_norm": 0.
|
| 8144 |
"learning_rate": 1.4223590088180416e-10,
|
| 8145 |
-
"loss": 0.
|
| 8146 |
"loss_nan_ranks": 0,
|
| 8147 |
-
"loss_rank_avg": 0.
|
| 8148 |
"step": 3700,
|
| 8149 |
-
"valid_targets_mean":
|
| 8150 |
-
"valid_targets_min":
|
| 8151 |
},
|
| 8152 |
{
|
| 8153 |
-
"epoch":
|
| 8154 |
"loss_nan_ranks": 0,
|
| 8155 |
-
"loss_rank_avg": 0.
|
| 8156 |
"step": 3702,
|
| 8157 |
-
"total_flos": 2.
|
| 8158 |
-
"train_loss": 0.
|
| 8159 |
-
"train_runtime":
|
| 8160 |
-
"train_samples_per_second":
|
| 8161 |
-
"train_steps_per_second":
|
| 8162 |
-
"valid_targets_mean":
|
| 8163 |
-
"valid_targets_min":
|
| 8164 |
}
|
| 8165 |
],
|
| 8166 |
"logging_steps": 5,
|
|
@@ -8180,7 +8180,7 @@
|
|
| 8180 |
"attributes": {}
|
| 8181 |
}
|
| 8182 |
},
|
| 8183 |
-
"total_flos": 2.
|
| 8184 |
"train_batch_size": 1,
|
| 8185 |
"trial_name": null,
|
| 8186 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 6.999053926206244,
|
| 6 |
"eval_steps": 500,
|
| 7 |
"global_step": 3702,
|
| 8 |
"is_hyper_param_search": false,
|
|
|
|
| 7930 |
"valid_targets_min": 1293
|
| 7931 |
},
|
| 7932 |
{
|
| 7933 |
+
"epoch": 6.815515610217597,
|
| 7934 |
+
"grad_norm": 0.41971865894051924,
|
| 7935 |
"learning_rate": 8.706523924000066e-08,
|
| 7936 |
+
"loss": 0.1448,
|
| 7937 |
"loss_nan_ranks": 0,
|
| 7938 |
+
"loss_rank_avg": 0.06027888506650925,
|
| 7939 |
"step": 3605,
|
| 7940 |
+
"valid_targets_mean": 6555.0,
|
| 7941 |
+
"valid_targets_min": 2788
|
| 7942 |
},
|
| 7943 |
{
|
| 7944 |
+
"epoch": 6.824976348155156,
|
| 7945 |
+
"grad_norm": 0.4407706789613994,
|
| 7946 |
"learning_rate": 7.849846547778983e-08,
|
| 7947 |
+
"loss": 0.1496,
|
| 7948 |
"loss_nan_ranks": 0,
|
| 7949 |
+
"loss_rank_avg": 0.06519781798124313,
|
| 7950 |
"step": 3610,
|
| 7951 |
+
"valid_targets_mean": 6884.4,
|
| 7952 |
+
"valid_targets_min": 1265
|
| 7953 |
},
|
| 7954 |
{
|
| 7955 |
+
"epoch": 6.8344370860927155,
|
| 7956 |
+
"grad_norm": 0.4978073691618193,
|
| 7957 |
"learning_rate": 7.03744340325252e-08,
|
| 7958 |
+
"loss": 0.1498,
|
| 7959 |
"loss_nan_ranks": 0,
|
| 7960 |
+
"loss_rank_avg": 0.07424402236938477,
|
| 7961 |
"step": 3615,
|
| 7962 |
+
"valid_targets_mean": 6403.1,
|
| 7963 |
+
"valid_targets_min": 4674
|
| 7964 |
},
|
| 7965 |
{
|
| 7966 |
+
"epoch": 6.843897824030274,
|
| 7967 |
+
"grad_norm": 0.5085529248955364,
|
| 7968 |
"learning_rate": 6.269332545548068e-08,
|
| 7969 |
+
"loss": 0.1473,
|
| 7970 |
"loss_nan_ranks": 0,
|
| 7971 |
+
"loss_rank_avg": 0.08670083433389664,
|
| 7972 |
"step": 3620,
|
| 7973 |
+
"valid_targets_mean": 4922.5,
|
| 7974 |
+
"valid_targets_min": 1785
|
| 7975 |
},
|
| 7976 |
{
|
| 7977 |
+
"epoch": 6.853358561967833,
|
| 7978 |
+
"grad_norm": 0.4344651976137186,
|
| 7979 |
"learning_rate": 5.5455310454259894e-08,
|
| 7980 |
+
"loss": 0.1514,
|
| 7981 |
"loss_nan_ranks": 0,
|
| 7982 |
+
"loss_rank_avg": 0.07209931313991547,
|
| 7983 |
"step": 3625,
|
| 7984 |
+
"valid_targets_mean": 7308.1,
|
| 7985 |
+
"valid_targets_min": 4659
|
| 7986 |
},
|
| 7987 |
{
|
| 7988 |
+
"epoch": 6.862819299905393,
|
| 7989 |
+
"grad_norm": 0.4413038947876362,
|
| 7990 |
"learning_rate": 4.866054988900581e-08,
|
| 7991 |
+
"loss": 0.1464,
|
| 7992 |
"loss_nan_ranks": 0,
|
| 7993 |
+
"loss_rank_avg": 0.06930118799209595,
|
| 7994 |
"step": 3630,
|
| 7995 |
+
"valid_targets_mean": 6234.8,
|
| 7996 |
+
"valid_targets_min": 976
|
| 7997 |
},
|
| 7998 |
{
|
| 7999 |
+
"epoch": 6.872280037842952,
|
| 8000 |
+
"grad_norm": 0.48107253716280135,
|
| 8001 |
"learning_rate": 4.230919476881479e-08,
|
| 8002 |
+
"loss": 0.1487,
|
| 8003 |
"loss_nan_ranks": 0,
|
| 8004 |
+
"loss_rank_avg": 0.06881730258464813,
|
| 8005 |
"step": 3635,
|
| 8006 |
+
"valid_targets_mean": 5447.9,
|
| 8007 |
+
"valid_targets_min": 2560
|
| 8008 |
},
|
| 8009 |
{
|
| 8010 |
+
"epoch": 6.881740775780511,
|
| 8011 |
+
"grad_norm": 0.5104224535773499,
|
| 8012 |
"learning_rate": 3.640138624839695e-08,
|
| 8013 |
+
"loss": 0.1429,
|
| 8014 |
"loss_nan_ranks": 0,
|
| 8015 |
+
"loss_rank_avg": 0.06860077381134033,
|
| 8016 |
"step": 3640,
|
| 8017 |
+
"valid_targets_mean": 6469.2,
|
| 8018 |
+
"valid_targets_min": 3160
|
| 8019 |
},
|
| 8020 |
{
|
| 8021 |
+
"epoch": 6.8912015137180695,
|
| 8022 |
+
"grad_norm": 0.4870345502815453,
|
| 8023 |
"learning_rate": 3.093725562492544e-08,
|
| 8024 |
+
"loss": 0.1445,
|
| 8025 |
"loss_nan_ranks": 0,
|
| 8026 |
+
"loss_rank_avg": 0.07753711938858032,
|
| 8027 |
"step": 3645,
|
| 8028 |
+
"valid_targets_mean": 6092.4,
|
| 8029 |
+
"valid_targets_min": 2908
|
| 8030 |
},
|
| 8031 |
{
|
| 8032 |
+
"epoch": 6.900662251655629,
|
| 8033 |
+
"grad_norm": 0.4461190608500686,
|
| 8034 |
"learning_rate": 2.591692433511872e-08,
|
| 8035 |
+
"loss": 0.148,
|
| 8036 |
"loss_nan_ranks": 0,
|
| 8037 |
+
"loss_rank_avg": 0.060674458742141724,
|
| 8038 |
"step": 3650,
|
| 8039 |
+
"valid_targets_mean": 4704.4,
|
| 8040 |
+
"valid_targets_min": 1739
|
| 8041 |
},
|
| 8042 |
{
|
| 8043 |
+
"epoch": 6.910122989593188,
|
| 8044 |
+
"grad_norm": 0.46744896878817527,
|
| 8045 |
"learning_rate": 2.1340503952551606e-08,
|
| 8046 |
+
"loss": 0.1454,
|
| 8047 |
"loss_nan_ranks": 0,
|
| 8048 |
+
"loss_rank_avg": 0.07023951411247253,
|
| 8049 |
"step": 3655,
|
| 8050 |
+
"valid_targets_mean": 6859.5,
|
| 8051 |
+
"valid_targets_min": 4199
|
| 8052 |
},
|
| 8053 |
{
|
| 8054 |
+
"epoch": 6.919583727530747,
|
| 8055 |
+
"grad_norm": 0.4423365813985784,
|
| 8056 |
"learning_rate": 1.720809618516839e-08,
|
| 8057 |
+
"loss": 0.1392,
|
| 8058 |
"loss_nan_ranks": 0,
|
| 8059 |
+
"loss_rank_avg": 0.05241403728723526,
|
| 8060 |
"step": 3660,
|
| 8061 |
+
"valid_targets_mean": 5074.6,
|
| 8062 |
+
"valid_targets_min": 1461
|
| 8063 |
},
|
| 8064 |
{
|
| 8065 |
+
"epoch": 6.929044465468307,
|
| 8066 |
+
"grad_norm": 0.4940830080464678,
|
| 8067 |
"learning_rate": 1.351979287302463e-08,
|
| 8068 |
+
"loss": 0.1519,
|
| 8069 |
"loss_nan_ranks": 0,
|
| 8070 |
+
"loss_rank_avg": 0.06597153842449188,
|
| 8071 |
"step": 3665,
|
| 8072 |
+
"valid_targets_mean": 5494.9,
|
| 8073 |
+
"valid_targets_min": 1333
|
| 8074 |
},
|
| 8075 |
{
|
| 8076 |
+
"epoch": 6.938505203405866,
|
| 8077 |
+
"grad_norm": 0.5141965819400358,
|
| 8078 |
"learning_rate": 1.0275675986242128e-08,
|
| 8079 |
+
"loss": 0.1502,
|
| 8080 |
"loss_nan_ranks": 0,
|
| 8081 |
+
"loss_rank_avg": 0.06885077059268951,
|
| 8082 |
"step": 3670,
|
| 8083 |
+
"valid_targets_mean": 5493.0,
|
| 8084 |
+
"valid_targets_min": 1550
|
| 8085 |
},
|
| 8086 |
{
|
| 8087 |
+
"epoch": 6.9479659413434245,
|
| 8088 |
+
"grad_norm": 0.49214237360203794,
|
| 8089 |
"learning_rate": 7.475817623194826e-09,
|
| 8090 |
+
"loss": 0.1505,
|
| 8091 |
"loss_nan_ranks": 0,
|
| 8092 |
+
"loss_rank_avg": 0.06208660453557968,
|
| 8093 |
"step": 3675,
|
| 8094 |
+
"valid_targets_mean": 4805.9,
|
| 8095 |
+
"valid_targets_min": 699
|
| 8096 |
},
|
| 8097 |
{
|
| 8098 |
+
"epoch": 6.957426679280984,
|
| 8099 |
+
"grad_norm": 0.5374621708278976,
|
| 8100 |
"learning_rate": 5.120280008901191e-09,
|
| 8101 |
+
"loss": 0.1467,
|
| 8102 |
"loss_nan_ranks": 0,
|
| 8103 |
+
"loss_rank_avg": 0.080912746489048,
|
| 8104 |
"step": 3680,
|
| 8105 |
+
"valid_targets_mean": 5787.4,
|
| 8106 |
+
"valid_targets_min": 4241
|
| 8107 |
},
|
| 8108 |
{
|
| 8109 |
+
"epoch": 6.966887417218543,
|
| 8110 |
+
"grad_norm": 0.5296512592742303,
|
| 8111 |
"learning_rate": 3.2091154936386705e-09,
|
| 8112 |
+
"loss": 0.1559,
|
| 8113 |
"loss_nan_ranks": 0,
|
| 8114 |
+
"loss_rank_avg": 0.06907324492931366,
|
| 8115 |
"step": 3685,
|
| 8116 |
+
"valid_targets_mean": 5773.0,
|
| 8117 |
+
"valid_targets_min": 3043
|
| 8118 |
},
|
| 8119 |
{
|
| 8120 |
+
"epoch": 6.976348155156102,
|
| 8121 |
+
"grad_norm": 0.4419333491426742,
|
| 8122 |
"learning_rate": 1.7423665517868338e-09,
|
| 8123 |
+
"loss": 0.1487,
|
| 8124 |
"loss_nan_ranks": 0,
|
| 8125 |
+
"loss_rank_avg": 0.05245755612850189,
|
| 8126 |
"step": 3690,
|
| 8127 |
+
"valid_targets_mean": 5486.1,
|
| 8128 |
+
"valid_targets_min": 1693
|
| 8129 |
},
|
| 8130 |
{
|
| 8131 |
+
"epoch": 6.985808893093662,
|
| 8132 |
+
"grad_norm": 0.508769965540501,
|
| 8133 |
"learning_rate": 7.200657808792422e-10,
|
| 8134 |
+
"loss": 0.1516,
|
| 8135 |
"loss_nan_ranks": 0,
|
| 8136 |
+
"loss_rank_avg": 0.08677756786346436,
|
| 8137 |
"step": 3695,
|
| 8138 |
+
"valid_targets_mean": 6286.6,
|
| 8139 |
+
"valid_targets_min": 2001
|
| 8140 |
},
|
| 8141 |
{
|
| 8142 |
+
"epoch": 6.995269631031221,
|
| 8143 |
+
"grad_norm": 0.47535964660935565,
|
| 8144 |
"learning_rate": 1.4223590088180416e-10,
|
| 8145 |
+
"loss": 0.1429,
|
| 8146 |
"loss_nan_ranks": 0,
|
| 8147 |
+
"loss_rank_avg": 0.07410024106502533,
|
| 8148 |
"step": 3700,
|
| 8149 |
+
"valid_targets_mean": 5791.9,
|
| 8150 |
+
"valid_targets_min": 2377
|
| 8151 |
},
|
| 8152 |
{
|
| 8153 |
+
"epoch": 6.999053926206244,
|
| 8154 |
"loss_nan_ranks": 0,
|
| 8155 |
+
"loss_rank_avg": 0.14962854981422424,
|
| 8156 |
"step": 3702,
|
| 8157 |
+
"total_flos": 2.2812121694270915e+18,
|
| 8158 |
+
"train_loss": 0.004161321395542221,
|
| 8159 |
+
"train_runtime": 3645.582,
|
| 8160 |
+
"train_samples_per_second": 16.237,
|
| 8161 |
+
"train_steps_per_second": 1.016,
|
| 8162 |
+
"valid_targets_mean": 6778.8,
|
| 8163 |
+
"valid_targets_min": 4131
|
| 8164 |
}
|
| 8165 |
],
|
| 8166 |
"logging_steps": 5,
|
|
|
|
| 8180 |
"attributes": {}
|
| 8181 |
}
|
| 8182 |
},
|
| 8183 |
+
"total_flos": 2.2812121694270915e+18,
|
| 8184 |
"train_batch_size": 1,
|
| 8185 |
"trial_name": null,
|
| 8186 |
"trial_params": null
|
training_loss.png
CHANGED
|
|