penfever commited on
Commit
4481f86
·
verified ·
1 Parent(s): 8f869af

End of training

Browse files
Files changed (5) hide show
  1. README.md +3 -1
  2. all_results.json +13 -13
  3. train_results.json +13 -13
  4. trainer_state.json +131 -131
  5. training_loss.png +0 -0
README.md CHANGED
@@ -1,5 +1,7 @@
1
  ---
2
  library_name: transformers
 
 
3
  tags:
4
  - llama-factory
5
  - full
@@ -14,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
14
 
15
  # swesmith-stack-undr7030
16
 
17
- This model was trained from scratch on the None dataset.
18
 
19
  ## Model description
20
 
 
1
  ---
2
  library_name: transformers
3
+ license: apache-2.0
4
+ base_model: Qwen/Qwen3-8B
5
  tags:
6
  - llama-factory
7
  - full
 
16
 
17
  # swesmith-stack-undr7030
18
 
19
+ This model is a fine-tuned version of [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) on the penfever/GLM-4.6-swesmith-32ep-131k-nosumm-reasoning and the penfever/GLM-4.6-stackexchange-overflow-sandboxes-32eps-65k-reasoning datasets.
20
 
21
  ## Model description
22
 
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "achieved_tflops_per_gpu": 6.133380964159379,
3
- "achieved_tflops_per_gpu_theoretical": 271.03236502585344,
4
- "epoch": 7.0,
5
  "loss_nan_ranks": 0,
6
- "loss_rank_avg": 0.16898784041404724,
7
- "mfu_percent": 1.965827232102365,
8
- "mfu_percent_theoretical": 86.86934776469661,
9
- "total_flos": 2.279621690793132e+18,
10
- "train_loss": 0.05624667479018145,
11
- "train_runtime": 46459.3204,
12
- "train_samples_per_second": 1.274,
13
- "train_steps_per_second": 0.08,
14
- "valid_targets_mean": 5833.6,
15
- "valid_targets_min": 2317
16
  }
 
1
  {
2
+ "achieved_tflops_per_gpu": 78.2183808177642,
3
+ "achieved_tflops_per_gpu_theoretical": 3442.1110358283386,
4
+ "epoch": 6.999053926206244,
5
  "loss_nan_ranks": 0,
6
+ "loss_rank_avg": 0.14962854981422424,
7
+ "mfu_percent": 25.0699938518475,
8
+ "mfu_percent_theoretical": 1103.2407166116468,
9
+ "total_flos": 2.2812121694270915e+18,
10
+ "train_loss": 0.004161321395542221,
11
+ "train_runtime": 3645.582,
12
+ "train_samples_per_second": 16.237,
13
+ "train_steps_per_second": 1.016,
14
+ "valid_targets_mean": 6778.8,
15
+ "valid_targets_min": 4131
16
  }
train_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "achieved_tflops_per_gpu": 6.133380964159379,
3
- "achieved_tflops_per_gpu_theoretical": 271.03236502585344,
4
- "epoch": 7.0,
5
  "loss_nan_ranks": 0,
6
- "loss_rank_avg": 0.16898784041404724,
7
- "mfu_percent": 1.965827232102365,
8
- "mfu_percent_theoretical": 86.86934776469661,
9
- "total_flos": 2.279621690793132e+18,
10
- "train_loss": 0.05624667479018145,
11
- "train_runtime": 46459.3204,
12
- "train_samples_per_second": 1.274,
13
- "train_steps_per_second": 0.08,
14
- "valid_targets_mean": 5833.6,
15
- "valid_targets_min": 2317
16
  }
 
1
  {
2
+ "achieved_tflops_per_gpu": 78.2183808177642,
3
+ "achieved_tflops_per_gpu_theoretical": 3442.1110358283386,
4
+ "epoch": 6.999053926206244,
5
  "loss_nan_ranks": 0,
6
+ "loss_rank_avg": 0.14962854981422424,
7
+ "mfu_percent": 25.0699938518475,
8
+ "mfu_percent_theoretical": 1103.2407166116468,
9
+ "total_flos": 2.2812121694270915e+18,
10
+ "train_loss": 0.004161321395542221,
11
+ "train_runtime": 3645.582,
12
+ "train_samples_per_second": 16.237,
13
+ "train_steps_per_second": 1.016,
14
+ "valid_targets_mean": 6778.8,
15
+ "valid_targets_min": 4131
16
  }
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 7.0,
6
  "eval_steps": 500,
7
  "global_step": 3702,
8
  "is_hyper_param_search": false,
@@ -7930,237 +7930,237 @@
7930
  "valid_targets_min": 1293
7931
  },
7932
  {
7933
- "epoch": 6.817407757805109,
7934
- "grad_norm": 0.5430367711086916,
7935
  "learning_rate": 8.706523924000066e-08,
7936
- "loss": 0.1498,
7937
  "loss_nan_ranks": 0,
7938
- "loss_rank_avg": 0.06235578656196594,
7939
  "step": 3605,
7940
- "valid_targets_mean": 4719.1,
7941
- "valid_targets_min": 1802
7942
  },
7943
  {
7944
- "epoch": 6.8268684957426675,
7945
- "grad_norm": 0.5494855444842887,
7946
  "learning_rate": 7.849846547778983e-08,
7947
- "loss": 0.1485,
7948
  "loss_nan_ranks": 0,
7949
- "loss_rank_avg": 0.0756906047463417,
7950
  "step": 3610,
7951
- "valid_targets_mean": 4972.6,
7952
- "valid_targets_min": 2514
7953
  },
7954
  {
7955
- "epoch": 6.836329233680227,
7956
- "grad_norm": 0.5086851462018042,
7957
  "learning_rate": 7.03744340325252e-08,
7958
- "loss": 0.1572,
7959
  "loss_nan_ranks": 0,
7960
- "loss_rank_avg": 0.08582527190446854,
7961
  "step": 3615,
7962
- "valid_targets_mean": 6458.2,
7963
- "valid_targets_min": 4908
7964
  },
7965
  {
7966
- "epoch": 6.845789971617786,
7967
- "grad_norm": 0.5270028569440548,
7968
  "learning_rate": 6.269332545548068e-08,
7969
- "loss": 0.1564,
7970
  "loss_nan_ranks": 0,
7971
- "loss_rank_avg": 0.10052739083766937,
7972
  "step": 3620,
7973
- "valid_targets_mean": 4785.8,
7974
- "valid_targets_min": 1666
7975
  },
7976
  {
7977
- "epoch": 6.855250709555345,
7978
- "grad_norm": 0.5600188215892177,
7979
  "learning_rate": 5.5455310454259894e-08,
7980
- "loss": 0.1755,
7981
  "loss_nan_ranks": 0,
7982
- "loss_rank_avg": 0.19710837304592133,
7983
  "step": 3625,
7984
- "valid_targets_mean": 5958.5,
7985
- "valid_targets_min": 1126
7986
  },
7987
  {
7988
- "epoch": 6.864711447492905,
7989
- "grad_norm": 0.5480215760958226,
7990
  "learning_rate": 4.866054988900581e-08,
7991
- "loss": 0.1641,
7992
  "loss_nan_ranks": 0,
7993
- "loss_rank_avg": 0.0743655413389206,
7994
  "step": 3630,
7995
- "valid_targets_mean": 4818.5,
7996
- "valid_targets_min": 3436
7997
  },
7998
  {
7999
- "epoch": 6.874172185430464,
8000
- "grad_norm": 0.5336673039008878,
8001
  "learning_rate": 4.230919476881479e-08,
8002
- "loss": 0.1654,
8003
  "loss_nan_ranks": 0,
8004
- "loss_rank_avg": 0.09351871907711029,
8005
  "step": 3635,
8006
- "valid_targets_mean": 5571.0,
8007
- "valid_targets_min": 1840
8008
  },
8009
  {
8010
- "epoch": 6.8836329233680225,
8011
- "grad_norm": 0.5289551449236221,
8012
  "learning_rate": 3.640138624839695e-08,
8013
- "loss": 0.1568,
8014
  "loss_nan_ranks": 0,
8015
- "loss_rank_avg": 0.0810607373714447,
8016
  "step": 3640,
8017
- "valid_targets_mean": 6840.8,
8018
- "valid_targets_min": 4690
8019
  },
8020
  {
8021
- "epoch": 6.893093661305582,
8022
- "grad_norm": 0.4793497219405571,
8023
  "learning_rate": 3.093725562492544e-08,
8024
- "loss": 0.1529,
8025
  "loss_nan_ranks": 0,
8026
- "loss_rank_avg": 0.06711985170841217,
8027
  "step": 3645,
8028
- "valid_targets_mean": 5377.1,
8029
- "valid_targets_min": 1511
8030
  },
8031
  {
8032
- "epoch": 6.902554399243141,
8033
- "grad_norm": 0.46797700399359005,
8034
  "learning_rate": 2.591692433511872e-08,
8035
- "loss": 0.1596,
8036
  "loss_nan_ranks": 0,
8037
- "loss_rank_avg": 0.07881344854831696,
8038
  "step": 3650,
8039
- "valid_targets_mean": 5766.8,
8040
- "valid_targets_min": 2497
8041
  },
8042
  {
8043
- "epoch": 6.9120151371807,
8044
- "grad_norm": 0.43857238052909714,
8045
  "learning_rate": 2.1340503952551606e-08,
8046
- "loss": 0.1483,
8047
  "loss_nan_ranks": 0,
8048
- "loss_rank_avg": 0.06501060724258423,
8049
  "step": 3655,
8050
- "valid_targets_mean": 6468.2,
8051
- "valid_targets_min": 4788
8052
  },
8053
  {
8054
- "epoch": 6.92147587511826,
8055
- "grad_norm": 0.4968634044404902,
8056
  "learning_rate": 1.720809618516839e-08,
8057
- "loss": 0.148,
8058
  "loss_nan_ranks": 0,
8059
- "loss_rank_avg": 0.07656071335077286,
8060
  "step": 3660,
8061
- "valid_targets_mean": 6760.2,
8062
- "valid_targets_min": 3082
8063
  },
8064
  {
8065
- "epoch": 6.9309366130558185,
8066
- "grad_norm": 0.48363314457204404,
8067
  "learning_rate": 1.351979287302463e-08,
8068
- "loss": 0.1813,
8069
  "loss_nan_ranks": 0,
8070
- "loss_rank_avg": 0.07225023210048676,
8071
  "step": 3665,
8072
- "valid_targets_mean": 5697.0,
8073
- "valid_targets_min": 1732
8074
  },
8075
  {
8076
- "epoch": 6.940397350993377,
8077
- "grad_norm": 0.5225998185104811,
8078
  "learning_rate": 1.0275675986242128e-08,
8079
- "loss": 0.1557,
8080
  "loss_nan_ranks": 0,
8081
- "loss_rank_avg": 0.0772523581981659,
8082
  "step": 3670,
8083
- "valid_targets_mean": 5107.5,
8084
- "valid_targets_min": 2582
8085
  },
8086
  {
8087
- "epoch": 6.949858088930936,
8088
- "grad_norm": 0.5173176741618409,
8089
  "learning_rate": 7.475817623194826e-09,
8090
- "loss": 0.151,
8091
  "loss_nan_ranks": 0,
8092
- "loss_rank_avg": 0.06647118180990219,
8093
  "step": 3675,
8094
- "valid_targets_mean": 5190.0,
8095
- "valid_targets_min": 2183
8096
  },
8097
  {
8098
- "epoch": 6.959318826868496,
8099
- "grad_norm": 0.49144612331391746,
8100
  "learning_rate": 5.120280008901191e-09,
8101
- "loss": 0.1621,
8102
  "loss_nan_ranks": 0,
8103
- "loss_rank_avg": 0.07282565534114838,
8104
  "step": 3680,
8105
- "valid_targets_mean": 5468.8,
8106
- "valid_targets_min": 3732
8107
  },
8108
  {
8109
- "epoch": 6.968779564806055,
8110
- "grad_norm": 0.6061118774120625,
8111
  "learning_rate": 3.2091154936386705e-09,
8112
- "loss": 0.1647,
8113
  "loss_nan_ranks": 0,
8114
- "loss_rank_avg": 0.09444256126880646,
8115
  "step": 3685,
8116
- "valid_targets_mean": 4745.1,
8117
- "valid_targets_min": 2157
8118
  },
8119
  {
8120
- "epoch": 6.978240302743614,
8121
- "grad_norm": 0.543585630200615,
8122
  "learning_rate": 1.7423665517868338e-09,
8123
- "loss": 0.1576,
8124
  "loss_nan_ranks": 0,
8125
- "loss_rank_avg": 0.08968066424131393,
8126
  "step": 3690,
8127
- "valid_targets_mean": 5457.2,
8128
- "valid_targets_min": 1409
8129
  },
8130
  {
8131
- "epoch": 6.9877010406811735,
8132
- "grad_norm": 0.5396859609859047,
8133
  "learning_rate": 7.200657808792422e-10,
8134
- "loss": 0.1561,
8135
  "loss_nan_ranks": 0,
8136
- "loss_rank_avg": 0.07551106810569763,
8137
  "step": 3695,
8138
- "valid_targets_mean": 4751.1,
8139
- "valid_targets_min": 1786
8140
  },
8141
  {
8142
- "epoch": 6.997161778618732,
8143
- "grad_norm": 0.4860511398956867,
8144
  "learning_rate": 1.4223590088180416e-10,
8145
- "loss": 0.1492,
8146
  "loss_nan_ranks": 0,
8147
- "loss_rank_avg": 0.07160358875989914,
8148
  "step": 3700,
8149
- "valid_targets_mean": 5778.8,
8150
- "valid_targets_min": 4253
8151
  },
8152
  {
8153
- "epoch": 7.0,
8154
  "loss_nan_ranks": 0,
8155
- "loss_rank_avg": 0.16898784041404724,
8156
  "step": 3702,
8157
- "total_flos": 2.279621690793132e+18,
8158
- "train_loss": 0.05624667479018145,
8159
- "train_runtime": 46459.3204,
8160
- "train_samples_per_second": 1.274,
8161
- "train_steps_per_second": 0.08,
8162
- "valid_targets_mean": 5833.6,
8163
- "valid_targets_min": 2317
8164
  }
8165
  ],
8166
  "logging_steps": 5,
@@ -8180,7 +8180,7 @@
8180
  "attributes": {}
8181
  }
8182
  },
8183
- "total_flos": 2.279621690793132e+18,
8184
  "train_batch_size": 1,
8185
  "trial_name": null,
8186
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 6.999053926206244,
6
  "eval_steps": 500,
7
  "global_step": 3702,
8
  "is_hyper_param_search": false,
 
7930
  "valid_targets_min": 1293
7931
  },
7932
  {
7933
+ "epoch": 6.815515610217597,
7934
+ "grad_norm": 0.41971865894051924,
7935
  "learning_rate": 8.706523924000066e-08,
7936
+ "loss": 0.1448,
7937
  "loss_nan_ranks": 0,
7938
+ "loss_rank_avg": 0.06027888506650925,
7939
  "step": 3605,
7940
+ "valid_targets_mean": 6555.0,
7941
+ "valid_targets_min": 2788
7942
  },
7943
  {
7944
+ "epoch": 6.824976348155156,
7945
+ "grad_norm": 0.4407706789613994,
7946
  "learning_rate": 7.849846547778983e-08,
7947
+ "loss": 0.1496,
7948
  "loss_nan_ranks": 0,
7949
+ "loss_rank_avg": 0.06519781798124313,
7950
  "step": 3610,
7951
+ "valid_targets_mean": 6884.4,
7952
+ "valid_targets_min": 1265
7953
  },
7954
  {
7955
+ "epoch": 6.8344370860927155,
7956
+ "grad_norm": 0.4978073691618193,
7957
  "learning_rate": 7.03744340325252e-08,
7958
+ "loss": 0.1498,
7959
  "loss_nan_ranks": 0,
7960
+ "loss_rank_avg": 0.07424402236938477,
7961
  "step": 3615,
7962
+ "valid_targets_mean": 6403.1,
7963
+ "valid_targets_min": 4674
7964
  },
7965
  {
7966
+ "epoch": 6.843897824030274,
7967
+ "grad_norm": 0.5085529248955364,
7968
  "learning_rate": 6.269332545548068e-08,
7969
+ "loss": 0.1473,
7970
  "loss_nan_ranks": 0,
7971
+ "loss_rank_avg": 0.08670083433389664,
7972
  "step": 3620,
7973
+ "valid_targets_mean": 4922.5,
7974
+ "valid_targets_min": 1785
7975
  },
7976
  {
7977
+ "epoch": 6.853358561967833,
7978
+ "grad_norm": 0.4344651976137186,
7979
  "learning_rate": 5.5455310454259894e-08,
7980
+ "loss": 0.1514,
7981
  "loss_nan_ranks": 0,
7982
+ "loss_rank_avg": 0.07209931313991547,
7983
  "step": 3625,
7984
+ "valid_targets_mean": 7308.1,
7985
+ "valid_targets_min": 4659
7986
  },
7987
  {
7988
+ "epoch": 6.862819299905393,
7989
+ "grad_norm": 0.4413038947876362,
7990
  "learning_rate": 4.866054988900581e-08,
7991
+ "loss": 0.1464,
7992
  "loss_nan_ranks": 0,
7993
+ "loss_rank_avg": 0.06930118799209595,
7994
  "step": 3630,
7995
+ "valid_targets_mean": 6234.8,
7996
+ "valid_targets_min": 976
7997
  },
7998
  {
7999
+ "epoch": 6.872280037842952,
8000
+ "grad_norm": 0.48107253716280135,
8001
  "learning_rate": 4.230919476881479e-08,
8002
+ "loss": 0.1487,
8003
  "loss_nan_ranks": 0,
8004
+ "loss_rank_avg": 0.06881730258464813,
8005
  "step": 3635,
8006
+ "valid_targets_mean": 5447.9,
8007
+ "valid_targets_min": 2560
8008
  },
8009
  {
8010
+ "epoch": 6.881740775780511,
8011
+ "grad_norm": 0.5104224535773499,
8012
  "learning_rate": 3.640138624839695e-08,
8013
+ "loss": 0.1429,
8014
  "loss_nan_ranks": 0,
8015
+ "loss_rank_avg": 0.06860077381134033,
8016
  "step": 3640,
8017
+ "valid_targets_mean": 6469.2,
8018
+ "valid_targets_min": 3160
8019
  },
8020
  {
8021
+ "epoch": 6.8912015137180695,
8022
+ "grad_norm": 0.4870345502815453,
8023
  "learning_rate": 3.093725562492544e-08,
8024
+ "loss": 0.1445,
8025
  "loss_nan_ranks": 0,
8026
+ "loss_rank_avg": 0.07753711938858032,
8027
  "step": 3645,
8028
+ "valid_targets_mean": 6092.4,
8029
+ "valid_targets_min": 2908
8030
  },
8031
  {
8032
+ "epoch": 6.900662251655629,
8033
+ "grad_norm": 0.4461190608500686,
8034
  "learning_rate": 2.591692433511872e-08,
8035
+ "loss": 0.148,
8036
  "loss_nan_ranks": 0,
8037
+ "loss_rank_avg": 0.060674458742141724,
8038
  "step": 3650,
8039
+ "valid_targets_mean": 4704.4,
8040
+ "valid_targets_min": 1739
8041
  },
8042
  {
8043
+ "epoch": 6.910122989593188,
8044
+ "grad_norm": 0.46744896878817527,
8045
  "learning_rate": 2.1340503952551606e-08,
8046
+ "loss": 0.1454,
8047
  "loss_nan_ranks": 0,
8048
+ "loss_rank_avg": 0.07023951411247253,
8049
  "step": 3655,
8050
+ "valid_targets_mean": 6859.5,
8051
+ "valid_targets_min": 4199
8052
  },
8053
  {
8054
+ "epoch": 6.919583727530747,
8055
+ "grad_norm": 0.4423365813985784,
8056
  "learning_rate": 1.720809618516839e-08,
8057
+ "loss": 0.1392,
8058
  "loss_nan_ranks": 0,
8059
+ "loss_rank_avg": 0.05241403728723526,
8060
  "step": 3660,
8061
+ "valid_targets_mean": 5074.6,
8062
+ "valid_targets_min": 1461
8063
  },
8064
  {
8065
+ "epoch": 6.929044465468307,
8066
+ "grad_norm": 0.4940830080464678,
8067
  "learning_rate": 1.351979287302463e-08,
8068
+ "loss": 0.1519,
8069
  "loss_nan_ranks": 0,
8070
+ "loss_rank_avg": 0.06597153842449188,
8071
  "step": 3665,
8072
+ "valid_targets_mean": 5494.9,
8073
+ "valid_targets_min": 1333
8074
  },
8075
  {
8076
+ "epoch": 6.938505203405866,
8077
+ "grad_norm": 0.5141965819400358,
8078
  "learning_rate": 1.0275675986242128e-08,
8079
+ "loss": 0.1502,
8080
  "loss_nan_ranks": 0,
8081
+ "loss_rank_avg": 0.06885077059268951,
8082
  "step": 3670,
8083
+ "valid_targets_mean": 5493.0,
8084
+ "valid_targets_min": 1550
8085
  },
8086
  {
8087
+ "epoch": 6.9479659413434245,
8088
+ "grad_norm": 0.49214237360203794,
8089
  "learning_rate": 7.475817623194826e-09,
8090
+ "loss": 0.1505,
8091
  "loss_nan_ranks": 0,
8092
+ "loss_rank_avg": 0.06208660453557968,
8093
  "step": 3675,
8094
+ "valid_targets_mean": 4805.9,
8095
+ "valid_targets_min": 699
8096
  },
8097
  {
8098
+ "epoch": 6.957426679280984,
8099
+ "grad_norm": 0.5374621708278976,
8100
  "learning_rate": 5.120280008901191e-09,
8101
+ "loss": 0.1467,
8102
  "loss_nan_ranks": 0,
8103
+ "loss_rank_avg": 0.080912746489048,
8104
  "step": 3680,
8105
+ "valid_targets_mean": 5787.4,
8106
+ "valid_targets_min": 4241
8107
  },
8108
  {
8109
+ "epoch": 6.966887417218543,
8110
+ "grad_norm": 0.5296512592742303,
8111
  "learning_rate": 3.2091154936386705e-09,
8112
+ "loss": 0.1559,
8113
  "loss_nan_ranks": 0,
8114
+ "loss_rank_avg": 0.06907324492931366,
8115
  "step": 3685,
8116
+ "valid_targets_mean": 5773.0,
8117
+ "valid_targets_min": 3043
8118
  },
8119
  {
8120
+ "epoch": 6.976348155156102,
8121
+ "grad_norm": 0.4419333491426742,
8122
  "learning_rate": 1.7423665517868338e-09,
8123
+ "loss": 0.1487,
8124
  "loss_nan_ranks": 0,
8125
+ "loss_rank_avg": 0.05245755612850189,
8126
  "step": 3690,
8127
+ "valid_targets_mean": 5486.1,
8128
+ "valid_targets_min": 1693
8129
  },
8130
  {
8131
+ "epoch": 6.985808893093662,
8132
+ "grad_norm": 0.508769965540501,
8133
  "learning_rate": 7.200657808792422e-10,
8134
+ "loss": 0.1516,
8135
  "loss_nan_ranks": 0,
8136
+ "loss_rank_avg": 0.08677756786346436,
8137
  "step": 3695,
8138
+ "valid_targets_mean": 6286.6,
8139
+ "valid_targets_min": 2001
8140
  },
8141
  {
8142
+ "epoch": 6.995269631031221,
8143
+ "grad_norm": 0.47535964660935565,
8144
  "learning_rate": 1.4223590088180416e-10,
8145
+ "loss": 0.1429,
8146
  "loss_nan_ranks": 0,
8147
+ "loss_rank_avg": 0.07410024106502533,
8148
  "step": 3700,
8149
+ "valid_targets_mean": 5791.9,
8150
+ "valid_targets_min": 2377
8151
  },
8152
  {
8153
+ "epoch": 6.999053926206244,
8154
  "loss_nan_ranks": 0,
8155
+ "loss_rank_avg": 0.14962854981422424,
8156
  "step": 3702,
8157
+ "total_flos": 2.2812121694270915e+18,
8158
+ "train_loss": 0.004161321395542221,
8159
+ "train_runtime": 3645.582,
8160
+ "train_samples_per_second": 16.237,
8161
+ "train_steps_per_second": 1.016,
8162
+ "valid_targets_mean": 6778.8,
8163
+ "valid_targets_min": 4131
8164
  }
8165
  ],
8166
  "logging_steps": 5,
 
8180
  "attributes": {}
8181
  }
8182
  },
8183
+ "total_flos": 2.2812121694270915e+18,
8184
  "train_batch_size": 1,
8185
  "trial_name": null,
8186
  "trial_params": null
training_loss.png CHANGED