apwic committed on
Commit
f41d598
·
verified ·
1 Parent(s): e7ee6e3

Training in progress, epoch 1

Browse files
all_results.json CHANGED
@@ -1,29 +1,9 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_gen_len": 1.0,
4
- "eval_loss": 0.5082412362098694,
5
- "eval_rouge1": 0.3572,
6
- "eval_rouge2": 0.0,
7
- "eval_rougeL": 0.3545,
8
- "eval_rougeLsum": 0.3557,
9
- "eval_runtime": 333.3038,
10
- "eval_samples": 750,
11
- "eval_samples_per_second": 2.25,
12
- "eval_steps_per_second": 0.282,
13
- "predict_gen_len": 1.0,
14
- "predict_loss": 0.5292530655860901,
15
- "predict_rouge1": 0.4599,
16
- "predict_rouge2": 0.0,
17
- "predict_rougeL": 0.4593,
18
- "predict_rougeLsum": 0.4589,
19
- "predict_runtime": 1669.4106,
20
- "predict_samples": 3762,
21
- "predict_samples_per_second": 2.253,
22
- "predict_steps_per_second": 0.282,
23
  "total_flos": 4.883004680306688e+16,
24
- "train_loss": 0.6658353641871583,
25
- "train_runtime": 6067.5028,
26
  "train_samples": 14262,
27
- "train_samples_per_second": 11.753,
28
- "train_steps_per_second": 1.469
29
  }
 
1
  {
2
  "epoch": 5.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  "total_flos": 4.883004680306688e+16,
4
+ "train_loss": 0.5858162473670036,
5
+ "train_runtime": 4314.6222,
6
  "train_samples": 14262,
7
+ "train_samples_per_second": 16.528,
8
+ "train_steps_per_second": 1.034
9
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a1b965ed5a0be9f18a29eff195e5de1a3f18783eb847a4fb561d2bf10ea64d6e
3
  size 990345064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92d4830afbcee7cdf28a040dc4ee04de1fba9155ae1a9cf5ae4af9ba57d199ae
3
  size 990345064
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 5.0,
3
  "total_flos": 4.883004680306688e+16,
4
- "train_loss": 0.6658353641871583,
5
- "train_runtime": 6067.5028,
6
  "train_samples": 14262,
7
- "train_samples_per_second": 11.753,
8
- "train_steps_per_second": 1.469
9
  }
 
1
  {
2
  "epoch": 5.0,
3
  "total_flos": 4.883004680306688e+16,
4
+ "train_loss": 0.5858162473670036,
5
+ "train_runtime": 4314.6222,
6
  "train_samples": 14262,
7
+ "train_samples_per_second": 16.528,
8
+ "train_steps_per_second": 1.034
9
  }
trainer_state.json CHANGED
@@ -3,128 +3,128 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 5.0,
5
  "eval_steps": 500,
6
- "global_step": 8915,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
- "grad_norm": 0.9904735088348389,
14
  "learning_rate": 0.0008,
15
- "loss": 1.3499,
16
- "step": 1783
17
  },
18
  {
19
  "epoch": 1.0,
20
  "eval_gen_len": 1.0,
21
- "eval_loss": 0.8387730121612549,
22
- "eval_rouge1": 0.6751,
23
  "eval_rouge2": 0.0,
24
- "eval_rougeL": 0.6714,
25
- "eval_rougeLsum": 0.6724,
26
- "eval_runtime": 39.1228,
27
- "eval_samples_per_second": 19.17,
28
- "eval_steps_per_second": 0.613,
29
- "step": 1783
30
  },
31
  {
32
  "epoch": 2.0,
33
- "grad_norm": 1.0084079504013062,
34
  "learning_rate": 0.0006,
35
- "loss": 0.7508,
36
- "step": 3566
37
  },
38
  {
39
  "epoch": 2.0,
40
  "eval_gen_len": 1.0,
41
- "eval_loss": 0.7121184468269348,
42
- "eval_rouge1": 0.7129,
43
  "eval_rouge2": 0.0,
44
- "eval_rougeL": 0.7106,
45
- "eval_rougeLsum": 0.7091,
46
- "eval_runtime": 38.4298,
47
- "eval_samples_per_second": 19.516,
48
- "eval_steps_per_second": 0.625,
49
- "step": 3566
50
  },
51
  {
52
  "epoch": 3.0,
53
- "grad_norm": 0.5230709314346313,
54
  "learning_rate": 0.0004,
55
- "loss": 0.5557,
56
- "step": 5349
57
  },
58
  {
59
  "epoch": 3.0,
60
  "eval_gen_len": 1.0,
61
- "eval_loss": 0.6559586524963379,
62
- "eval_rouge1": 0.6716,
63
  "eval_rouge2": 0.0,
64
- "eval_rougeL": 0.6666,
65
- "eval_rougeLsum": 0.6669,
66
- "eval_runtime": 38.324,
67
- "eval_samples_per_second": 19.57,
68
- "eval_steps_per_second": 0.626,
69
- "step": 5349
70
  },
71
  {
72
  "epoch": 4.0,
73
- "grad_norm": 0.795143723487854,
74
  "learning_rate": 0.0002,
75
- "loss": 0.4087,
76
- "step": 7132
77
  },
78
  {
79
  "epoch": 4.0,
80
  "eval_gen_len": 1.0,
81
- "eval_loss": 0.6574313640594482,
82
- "eval_rouge1": 0.7079,
83
  "eval_rouge2": 0.0,
84
- "eval_rougeL": 0.7053,
85
- "eval_rougeLsum": 0.7064,
86
- "eval_runtime": 38.2923,
87
- "eval_samples_per_second": 19.586,
88
- "eval_steps_per_second": 0.627,
89
- "step": 7132
90
  },
91
  {
92
  "epoch": 5.0,
93
- "grad_norm": 0.5503818988800049,
94
  "learning_rate": 0.0,
95
- "loss": 0.2641,
96
- "step": 8915
97
  },
98
  {
99
  "epoch": 5.0,
100
  "eval_gen_len": 1.0,
101
- "eval_loss": 0.7164290547370911,
102
- "eval_rouge1": 0.6602,
103
  "eval_rouge2": 0.0,
104
- "eval_rougeL": 0.6558,
105
- "eval_rougeLsum": 0.6584,
106
- "eval_runtime": 38.3942,
107
- "eval_samples_per_second": 19.534,
108
- "eval_steps_per_second": 0.625,
109
- "step": 8915
110
  },
111
  {
112
  "epoch": 5.0,
113
- "step": 8915,
114
  "total_flos": 4.883004680306688e+16,
115
- "train_loss": 0.6658353641871583,
116
- "train_runtime": 6067.5028,
117
- "train_samples_per_second": 11.753,
118
- "train_steps_per_second": 1.469
119
  }
120
  ],
121
  "logging_steps": 500,
122
- "max_steps": 8915,
123
  "num_input_tokens_seen": 0,
124
  "num_train_epochs": 5,
125
  "save_steps": 500,
126
  "total_flos": 4.883004680306688e+16,
127
- "train_batch_size": 8,
128
  "trial_name": null,
129
  "trial_params": null
130
  }
 
3
  "best_model_checkpoint": null,
4
  "epoch": 5.0,
5
  "eval_steps": 500,
6
+ "global_step": 4460,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
+ "grad_norm": 1.152057409286499,
14
  "learning_rate": 0.0008,
15
+ "loss": 1.2189,
16
+ "step": 892
17
  },
18
  {
19
  "epoch": 1.0,
20
  "eval_gen_len": 1.0,
21
+ "eval_loss": 0.7781817317008972,
22
+ "eval_rouge1": 0.6456,
23
  "eval_rouge2": 0.0,
24
+ "eval_rougeL": 0.6409,
25
+ "eval_rougeLsum": 0.6449,
26
+ "eval_runtime": 32.4225,
27
+ "eval_samples_per_second": 23.132,
28
+ "eval_steps_per_second": 0.37,
29
+ "step": 892
30
  },
31
  {
32
  "epoch": 2.0,
33
+ "grad_norm": 1.0295380353927612,
34
  "learning_rate": 0.0006,
35
+ "loss": 0.6795,
36
+ "step": 1784
37
  },
38
  {
39
  "epoch": 2.0,
40
  "eval_gen_len": 1.0,
41
+ "eval_loss": 0.6559634804725647,
42
+ "eval_rouge1": 0.6574,
43
  "eval_rouge2": 0.0,
44
+ "eval_rougeL": 0.6553,
45
+ "eval_rougeLsum": 0.6569,
46
+ "eval_runtime": 30.8201,
47
+ "eval_samples_per_second": 24.335,
48
+ "eval_steps_per_second": 0.389,
49
+ "step": 1784
50
  },
51
  {
52
  "epoch": 3.0,
53
+ "grad_norm": 0.5640166401863098,
54
  "learning_rate": 0.0004,
55
+ "loss": 0.4861,
56
+ "step": 2676
57
  },
58
  {
59
  "epoch": 3.0,
60
  "eval_gen_len": 1.0,
61
+ "eval_loss": 0.624487578868866,
62
+ "eval_rouge1": 0.6717,
63
  "eval_rouge2": 0.0,
64
+ "eval_rougeL": 0.6667,
65
+ "eval_rougeLsum": 0.6691,
66
+ "eval_runtime": 30.8362,
67
+ "eval_samples_per_second": 24.322,
68
+ "eval_steps_per_second": 0.389,
69
+ "step": 2676
70
  },
71
  {
72
  "epoch": 4.0,
73
+ "grad_norm": 0.5083895921707153,
74
  "learning_rate": 0.0002,
75
+ "loss": 0.3405,
76
+ "step": 3568
77
  },
78
  {
79
  "epoch": 4.0,
80
  "eval_gen_len": 1.0,
81
+ "eval_loss": 0.6442595720291138,
82
+ "eval_rouge1": 0.6974,
83
  "eval_rouge2": 0.0,
84
+ "eval_rougeL": 0.6969,
85
+ "eval_rougeLsum": 0.6948,
86
+ "eval_runtime": 33.2196,
87
+ "eval_samples_per_second": 22.577,
88
+ "eval_steps_per_second": 0.361,
89
+ "step": 3568
90
  },
91
  {
92
  "epoch": 5.0,
93
+ "grad_norm": 0.669092059135437,
94
  "learning_rate": 0.0,
95
+ "loss": 0.2041,
96
+ "step": 4460
97
  },
98
  {
99
  "epoch": 5.0,
100
  "eval_gen_len": 1.0,
101
+ "eval_loss": 0.7147085666656494,
102
+ "eval_rouge1": 0.677,
103
  "eval_rouge2": 0.0,
104
+ "eval_rougeL": 0.6766,
105
+ "eval_rougeLsum": 0.6756,
106
+ "eval_runtime": 30.7781,
107
+ "eval_samples_per_second": 24.368,
108
+ "eval_steps_per_second": 0.39,
109
+ "step": 4460
110
  },
111
  {
112
  "epoch": 5.0,
113
+ "step": 4460,
114
  "total_flos": 4.883004680306688e+16,
115
+ "train_loss": 0.5858162473670036,
116
+ "train_runtime": 4314.6222,
117
+ "train_samples_per_second": 16.528,
118
+ "train_steps_per_second": 1.034
119
  }
120
  ],
121
  "logging_steps": 500,
122
+ "max_steps": 4460,
123
  "num_input_tokens_seen": 0,
124
  "num_train_epochs": 5,
125
  "save_steps": 500,
126
  "total_flos": 4.883004680306688e+16,
127
+ "train_batch_size": 16,
128
  "trial_name": null,
129
  "trial_params": null
130
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:583dd8f4fcf4686f0dc00115470013479a64268d00a5c586484114d4d75040b1
3
  size 5176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c472aad5e6371b3568f57fdda5fb9ebab151488459970e3d10d23d6061b17b8
3
  size 5176