Jessiecs committed on
Commit
44afc9b
·
verified ·
1 Parent(s): 965003c

Jessiecs/llama-2-7b-a3-4

Browse files
README.md CHANGED
@@ -50,8 +50,8 @@ The following hyperparameters were used during training:
50
 
51
  ### Framework versions
52
 
53
- - PEFT 0.8.2
54
  - Transformers 4.39.0.dev0
55
  - Pytorch 2.1.0+cu121
56
- - Datasets 2.17.1
57
  - Tokenizers 0.15.2
 
50
 
51
  ### Framework versions
52
 
53
+ - PEFT 0.9.1.dev0
54
  - Transformers 4.39.0.dev0
55
  - Pytorch 2.1.0+cu121
56
+ - Datasets 2.18.0
57
  - Tokenizers 0.15.2
adapter_config.json CHANGED
@@ -22,5 +22,6 @@
22
  "base_layer"
23
  ],
24
  "task_type": "CAUSAL_LM",
 
25
  "use_rslora": false
26
  }
 
22
  "base_layer"
23
  ],
24
  "task_type": "CAUSAL_LM",
25
+ "use_dora": false,
26
  "use_rslora": false
27
  }
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74fe0d788196c4869eede2cdf2d976288cc7c2ffdfbd37437e0f9f1d4346d64f
3
  size 479942472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88250486d283fc41c276492b96bda0af9f37ab2a0ae6fceb906439b8aca0b319
3
  size 479942472
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.2,
3
- "total_flos": 3146427009073152.0,
4
- "train_loss": 0.5001711931079627,
5
- "train_runtime": 93.2825,
6
- "train_samples_per_second": 0.858,
7
- "train_steps_per_second": 0.214
8
  }
 
1
  {
2
+ "epoch": 0.62,
3
+ "total_flos": 1616765879623680.0,
4
+ "train_loss": 1.9969253599643708,
5
+ "train_runtime": 79.7523,
6
+ "train_samples_per_second": 1.003,
7
+ "train_steps_per_second": 0.251
8
  }
runs/Mar04_09-17-43_806464d2401f/events.out.tfevents.1709543863.806464d2401f.1315.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f07783ebaefbeca8348320972d0fc63020fba7c9b90e199d55f06ca315593c1f
3
+ size 9582
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.2,
3
- "total_flos": 3146427009073152.0,
4
- "train_loss": 0.5001711931079627,
5
- "train_runtime": 93.2825,
6
- "train_samples_per_second": 0.858,
7
- "train_steps_per_second": 0.214
8
  }
 
1
  {
2
+ "epoch": 0.62,
3
+ "total_flos": 1616765879623680.0,
4
+ "train_loss": 1.9969253599643708,
5
+ "train_runtime": 79.7523,
6
+ "train_samples_per_second": 1.003,
7
+ "train_steps_per_second": 0.251
8
  }
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.2,
5
  "eval_steps": 500,
6
  "global_step": 20,
7
  "is_hyper_param_search": false,
@@ -9,161 +9,161 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.16,
13
- "grad_norm": 0.8734284043312073,
14
  "learning_rate": 0.0001,
15
- "loss": 1.5075,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.32,
20
- "grad_norm": 0.8822978138923645,
21
  "learning_rate": 0.0002,
22
- "loss": 1.7499,
23
  "step": 2
24
  },
25
  {
26
- "epoch": 0.48,
27
- "grad_norm": 0.7636933922767639,
28
  "learning_rate": 0.00018888888888888888,
29
- "loss": 1.4175,
30
  "step": 3
31
  },
32
  {
33
- "epoch": 0.64,
34
- "grad_norm": 0.8630443811416626,
35
  "learning_rate": 0.00017777777777777779,
36
- "loss": 1.0686,
37
  "step": 4
38
  },
39
  {
40
- "epoch": 0.8,
41
- "grad_norm": 0.9574275612831116,
42
  "learning_rate": 0.0001666666666666667,
43
- "loss": 0.7723,
44
  "step": 5
45
  },
46
  {
47
- "epoch": 0.96,
48
- "grad_norm": 1.6739799976348877,
49
  "learning_rate": 0.00015555555555555556,
50
- "loss": 0.5065,
51
  "step": 6
52
  },
53
  {
54
- "epoch": 1.12,
55
- "grad_norm": 1.1822175979614258,
56
  "learning_rate": 0.00014444444444444444,
57
- "loss": 0.4154,
58
  "step": 7
59
  },
60
  {
61
- "epoch": 1.28,
62
- "grad_norm": 0.6320245862007141,
63
  "learning_rate": 0.00013333333333333334,
64
- "loss": 0.2728,
65
  "step": 8
66
  },
67
  {
68
- "epoch": 1.44,
69
- "grad_norm": 0.6290435194969177,
70
  "learning_rate": 0.00012222222222222224,
71
- "loss": 0.2711,
72
  "step": 9
73
  },
74
  {
75
- "epoch": 1.6,
76
- "grad_norm": 0.3685467541217804,
77
  "learning_rate": 0.00011111111111111112,
78
- "loss": 0.1759,
79
  "step": 10
80
  },
81
  {
82
- "epoch": 1.76,
83
- "grad_norm": 0.4463210105895996,
84
  "learning_rate": 0.0001,
85
- "loss": 0.2825,
86
  "step": 11
87
  },
88
  {
89
- "epoch": 1.92,
90
- "grad_norm": 0.4311699867248535,
91
  "learning_rate": 8.888888888888889e-05,
92
- "loss": 0.2496,
93
  "step": 12
94
  },
95
  {
96
- "epoch": 2.08,
97
- "grad_norm": 0.3269866406917572,
98
  "learning_rate": 7.777777777777778e-05,
99
- "loss": 0.1629,
100
  "step": 13
101
  },
102
  {
103
- "epoch": 2.24,
104
- "grad_norm": 0.25907543301582336,
105
  "learning_rate": 6.666666666666667e-05,
106
- "loss": 0.1562,
107
  "step": 14
108
  },
109
  {
110
- "epoch": 2.4,
111
- "grad_norm": 0.2790839672088623,
112
  "learning_rate": 5.555555555555556e-05,
113
- "loss": 0.2234,
114
  "step": 15
115
  },
116
  {
117
- "epoch": 2.56,
118
- "grad_norm": 0.29331278800964355,
119
  "learning_rate": 4.4444444444444447e-05,
120
- "loss": 0.1783,
121
  "step": 16
122
  },
123
  {
124
- "epoch": 2.72,
125
- "grad_norm": 0.26256364583969116,
126
  "learning_rate": 3.3333333333333335e-05,
127
- "loss": 0.1776,
128
  "step": 17
129
  },
130
  {
131
- "epoch": 2.88,
132
- "grad_norm": 0.2941267490386963,
133
  "learning_rate": 2.2222222222222223e-05,
134
- "loss": 0.1771,
135
  "step": 18
136
  },
137
  {
138
- "epoch": 3.04,
139
- "grad_norm": 0.269639790058136,
140
  "learning_rate": 1.1111111111111112e-05,
141
- "loss": 0.136,
142
  "step": 19
143
  },
144
  {
145
- "epoch": 3.2,
146
- "grad_norm": 0.36100998520851135,
147
  "learning_rate": 0.0,
148
- "loss": 0.1024,
149
  "step": 20
150
  },
151
  {
152
- "epoch": 3.2,
153
  "step": 20,
154
- "total_flos": 3146427009073152.0,
155
- "train_loss": 0.5001711931079627,
156
- "train_runtime": 93.2825,
157
- "train_samples_per_second": 0.858,
158
- "train_steps_per_second": 0.214
159
  }
160
  ],
161
  "logging_steps": 1,
162
  "max_steps": 20,
163
  "num_input_tokens_seen": 0,
164
- "num_train_epochs": 4,
165
  "save_steps": 500,
166
- "total_flos": 3146427009073152.0,
167
  "train_batch_size": 1,
168
  "trial_name": null,
169
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.625,
5
  "eval_steps": 500,
6
  "global_step": 20,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.03,
13
+ "grad_norm": 2.219240188598633,
14
  "learning_rate": 0.0001,
15
+ "loss": 2.4598,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.06,
20
+ "grad_norm": 2.17581844329834,
21
  "learning_rate": 0.0002,
22
+ "loss": 2.7545,
23
  "step": 2
24
  },
25
  {
26
+ "epoch": 0.09,
27
+ "grad_norm": 1.500047206878662,
28
  "learning_rate": 0.00018888888888888888,
29
+ "loss": 2.2534,
30
  "step": 3
31
  },
32
  {
33
+ "epoch": 0.12,
34
+ "grad_norm": 1.3228497505187988,
35
  "learning_rate": 0.00017777777777777779,
36
+ "loss": 2.1453,
37
  "step": 4
38
  },
39
  {
40
+ "epoch": 0.16,
41
+ "grad_norm": 1.1492576599121094,
42
  "learning_rate": 0.0001666666666666667,
43
+ "loss": 1.9039,
44
  "step": 5
45
  },
46
  {
47
+ "epoch": 0.19,
48
+ "grad_norm": 1.4337806701660156,
49
  "learning_rate": 0.00015555555555555556,
50
+ "loss": 2.0063,
51
  "step": 6
52
  },
53
  {
54
+ "epoch": 0.22,
55
+ "grad_norm": 0.8797003030776978,
56
  "learning_rate": 0.00014444444444444444,
57
+ "loss": 1.6465,
58
  "step": 7
59
  },
60
  {
61
+ "epoch": 0.25,
62
+ "grad_norm": 0.8188653588294983,
63
  "learning_rate": 0.00013333333333333334,
64
+ "loss": 2.0039,
65
  "step": 8
66
  },
67
  {
68
+ "epoch": 0.28,
69
+ "grad_norm": 1.0144400596618652,
70
  "learning_rate": 0.00012222222222222224,
71
+ "loss": 2.285,
72
  "step": 9
73
  },
74
  {
75
+ "epoch": 0.31,
76
+ "grad_norm": 1.009358286857605,
77
  "learning_rate": 0.00011111111111111112,
78
+ "loss": 2.0178,
79
  "step": 10
80
  },
81
  {
82
+ "epoch": 0.34,
83
+ "grad_norm": 1.2635082006454468,
84
  "learning_rate": 0.0001,
85
+ "loss": 1.8403,
86
  "step": 11
87
  },
88
  {
89
+ "epoch": 0.38,
90
+ "grad_norm": 1.0166571140289307,
91
  "learning_rate": 8.888888888888889e-05,
92
+ "loss": 1.9239,
93
  "step": 12
94
  },
95
  {
96
+ "epoch": 0.41,
97
+ "grad_norm": 0.8766961097717285,
98
  "learning_rate": 7.777777777777778e-05,
99
+ "loss": 1.9722,
100
  "step": 13
101
  },
102
  {
103
+ "epoch": 0.44,
104
+ "grad_norm": 0.8238506317138672,
105
  "learning_rate": 6.666666666666667e-05,
106
+ "loss": 2.0839,
107
  "step": 14
108
  },
109
  {
110
+ "epoch": 0.47,
111
+ "grad_norm": 0.7714257836341858,
112
  "learning_rate": 5.555555555555556e-05,
113
+ "loss": 1.6972,
114
  "step": 15
115
  },
116
  {
117
+ "epoch": 0.5,
118
+ "grad_norm": 0.8575778007507324,
119
  "learning_rate": 4.4444444444444447e-05,
120
+ "loss": 1.8562,
121
  "step": 16
122
  },
123
  {
124
+ "epoch": 0.53,
125
+ "grad_norm": 0.8333547115325928,
126
  "learning_rate": 3.3333333333333335e-05,
127
+ "loss": 1.8273,
128
  "step": 17
129
  },
130
  {
131
+ "epoch": 0.56,
132
+ "grad_norm": 0.7584834098815918,
133
  "learning_rate": 2.2222222222222223e-05,
134
+ "loss": 1.6062,
135
  "step": 18
136
  },
137
  {
138
+ "epoch": 0.59,
139
+ "grad_norm": 0.7320359349250793,
140
  "learning_rate": 1.1111111111111112e-05,
141
+ "loss": 1.6753,
142
  "step": 19
143
  },
144
  {
145
+ "epoch": 0.62,
146
+ "grad_norm": 0.835627555847168,
147
  "learning_rate": 0.0,
148
+ "loss": 1.9796,
149
  "step": 20
150
  },
151
  {
152
+ "epoch": 0.62,
153
  "step": 20,
154
+ "total_flos": 1616765879623680.0,
155
+ "train_loss": 1.9969253599643708,
156
+ "train_runtime": 79.7523,
157
+ "train_samples_per_second": 1.003,
158
+ "train_steps_per_second": 0.251
159
  }
160
  ],
161
  "logging_steps": 1,
162
  "max_steps": 20,
163
  "num_input_tokens_seen": 0,
164
+ "num_train_epochs": 1,
165
  "save_steps": 500,
166
+ "total_flos": 1616765879623680.0,
167
  "train_batch_size": 1,
168
  "trial_name": null,
169
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c80c831d7943e08a8eaaafc45a3783d9c4cb7bd0911dfb150712957d123b1631
3
  size 4920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d31c9963efb464f6b6e3ae1af32a5b56c57ec84938354490ffdd2333d234b7f
3
  size 4920