rayonlabs commited on
Commit
69f739e
·
verified ·
1 Parent(s): d3e47c1

Upload task output test1334test1234test1234test12334

Browse files
README.md CHANGED
@@ -89,7 +89,7 @@ xformers_attention: null
89
 
90
  This model was trained from scratch on the None dataset.
91
  It achieves the following results on the evaluation set:
92
- - Loss: 1.1497
93
 
94
  ## Model description
95
 
@@ -123,10 +123,10 @@ The following hyperparameters were used during training:
123
 
124
  | Training Loss | Epoch | Step | Validation Loss |
125
  |:-------------:|:------:|:----:|:---------------:|
126
- | No log | 0 | 0 | 1.1522 |
127
- | 1.1025 | 0.0372 | 3 | 1.1527 |
128
- | 1.8739 | 0.0743 | 6 | 1.1523 |
129
- | 0.8249 | 0.1115 | 9 | 1.1497 |
130
 
131
 
132
  ### Framework versions
 
89
 
90
  This model was trained from scratch on the None dataset.
91
  It achieves the following results on the evaluation set:
92
+ - Loss: 1.5465
93
 
94
  ## Model description
95
 
 
123
 
124
  | Training Loss | Epoch | Step | Validation Loss |
125
  |:-------------:|:------:|:----:|:---------------:|
126
+ | No log | 0 | 0 | 1.5762 |
127
+ | 1.607 | 0.0349 | 3 | 1.5754 |
128
+ | 1.2308 | 0.0698 | 6 | 1.5678 |
129
+ | 1.9635 | 0.1047 | 9 | 1.5465 |
130
 
131
 
132
  ### Framework versions
adapter_config.json CHANGED
@@ -24,12 +24,12 @@
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
27
- "gate_proj",
28
  "q_proj",
29
- "up_proj",
30
- "o_proj",
31
  "down_proj",
32
  "k_proj",
 
 
 
33
  "v_proj"
34
  ],
35
  "task_type": "CAUSAL_LM",
 
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
 
27
  "q_proj",
 
 
28
  "down_proj",
29
  "k_proj",
30
+ "up_proj",
31
+ "o_proj",
32
+ "gate_proj",
33
  "v_proj"
34
  ],
35
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5a40762bd6e2ffcbee731f58e2fe98e974c8a2a69c8ee41329fb0bc4b485a78e
3
  size 22573704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ce05b1e88df90aa681e0effc572a2d1bbdccf2fb246de4a8441110399d530bf
3
  size 22573704
checkpoint-10/adapter_config.json CHANGED
@@ -24,12 +24,12 @@
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
27
- "gate_proj",
28
  "q_proj",
29
- "up_proj",
30
- "o_proj",
31
  "down_proj",
32
  "k_proj",
 
 
 
33
  "v_proj"
34
  ],
35
  "task_type": "CAUSAL_LM",
 
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
 
27
  "q_proj",
 
 
28
  "down_proj",
29
  "k_proj",
30
+ "up_proj",
31
+ "o_proj",
32
+ "gate_proj",
33
  "v_proj"
34
  ],
35
  "task_type": "CAUSAL_LM",
checkpoint-10/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5a40762bd6e2ffcbee731f58e2fe98e974c8a2a69c8ee41329fb0bc4b485a78e
3
  size 22573704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ce05b1e88df90aa681e0effc572a2d1bbdccf2fb246de4a8441110399d530bf
3
  size 22573704
checkpoint-10/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a31ff666f84ed1ad343b0af8a91bab87a2b250e1fb6691af9a27591751f92ae
3
  size 11710970
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6be4a9397e797cabeae20a9da3191f428aa5f89735d3d5e1b0916fb3b7993889
3
  size 11710970
checkpoint-10/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.1238390092879257,
6
  "eval_steps": 3,
7
  "global_step": 10,
8
  "is_hyper_param_search": false,
@@ -11,104 +11,104 @@
11
  "log_history": [
12
  {
13
  "epoch": 0,
14
- "eval_loss": 1.152190923690796,
15
- "eval_runtime": 6.4433,
16
- "eval_samples_per_second": 5.277,
17
- "eval_steps_per_second": 2.638,
18
  "step": 0
19
  },
20
  {
21
- "epoch": 0.01238390092879257,
22
- "grad_norm": 0.4808008372783661,
23
  "learning_rate": 0.0,
24
- "loss": 0.7383,
25
  "step": 1
26
  },
27
  {
28
- "epoch": 0.02476780185758514,
29
- "grad_norm": 0.833384096622467,
30
  "learning_rate": 2e-05,
31
- "loss": 1.2104,
32
  "step": 2
33
  },
34
  {
35
- "epoch": 0.03715170278637771,
36
- "grad_norm": 0.48235809803009033,
37
  "learning_rate": 4e-05,
38
- "loss": 1.1025,
39
  "step": 3
40
  },
41
  {
42
- "epoch": 0.03715170278637771,
43
- "eval_loss": 1.1526859998703003,
44
- "eval_runtime": 6.1678,
45
- "eval_samples_per_second": 5.513,
46
- "eval_steps_per_second": 2.756,
47
  "step": 3
48
  },
49
  {
50
- "epoch": 0.04953560371517028,
51
- "grad_norm": 0.5662117004394531,
52
  "learning_rate": 6e-05,
53
- "loss": 0.9537,
54
  "step": 4
55
  },
56
  {
57
- "epoch": 0.06191950464396285,
58
- "grad_norm": 0.49891141057014465,
59
  "learning_rate": 8e-05,
60
- "loss": 1.1153,
61
  "step": 5
62
  },
63
  {
64
- "epoch": 0.07430340557275542,
65
- "grad_norm": 0.8236696124076843,
66
  "learning_rate": 0.0001,
67
- "loss": 1.8739,
68
  "step": 6
69
  },
70
  {
71
- "epoch": 0.07430340557275542,
72
- "eval_loss": 1.1522986888885498,
73
- "eval_runtime": 6.1818,
74
- "eval_samples_per_second": 5.5,
75
- "eval_steps_per_second": 2.75,
76
  "step": 6
77
  },
78
  {
79
- "epoch": 0.08668730650154799,
80
- "grad_norm": 0.5597956776618958,
81
  "learning_rate": 0.00012,
82
- "loss": 0.8133,
83
  "step": 7
84
  },
85
  {
86
- "epoch": 0.09907120743034056,
87
- "grad_norm": 0.4315759837627411,
88
  "learning_rate": 0.00014,
89
- "loss": 0.782,
90
  "step": 8
91
  },
92
  {
93
- "epoch": 0.11145510835913312,
94
- "grad_norm": 0.715858519077301,
95
  "learning_rate": 0.00016,
96
- "loss": 0.8249,
97
  "step": 9
98
  },
99
  {
100
- "epoch": 0.11145510835913312,
101
- "eval_loss": 1.1496660709381104,
102
- "eval_runtime": 6.1717,
103
- "eval_samples_per_second": 5.509,
104
- "eval_steps_per_second": 2.755,
105
  "step": 9
106
  },
107
  {
108
- "epoch": 0.1238390092879257,
109
- "grad_norm": 0.4885803759098053,
110
  "learning_rate": 0.00018,
111
- "loss": 0.63,
112
  "step": 10
113
  }
114
  ],
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.11627906976744186,
6
  "eval_steps": 3,
7
  "global_step": 10,
8
  "is_hyper_param_search": false,
 
11
  "log_history": [
12
  {
13
  "epoch": 0,
14
+ "eval_loss": 1.5762042999267578,
15
+ "eval_runtime": 6.5489,
16
+ "eval_samples_per_second": 5.65,
17
+ "eval_steps_per_second": 2.901,
18
  "step": 0
19
  },
20
  {
21
+ "epoch": 0.011627906976744186,
22
+ "grad_norm": 0.7823610901832581,
23
  "learning_rate": 0.0,
24
+ "loss": 1.3599,
25
  "step": 1
26
  },
27
  {
28
+ "epoch": 0.023255813953488372,
29
+ "grad_norm": 0.6125471591949463,
30
  "learning_rate": 2e-05,
31
+ "loss": 1.3481,
32
  "step": 2
33
  },
34
  {
35
+ "epoch": 0.03488372093023256,
36
+ "grad_norm": 0.7715550065040588,
37
  "learning_rate": 4e-05,
38
+ "loss": 1.607,
39
  "step": 3
40
  },
41
  {
42
+ "epoch": 0.03488372093023256,
43
+ "eval_loss": 1.575430154800415,
44
+ "eval_runtime": 6.3129,
45
+ "eval_samples_per_second": 5.861,
46
+ "eval_steps_per_second": 3.01,
47
  "step": 3
48
  },
49
  {
50
+ "epoch": 0.046511627906976744,
51
+ "grad_norm": 1.3339767456054688,
52
  "learning_rate": 6e-05,
53
+ "loss": 2.2903,
54
  "step": 4
55
  },
56
  {
57
+ "epoch": 0.05813953488372093,
58
+ "grad_norm": 0.8708456158638,
59
  "learning_rate": 8e-05,
60
+ "loss": 1.7924,
61
  "step": 5
62
  },
63
  {
64
+ "epoch": 0.06976744186046512,
65
+ "grad_norm": 0.6723288297653198,
66
  "learning_rate": 0.0001,
67
+ "loss": 1.2308,
68
  "step": 6
69
  },
70
  {
71
+ "epoch": 0.06976744186046512,
72
+ "eval_loss": 1.567781686782837,
73
+ "eval_runtime": 6.2826,
74
+ "eval_samples_per_second": 5.889,
75
+ "eval_steps_per_second": 3.024,
76
  "step": 6
77
  },
78
  {
79
+ "epoch": 0.08139534883720931,
80
+ "grad_norm": 0.6576961278915405,
81
  "learning_rate": 0.00012,
82
+ "loss": 1.5055,
83
  "step": 7
84
  },
85
  {
86
+ "epoch": 0.09302325581395349,
87
+ "grad_norm": 1.104688286781311,
88
  "learning_rate": 0.00014,
89
+ "loss": 1.7391,
90
  "step": 8
91
  },
92
  {
93
+ "epoch": 0.10465116279069768,
94
+ "grad_norm": 0.8543928265571594,
95
  "learning_rate": 0.00016,
96
+ "loss": 1.9635,
97
  "step": 9
98
  },
99
  {
100
+ "epoch": 0.10465116279069768,
101
+ "eval_loss": 1.546512246131897,
102
+ "eval_runtime": 6.2614,
103
+ "eval_samples_per_second": 5.909,
104
+ "eval_steps_per_second": 3.034,
105
  "step": 9
106
  },
107
  {
108
+ "epoch": 0.11627906976744186,
109
+ "grad_norm": 1.458423376083374,
110
  "learning_rate": 0.00018,
111
+ "loss": 1.9448,
112
  "step": 10
113
  }
114
  ],
checkpoint-10/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2fe3d956212bf11f8de80d250182cbc6e86552289b4d65d2c04058778f53bb4b
3
  size 7096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59749e575ec6b8bcbb1146ed41a833f989f4544824be840d5def832931454250
3
  size 7096
checkpoint-3/adapter_config.json CHANGED
@@ -24,12 +24,12 @@
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
27
- "gate_proj",
28
  "q_proj",
29
- "up_proj",
30
- "o_proj",
31
  "down_proj",
32
  "k_proj",
 
 
 
33
  "v_proj"
34
  ],
35
  "task_type": "CAUSAL_LM",
 
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
 
27
  "q_proj",
 
 
28
  "down_proj",
29
  "k_proj",
30
+ "up_proj",
31
+ "o_proj",
32
+ "gate_proj",
33
  "v_proj"
34
  ],
35
  "task_type": "CAUSAL_LM",
checkpoint-3/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:838cd3cb8a3a19af1bf710ec0e0c40f72d12b22a8a6129623a9c475a603c5c9a
3
  size 22573704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:299a7fca611e1bea4bd022dbe5b4378d3f40af56526db534ee6e62f60e31605d
3
  size 22573704
checkpoint-3/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d9bdf1ee2b8eab1f8e804a1d7af5bae65c1467d66f1e26da4f77ae4dd3314d56
3
  size 11710970
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5563776776d37a1b4e6214856304f010ab346eb3d2e5f2dede74ba638ee2aa7
3
  size 11710970
checkpoint-3/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.03715170278637771,
6
  "eval_steps": 3,
7
  "global_step": 3,
8
  "is_hyper_param_search": false,
@@ -11,39 +11,39 @@
11
  "log_history": [
12
  {
13
  "epoch": 0,
14
- "eval_loss": 1.152190923690796,
15
- "eval_runtime": 6.4433,
16
- "eval_samples_per_second": 5.277,
17
- "eval_steps_per_second": 2.638,
18
  "step": 0
19
  },
20
  {
21
- "epoch": 0.01238390092879257,
22
- "grad_norm": 0.4808008372783661,
23
  "learning_rate": 0.0,
24
- "loss": 0.7383,
25
  "step": 1
26
  },
27
  {
28
- "epoch": 0.02476780185758514,
29
- "grad_norm": 0.833384096622467,
30
  "learning_rate": 2e-05,
31
- "loss": 1.2104,
32
  "step": 2
33
  },
34
  {
35
- "epoch": 0.03715170278637771,
36
- "grad_norm": 0.48235809803009033,
37
  "learning_rate": 4e-05,
38
- "loss": 1.1025,
39
  "step": 3
40
  },
41
  {
42
- "epoch": 0.03715170278637771,
43
- "eval_loss": 1.1526859998703003,
44
- "eval_runtime": 6.1678,
45
- "eval_samples_per_second": 5.513,
46
- "eval_steps_per_second": 2.756,
47
  "step": 3
48
  }
49
  ],
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.03488372093023256,
6
  "eval_steps": 3,
7
  "global_step": 3,
8
  "is_hyper_param_search": false,
 
11
  "log_history": [
12
  {
13
  "epoch": 0,
14
+ "eval_loss": 1.5762042999267578,
15
+ "eval_runtime": 6.5489,
16
+ "eval_samples_per_second": 5.65,
17
+ "eval_steps_per_second": 2.901,
18
  "step": 0
19
  },
20
  {
21
+ "epoch": 0.011627906976744186,
22
+ "grad_norm": 0.7823610901832581,
23
  "learning_rate": 0.0,
24
+ "loss": 1.3599,
25
  "step": 1
26
  },
27
  {
28
+ "epoch": 0.023255813953488372,
29
+ "grad_norm": 0.6125471591949463,
30
  "learning_rate": 2e-05,
31
+ "loss": 1.3481,
32
  "step": 2
33
  },
34
  {
35
+ "epoch": 0.03488372093023256,
36
+ "grad_norm": 0.7715550065040588,
37
  "learning_rate": 4e-05,
38
+ "loss": 1.607,
39
  "step": 3
40
  },
41
  {
42
+ "epoch": 0.03488372093023256,
43
+ "eval_loss": 1.575430154800415,
44
+ "eval_runtime": 6.3129,
45
+ "eval_samples_per_second": 5.861,
46
+ "eval_steps_per_second": 3.01,
47
  "step": 3
48
  }
49
  ],
checkpoint-3/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2fe3d956212bf11f8de80d250182cbc6e86552289b4d65d2c04058778f53bb4b
3
  size 7096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59749e575ec6b8bcbb1146ed41a833f989f4544824be840d5def832931454250
3
  size 7096
checkpoint-6/adapter_config.json CHANGED
@@ -24,12 +24,12 @@
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
27
- "gate_proj",
28
  "q_proj",
29
- "up_proj",
30
- "o_proj",
31
  "down_proj",
32
  "k_proj",
 
 
 
33
  "v_proj"
34
  ],
35
  "task_type": "CAUSAL_LM",
 
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
 
27
  "q_proj",
 
 
28
  "down_proj",
29
  "k_proj",
30
+ "up_proj",
31
+ "o_proj",
32
+ "gate_proj",
33
  "v_proj"
34
  ],
35
  "task_type": "CAUSAL_LM",
checkpoint-6/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4b0af1dad4b6f86eda74b7d85593def2e1382e2a2d87fb4920e1edf6c7d259e
3
  size 22573704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5eb9e1b8b721f99ed9f8155333fc3118cd26926cfeeb795187c0e1c2783c2ca4
3
  size 22573704
checkpoint-6/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:93a5eece02971aad02d2d83f7eeaa735564380d25d7bfd55c80bd3de7397d6c5
3
  size 11710970
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:985d63bca847c470454805da5e04eeb46811f288469dee785a1492c42a154f99
3
  size 11710970
checkpoint-6/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.07430340557275542,
6
  "eval_steps": 3,
7
  "global_step": 6,
8
  "is_hyper_param_search": false,
@@ -11,68 +11,68 @@
11
  "log_history": [
12
  {
13
  "epoch": 0,
14
- "eval_loss": 1.152190923690796,
15
- "eval_runtime": 6.4433,
16
- "eval_samples_per_second": 5.277,
17
- "eval_steps_per_second": 2.638,
18
  "step": 0
19
  },
20
  {
21
- "epoch": 0.01238390092879257,
22
- "grad_norm": 0.4808008372783661,
23
  "learning_rate": 0.0,
24
- "loss": 0.7383,
25
  "step": 1
26
  },
27
  {
28
- "epoch": 0.02476780185758514,
29
- "grad_norm": 0.833384096622467,
30
  "learning_rate": 2e-05,
31
- "loss": 1.2104,
32
  "step": 2
33
  },
34
  {
35
- "epoch": 0.03715170278637771,
36
- "grad_norm": 0.48235809803009033,
37
  "learning_rate": 4e-05,
38
- "loss": 1.1025,
39
  "step": 3
40
  },
41
  {
42
- "epoch": 0.03715170278637771,
43
- "eval_loss": 1.1526859998703003,
44
- "eval_runtime": 6.1678,
45
- "eval_samples_per_second": 5.513,
46
- "eval_steps_per_second": 2.756,
47
  "step": 3
48
  },
49
  {
50
- "epoch": 0.04953560371517028,
51
- "grad_norm": 0.5662117004394531,
52
  "learning_rate": 6e-05,
53
- "loss": 0.9537,
54
  "step": 4
55
  },
56
  {
57
- "epoch": 0.06191950464396285,
58
- "grad_norm": 0.49891141057014465,
59
  "learning_rate": 8e-05,
60
- "loss": 1.1153,
61
  "step": 5
62
  },
63
  {
64
- "epoch": 0.07430340557275542,
65
- "grad_norm": 0.8236696124076843,
66
  "learning_rate": 0.0001,
67
- "loss": 1.8739,
68
  "step": 6
69
  },
70
  {
71
- "epoch": 0.07430340557275542,
72
- "eval_loss": 1.1522986888885498,
73
- "eval_runtime": 6.1818,
74
- "eval_samples_per_second": 5.5,
75
- "eval_steps_per_second": 2.75,
76
  "step": 6
77
  }
78
  ],
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.06976744186046512,
6
  "eval_steps": 3,
7
  "global_step": 6,
8
  "is_hyper_param_search": false,
 
11
  "log_history": [
12
  {
13
  "epoch": 0,
14
+ "eval_loss": 1.5762042999267578,
15
+ "eval_runtime": 6.5489,
16
+ "eval_samples_per_second": 5.65,
17
+ "eval_steps_per_second": 2.901,
18
  "step": 0
19
  },
20
  {
21
+ "epoch": 0.011627906976744186,
22
+ "grad_norm": 0.7823610901832581,
23
  "learning_rate": 0.0,
24
+ "loss": 1.3599,
25
  "step": 1
26
  },
27
  {
28
+ "epoch": 0.023255813953488372,
29
+ "grad_norm": 0.6125471591949463,
30
  "learning_rate": 2e-05,
31
+ "loss": 1.3481,
32
  "step": 2
33
  },
34
  {
35
+ "epoch": 0.03488372093023256,
36
+ "grad_norm": 0.7715550065040588,
37
  "learning_rate": 4e-05,
38
+ "loss": 1.607,
39
  "step": 3
40
  },
41
  {
42
+ "epoch": 0.03488372093023256,
43
+ "eval_loss": 1.575430154800415,
44
+ "eval_runtime": 6.3129,
45
+ "eval_samples_per_second": 5.861,
46
+ "eval_steps_per_second": 3.01,
47
  "step": 3
48
  },
49
  {
50
+ "epoch": 0.046511627906976744,
51
+ "grad_norm": 1.3339767456054688,
52
  "learning_rate": 6e-05,
53
+ "loss": 2.2903,
54
  "step": 4
55
  },
56
  {
57
+ "epoch": 0.05813953488372093,
58
+ "grad_norm": 0.8708456158638,
59
  "learning_rate": 8e-05,
60
+ "loss": 1.7924,
61
  "step": 5
62
  },
63
  {
64
+ "epoch": 0.06976744186046512,
65
+ "grad_norm": 0.6723288297653198,
66
  "learning_rate": 0.0001,
67
+ "loss": 1.2308,
68
  "step": 6
69
  },
70
  {
71
+ "epoch": 0.06976744186046512,
72
+ "eval_loss": 1.567781686782837,
73
+ "eval_runtime": 6.2826,
74
+ "eval_samples_per_second": 5.889,
75
+ "eval_steps_per_second": 3.024,
76
  "step": 6
77
  }
78
  ],
checkpoint-6/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2fe3d956212bf11f8de80d250182cbc6e86552289b4d65d2c04058778f53bb4b
3
  size 7096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59749e575ec6b8bcbb1146ed41a833f989f4544824be840d5def832931454250
3
  size 7096
checkpoint-9/adapter_config.json CHANGED
@@ -24,12 +24,12 @@
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
27
- "gate_proj",
28
  "q_proj",
29
- "up_proj",
30
- "o_proj",
31
  "down_proj",
32
  "k_proj",
 
 
 
33
  "v_proj"
34
  ],
35
  "task_type": "CAUSAL_LM",
 
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
 
27
  "q_proj",
 
 
28
  "down_proj",
29
  "k_proj",
30
+ "up_proj",
31
+ "o_proj",
32
+ "gate_proj",
33
  "v_proj"
34
  ],
35
  "task_type": "CAUSAL_LM",
checkpoint-9/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:52dca75f07d7c168a5c97a36f909791acdb197ae822cf12cec8c24e09f1fd096
3
  size 22573704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:556d5755f4df397ce374e7b3d9135d78fb073bd6b4f73c564434448b7cd13f0a
3
  size 22573704
checkpoint-9/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7b1796971b5bb3f203cfbf8bd010825db21264a2517271b552c14387899d97a
3
  size 11710970
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33e5bc39d9288161a4f142cd2367ee0d530fe9db3385de7443747ea7983a8dcd
3
  size 11710970
checkpoint-9/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.11145510835913312,
6
  "eval_steps": 3,
7
  "global_step": 9,
8
  "is_hyper_param_search": false,
@@ -11,97 +11,97 @@
11
  "log_history": [
12
  {
13
  "epoch": 0,
14
- "eval_loss": 1.152190923690796,
15
- "eval_runtime": 6.4433,
16
- "eval_samples_per_second": 5.277,
17
- "eval_steps_per_second": 2.638,
18
  "step": 0
19
  },
20
  {
21
- "epoch": 0.01238390092879257,
22
- "grad_norm": 0.4808008372783661,
23
  "learning_rate": 0.0,
24
- "loss": 0.7383,
25
  "step": 1
26
  },
27
  {
28
- "epoch": 0.02476780185758514,
29
- "grad_norm": 0.833384096622467,
30
  "learning_rate": 2e-05,
31
- "loss": 1.2104,
32
  "step": 2
33
  },
34
  {
35
- "epoch": 0.03715170278637771,
36
- "grad_norm": 0.48235809803009033,
37
  "learning_rate": 4e-05,
38
- "loss": 1.1025,
39
  "step": 3
40
  },
41
  {
42
- "epoch": 0.03715170278637771,
43
- "eval_loss": 1.1526859998703003,
44
- "eval_runtime": 6.1678,
45
- "eval_samples_per_second": 5.513,
46
- "eval_steps_per_second": 2.756,
47
  "step": 3
48
  },
49
  {
50
- "epoch": 0.04953560371517028,
51
- "grad_norm": 0.5662117004394531,
52
  "learning_rate": 6e-05,
53
- "loss": 0.9537,
54
  "step": 4
55
  },
56
  {
57
- "epoch": 0.06191950464396285,
58
- "grad_norm": 0.49891141057014465,
59
  "learning_rate": 8e-05,
60
- "loss": 1.1153,
61
  "step": 5
62
  },
63
  {
64
- "epoch": 0.07430340557275542,
65
- "grad_norm": 0.8236696124076843,
66
  "learning_rate": 0.0001,
67
- "loss": 1.8739,
68
  "step": 6
69
  },
70
  {
71
- "epoch": 0.07430340557275542,
72
- "eval_loss": 1.1522986888885498,
73
- "eval_runtime": 6.1818,
74
- "eval_samples_per_second": 5.5,
75
- "eval_steps_per_second": 2.75,
76
  "step": 6
77
  },
78
  {
79
- "epoch": 0.08668730650154799,
80
- "grad_norm": 0.5597956776618958,
81
  "learning_rate": 0.00012,
82
- "loss": 0.8133,
83
  "step": 7
84
  },
85
  {
86
- "epoch": 0.09907120743034056,
87
- "grad_norm": 0.4315759837627411,
88
  "learning_rate": 0.00014,
89
- "loss": 0.782,
90
  "step": 8
91
  },
92
  {
93
- "epoch": 0.11145510835913312,
94
- "grad_norm": 0.715858519077301,
95
  "learning_rate": 0.00016,
96
- "loss": 0.8249,
97
  "step": 9
98
  },
99
  {
100
- "epoch": 0.11145510835913312,
101
- "eval_loss": 1.1496660709381104,
102
- "eval_runtime": 6.1717,
103
- "eval_samples_per_second": 5.509,
104
- "eval_steps_per_second": 2.755,
105
  "step": 9
106
  }
107
  ],
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.10465116279069768,
6
  "eval_steps": 3,
7
  "global_step": 9,
8
  "is_hyper_param_search": false,
 
11
  "log_history": [
12
  {
13
  "epoch": 0,
14
+ "eval_loss": 1.5762042999267578,
15
+ "eval_runtime": 6.5489,
16
+ "eval_samples_per_second": 5.65,
17
+ "eval_steps_per_second": 2.901,
18
  "step": 0
19
  },
20
  {
21
+ "epoch": 0.011627906976744186,
22
+ "grad_norm": 0.7823610901832581,
23
  "learning_rate": 0.0,
24
+ "loss": 1.3599,
25
  "step": 1
26
  },
27
  {
28
+ "epoch": 0.023255813953488372,
29
+ "grad_norm": 0.6125471591949463,
30
  "learning_rate": 2e-05,
31
+ "loss": 1.3481,
32
  "step": 2
33
  },
34
  {
35
+ "epoch": 0.03488372093023256,
36
+ "grad_norm": 0.7715550065040588,
37
  "learning_rate": 4e-05,
38
+ "loss": 1.607,
39
  "step": 3
40
  },
41
  {
42
+ "epoch": 0.03488372093023256,
43
+ "eval_loss": 1.575430154800415,
44
+ "eval_runtime": 6.3129,
45
+ "eval_samples_per_second": 5.861,
46
+ "eval_steps_per_second": 3.01,
47
  "step": 3
48
  },
49
  {
50
+ "epoch": 0.046511627906976744,
51
+ "grad_norm": 1.3339767456054688,
52
  "learning_rate": 6e-05,
53
+ "loss": 2.2903,
54
  "step": 4
55
  },
56
  {
57
+ "epoch": 0.05813953488372093,
58
+ "grad_norm": 0.8708456158638,
59
  "learning_rate": 8e-05,
60
+ "loss": 1.7924,
61
  "step": 5
62
  },
63
  {
64
+ "epoch": 0.06976744186046512,
65
+ "grad_norm": 0.6723288297653198,
66
  "learning_rate": 0.0001,
67
+ "loss": 1.2308,
68
  "step": 6
69
  },
70
  {
71
+ "epoch": 0.06976744186046512,
72
+ "eval_loss": 1.567781686782837,
73
+ "eval_runtime": 6.2826,
74
+ "eval_samples_per_second": 5.889,
75
+ "eval_steps_per_second": 3.024,
76
  "step": 6
77
  },
78
  {
79
+ "epoch": 0.08139534883720931,
80
+ "grad_norm": 0.6576961278915405,
81
  "learning_rate": 0.00012,
82
+ "loss": 1.5055,
83
  "step": 7
84
  },
85
  {
86
+ "epoch": 0.09302325581395349,
87
+ "grad_norm": 1.104688286781311,
88
  "learning_rate": 0.00014,
89
+ "loss": 1.7391,
90
  "step": 8
91
  },
92
  {
93
+ "epoch": 0.10465116279069768,
94
+ "grad_norm": 0.8543928265571594,
95
  "learning_rate": 0.00016,
96
+ "loss": 1.9635,
97
  "step": 9
98
  },
99
  {
100
+ "epoch": 0.10465116279069768,
101
+ "eval_loss": 1.546512246131897,
102
+ "eval_runtime": 6.2614,
103
+ "eval_samples_per_second": 5.909,
104
+ "eval_steps_per_second": 3.034,
105
  "step": 9
106
  }
107
  ],
checkpoint-9/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2fe3d956212bf11f8de80d250182cbc6e86552289b4d65d2c04058778f53bb4b
3
  size 7096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59749e575ec6b8bcbb1146ed41a833f989f4544824be840d5def832931454250
3
  size 7096