bsteveb committed on
Commit
2231611
·
verified ·
1 Parent(s): e5f0f5a

bsteveb/T11

Browse files
README.md CHANGED
@@ -37,7 +37,7 @@ More information needed
37
 
38
  The following hyperparameters were used during training:
39
  - learning_rate: 0.0001
40
- - train_batch_size: 4
41
  - eval_batch_size: 16
42
  - seed: 3407
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 
37
 
38
  The following hyperparameters were used during training:
39
  - learning_rate: 0.0001
40
+ - train_batch_size: 1
41
  - eval_batch_size: 16
42
  - seed: 3407
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "v_proj",
27
- "gate_proj",
28
- "k_proj",
29
  "down_proj",
 
 
30
  "o_proj",
31
  "up_proj",
32
- "q_proj"
 
33
  ],
34
  "task_type": null,
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
 
 
26
  "down_proj",
27
+ "v_proj",
28
+ "q_proj",
29
  "o_proj",
30
  "up_proj",
31
+ "k_proj",
32
+ "gate_proj"
33
  ],
34
  "task_type": null,
35
  "use_dora": false,
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 0.0,
4
- "train_loss": 1.4275426094124957,
5
- "train_runtime": 39.9532,
6
- "train_samples_per_second": 4.055,
7
- "train_steps_per_second": 1.026
8
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.0,
5
+ "train_runtime": 0.0067,
6
+ "train_samples_per_second": 24200.493,
7
+ "train_steps_per_second": 12100.247
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a2d2a0239ffbd585ccde826896aa48275ff5a8a8c4a5bfef31dc5587b90d49a
3
  size 3554214752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:725ade8b1758cf5ff9fa1a412e0d1a086bd8ec7737e6ac69515046e33a9922f6
3
  size 3554214752
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 0.0,
4
- "train_loss": 1.4275426094124957,
5
- "train_runtime": 39.9532,
6
- "train_samples_per_second": 4.055,
7
- "train_steps_per_second": 1.026
8
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.0,
5
+ "train_runtime": 0.0067,
6
+ "train_samples_per_second": 24200.493,
7
+ "train_steps_per_second": 12100.247
8
  }
trainer_state.json CHANGED
@@ -3,58 +3,79 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 41,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.24390243902439024,
13
- "grad_norm": 0.3692132830619812,
14
- "learning_rate": 7.560975609756099e-05,
15
- "loss": 5.4339,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.4878048780487805,
20
- "grad_norm": 0.22680561244487762,
21
- "learning_rate": 5.121951219512195e-05,
22
- "loss": 0.1276,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.7317073170731707,
27
- "grad_norm": 0.1607717126607895,
28
- "learning_rate": 2.682926829268293e-05,
29
- "loss": 0.1394,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.975609756097561,
34
- "grad_norm": 0.14807815849781036,
35
- "learning_rate": 2.4390243902439027e-06,
36
- "loss": 0.14,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 1.0,
41
- "eval_runtime": 4.2052,
42
- "eval_samples_per_second": 9.75,
43
- "eval_steps_per_second": 0.713,
44
- "step": 41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  },
46
  {
47
  "epoch": 1.0,
48
- "step": 41,
49
  "total_flos": 0.0,
50
- "train_loss": 1.4275426094124957,
51
- "train_runtime": 39.9532,
52
- "train_samples_per_second": 4.055,
53
- "train_steps_per_second": 1.026
54
  }
55
  ],
56
  "logging_steps": 10,
57
- "max_steps": 41,
58
  "num_input_tokens_seen": 0,
59
  "num_train_epochs": 1,
60
  "save_steps": 500,
@@ -71,7 +92,7 @@
71
  }
72
  },
73
  "total_flos": 0.0,
74
- "train_batch_size": 4,
75
  "trial_name": null,
76
  "trial_params": null
77
  }
 
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 81,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.12345679012345678,
13
+ "grad_norm": 0.25058820843696594,
14
+ "learning_rate": 8.765432098765433e-05,
15
+ "loss": 5.3001,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.24691358024691357,
20
+ "grad_norm": 0.19951799511909485,
21
+ "learning_rate": 7.530864197530865e-05,
22
+ "loss": 0.1324,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.37037037037037035,
27
+ "grad_norm": 0.1762804090976715,
28
+ "learning_rate": 6.296296296296296e-05,
29
+ "loss": 0.1161,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.49382716049382713,
34
+ "grad_norm": 0.26880860328674316,
35
+ "learning_rate": 5.061728395061729e-05,
36
+ "loss": 0.1252,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.6172839506172839,
41
+ "grad_norm": 0.23334389925003052,
42
+ "learning_rate": 3.82716049382716e-05,
43
+ "loss": 0.1519,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.7407407407407407,
48
+ "grad_norm": 0.16134235262870789,
49
+ "learning_rate": 2.5925925925925925e-05,
50
+ "loss": 0.1167,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.8641975308641975,
55
+ "grad_norm": 0.1925862729549408,
56
+ "learning_rate": 1.3580246913580247e-05,
57
+ "loss": 0.1497,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.9876543209876543,
62
+ "grad_norm": 0.17563721537590027,
63
+ "learning_rate": 1.234567901234568e-06,
64
+ "loss": 0.1209,
65
+ "step": 80
66
  },
67
  {
68
  "epoch": 1.0,
69
+ "step": 81,
70
  "total_flos": 0.0,
71
+ "train_loss": 0.0,
72
+ "train_runtime": 0.0067,
73
+ "train_samples_per_second": 24200.493,
74
+ "train_steps_per_second": 12100.247
75
  }
76
  ],
77
  "logging_steps": 10,
78
+ "max_steps": 81,
79
  "num_input_tokens_seen": 0,
80
  "num_train_epochs": 1,
81
  "save_steps": 500,
 
92
  }
93
  },
94
  "total_flos": 0.0,
95
+ "train_batch_size": 2,
96
  "trial_name": null,
97
  "trial_params": null
98
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:10e9e7a99dc632c55c00de4f1e524544d7f8da3b4e3793d42e768f0ca0cd0dad
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d18d3eae0c6d04fdb9345db767d4c2b090f1fbcd0661ec1d1fbb72b5ec28858e
3
  size 5432