CharlesLi committed on
Commit
3ee2afb
·
verified ·
1 Parent(s): 39c8779

Model save

Browse files
Files changed (4) hide show
  1. README.md +12 -10
  2. all_results.json +6 -11
  3. train_results.json +6 -6
  4. trainer_state.json +70 -39
README.md CHANGED
@@ -3,7 +3,6 @@ library_name: peft
3
  license: apache-2.0
4
  base_model: mistralai/Mistral-7B-Instruct-v0.1
5
  tags:
6
- - alignment-handbook
7
  - trl
8
  - sft
9
  - generated_from_trainer
@@ -21,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
21
 
22
  This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) on the generator dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 0.6954
25
 
26
  ## Model description
27
 
@@ -52,17 +51,20 @@ The following hyperparameters were used during training:
52
  - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
53
  - lr_scheduler_type: cosine
54
  - lr_scheduler_warmup_ratio: 0.1
55
- - num_epochs: 10
56
 
57
  ### Training results
58
 
59
- | Training Loss | Epoch | Step | Validation Loss |
60
- |:-------------:|:-----:|:----:|:---------------:|
61
- | 1.9369 | 1.0 | 2 | 0.9818 |
62
- | 1.9369 | 2.0 | 4 | 0.7695 |
63
- | 1.2233 | 3.0 | 6 | 0.7315 |
64
- | 1.2233 | 4.0 | 8 | 0.7013 |
65
- | 0.8937 | 5.0 | 10 | 0.6954 |
 
 
 
66
 
67
 
68
  ### Framework versions
 
3
  license: apache-2.0
4
  base_model: mistralai/Mistral-7B-Instruct-v0.1
5
  tags:
 
6
  - trl
7
  - sft
8
  - generated_from_trainer
 
20
 
21
  This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 0.6346
24
 
25
  ## Model description
26
 
 
51
  - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
52
  - lr_scheduler_type: cosine
53
  - lr_scheduler_warmup_ratio: 0.1
54
+ - num_epochs: 15
55
 
56
  ### Training results
57
 
58
+ | Training Loss | Epoch | Step | Validation Loss |
59
+ |:-------------:|:------:|:----:|:---------------:|
60
+ | 1.9369 | 1.0 | 2 | 0.9818 |
61
+ | 1.9369 | 2.0 | 4 | 0.7654 |
62
+ | 1.2187 | 3.0 | 6 | 0.7218 |
63
+ | 1.2187 | 4.0 | 8 | 0.6669 |
64
+ | 0.8351 | 5.0 | 10 | 0.6480 |
65
+ | 0.8351 | 6.0 | 12 | 0.6381 |
66
+ | 0.8351 | 7.0 | 14 | 0.6353 |
67
+ | 0.7893 | 7.6667 | 15 | 0.6346 |
68
 
69
 
70
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
- "epoch": 5.0,
3
- "eval_loss": 0.6953997015953064,
4
- "eval_runtime": 0.8608,
5
- "eval_samples": 20,
6
- "eval_samples_per_second": 3.485,
7
- "eval_steps_per_second": 1.162,
8
- "total_flos": 5536776290566144.0,
9
- "train_loss": 1.1298380970954895,
10
- "train_runtime": 63.6283,
11
  "train_samples": 100,
12
- "train_samples_per_second": 3.143,
13
- "train_steps_per_second": 0.157
14
  }
 
1
  {
2
+ "epoch": 7.666666666666667,
3
+ "total_flos": 8305281071054848.0,
4
+ "train_loss": 0.9955663760503133,
5
+ "train_runtime": 87.8629,
 
 
 
 
 
6
  "train_samples": 100,
7
+ "train_samples_per_second": 3.414,
8
+ "train_steps_per_second": 0.171
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 5.0,
3
- "total_flos": 5536776290566144.0,
4
- "train_loss": 1.1298380970954895,
5
- "train_runtime": 63.6283,
6
  "train_samples": 100,
7
- "train_samples_per_second": 3.143,
8
- "train_steps_per_second": 0.157
9
  }
 
1
  {
2
+ "epoch": 7.666666666666667,
3
+ "total_flos": 8305281071054848.0,
4
+ "train_loss": 0.9955663760503133,
5
+ "train_runtime": 87.8629,
6
  "train_samples": 100,
7
+ "train_samples_per_second": 3.414,
8
+ "train_steps_per_second": 0.171
9
  }
trainer_state.json CHANGED
@@ -1,88 +1,119 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 5.0,
5
  "eval_steps": 500,
6
- "global_step": 10,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.6666666666666666,
13
- "grad_norm": 0.8433068619524136,
14
- "learning_rate": 0.0002,
15
  "loss": 1.9369,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 1.0,
20
  "eval_loss": 0.9818150401115417,
21
- "eval_runtime": 0.7579,
22
- "eval_samples_per_second": 3.958,
23
- "eval_steps_per_second": 1.319,
24
  "step": 2
25
  },
26
  {
27
  "epoch": 2.0,
28
- "eval_loss": 0.7694724202156067,
29
- "eval_runtime": 0.8108,
30
- "eval_samples_per_second": 3.7,
31
- "eval_steps_per_second": 1.233,
32
  "step": 4
33
  },
34
  {
35
  "epoch": 2.6666666666666665,
36
- "grad_norm": 0.39134348931417895,
37
- "learning_rate": 0.00011736481776669306,
38
- "loss": 1.2233,
39
  "step": 5
40
  },
41
  {
42
  "epoch": 3.0,
43
- "eval_loss": 0.7314517498016357,
44
- "eval_runtime": 0.7623,
45
- "eval_samples_per_second": 3.935,
46
- "eval_steps_per_second": 1.312,
47
  "step": 6
48
  },
49
  {
50
  "epoch": 4.0,
51
- "eval_loss": 0.7012591361999512,
52
- "eval_runtime": 0.8133,
53
- "eval_samples_per_second": 3.688,
54
- "eval_steps_per_second": 1.229,
55
  "step": 8
56
  },
57
  {
58
  "epoch": 5.0,
59
- "grad_norm": 0.27545457961569547,
60
- "learning_rate": 0.0,
61
- "loss": 0.8937,
62
  "step": 10
63
  },
64
  {
65
  "epoch": 5.0,
66
- "eval_loss": 0.6953997015953064,
67
- "eval_runtime": 0.8422,
68
- "eval_samples_per_second": 3.562,
69
- "eval_steps_per_second": 1.187,
70
  "step": 10
71
  },
72
  {
73
- "epoch": 5.0,
74
- "step": 10,
75
- "total_flos": 5536776290566144.0,
76
- "train_loss": 1.1298380970954895,
77
- "train_runtime": 63.6283,
78
- "train_samples_per_second": 3.143,
79
- "train_steps_per_second": 0.157
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  }
81
  ],
82
  "logging_steps": 5,
83
- "max_steps": 10,
84
  "num_input_tokens_seen": 0,
85
- "num_train_epochs": 10,
86
  "save_steps": 1000,
87
  "stateful_callbacks": {
88
  "TrainerControl": {
@@ -96,7 +127,7 @@
96
  "attributes": {}
97
  }
98
  },
99
- "total_flos": 5536776290566144.0,
100
  "train_batch_size": 4,
101
  "trial_name": null,
102
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 7.666666666666667,
5
  "eval_steps": 500,
6
+ "global_step": 15,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.6666666666666666,
13
+ "grad_norm": 0.8434562800389038,
14
+ "learning_rate": 0.0001,
15
  "loss": 1.9369,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 1.0,
20
  "eval_loss": 0.9818150401115417,
21
+ "eval_runtime": 0.7574,
22
+ "eval_samples_per_second": 3.961,
23
+ "eval_steps_per_second": 1.32,
24
  "step": 2
25
  },
26
  {
27
  "epoch": 2.0,
28
+ "eval_loss": 0.7653645873069763,
29
+ "eval_runtime": 0.812,
30
+ "eval_samples_per_second": 3.695,
31
+ "eval_steps_per_second": 1.232,
32
  "step": 4
33
  },
34
  {
35
  "epoch": 2.6666666666666665,
36
+ "grad_norm": 0.38835903193953863,
37
+ "learning_rate": 0.00017485107481711012,
38
+ "loss": 1.2187,
39
  "step": 5
40
  },
41
  {
42
  "epoch": 3.0,
43
+ "eval_loss": 0.7217903137207031,
44
+ "eval_runtime": 0.7628,
45
+ "eval_samples_per_second": 3.933,
46
+ "eval_steps_per_second": 1.311,
47
  "step": 6
48
  },
49
  {
50
  "epoch": 4.0,
51
+ "eval_loss": 0.6669471859931946,
52
+ "eval_runtime": 0.816,
53
+ "eval_samples_per_second": 3.677,
54
+ "eval_steps_per_second": 1.226,
55
  "step": 8
56
  },
57
  {
58
  "epoch": 5.0,
59
+ "grad_norm": 0.2836544204991338,
60
+ "learning_rate": 6.453951129574644e-05,
61
+ "loss": 0.8351,
62
  "step": 10
63
  },
64
  {
65
  "epoch": 5.0,
66
+ "eval_loss": 0.648018479347229,
67
+ "eval_runtime": 0.7628,
68
+ "eval_samples_per_second": 3.933,
69
+ "eval_steps_per_second": 1.311,
70
  "step": 10
71
  },
72
  {
73
+ "epoch": 6.0,
74
+ "eval_loss": 0.6381418704986572,
75
+ "eval_runtime": 0.8152,
76
+ "eval_samples_per_second": 3.68,
77
+ "eval_steps_per_second": 1.227,
78
+ "step": 12
79
+ },
80
+ {
81
+ "epoch": 7.0,
82
+ "eval_loss": 0.6352726221084595,
83
+ "eval_runtime": 0.7641,
84
+ "eval_samples_per_second": 3.926,
85
+ "eval_steps_per_second": 1.309,
86
+ "step": 14
87
+ },
88
+ {
89
+ "epoch": 7.666666666666667,
90
+ "grad_norm": 0.2657040105968121,
91
+ "learning_rate": 0.0,
92
+ "loss": 0.7893,
93
+ "step": 15
94
+ },
95
+ {
96
+ "epoch": 7.666666666666667,
97
+ "eval_loss": 0.634563684463501,
98
+ "eval_runtime": 0.845,
99
+ "eval_samples_per_second": 3.55,
100
+ "eval_steps_per_second": 1.183,
101
+ "step": 15
102
+ },
103
+ {
104
+ "epoch": 7.666666666666667,
105
+ "step": 15,
106
+ "total_flos": 8305281071054848.0,
107
+ "train_loss": 0.9955663760503133,
108
+ "train_runtime": 87.8629,
109
+ "train_samples_per_second": 3.414,
110
+ "train_steps_per_second": 0.171
111
  }
112
  ],
113
  "logging_steps": 5,
114
+ "max_steps": 15,
115
  "num_input_tokens_seen": 0,
116
+ "num_train_epochs": 15,
117
  "save_steps": 1000,
118
  "stateful_callbacks": {
119
  "TrainerControl": {
 
127
  "attributes": {}
128
  }
129
  },
130
+ "total_flos": 8305281071054848.0,
131
  "train_batch_size": 4,
132
  "trial_name": null,
133
  "trial_params": null