CharlesLi committed on
Commit
c4ae0fc
·
verified ·
1 Parent(s): 00cc275

Model save

Browse files
Files changed (4) hide show
  1. README.md +6 -4
  2. all_results.json +6 -11
  3. train_results.json +6 -6
  4. trainer_state.json +57 -19
README.md CHANGED
@@ -3,7 +3,6 @@ library_name: peft
3
  license: apache-2.0
4
  base_model: mistralai/Mistral-7B-Instruct-v0.1
5
  tags:
6
- - alignment-handbook
7
  - trl
8
  - sft
9
  - generated_from_trainer
@@ -21,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
21
 
22
  This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) on the generator dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 0.8974
25
 
26
  ## Model description
27
 
@@ -50,13 +49,16 @@ The following hyperparameters were used during training:
50
  - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
51
  - lr_scheduler_type: cosine
52
  - lr_scheduler_warmup_ratio: 0.1
53
- - num_epochs: 1
54
 
55
  ### Training results
56
 
57
  | Training Loss | Epoch | Step | Validation Loss |
58
  |:-------------:|:-----:|:----:|:---------------:|
59
- | 1.8904 | 0.8 | 2 | 0.8974 |
 
 
 
60
 
61
 
62
  ### Framework versions
 
3
  license: apache-2.0
4
  base_model: mistralai/Mistral-7B-Instruct-v0.1
5
  tags:
 
6
  - trl
7
  - sft
8
  - generated_from_trainer
 
20
 
21
  This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 0.7150
24
 
25
  ## Model description
26
 
 
49
  - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
50
  - lr_scheduler_type: cosine
51
  - lr_scheduler_warmup_ratio: 0.1
52
+ - num_epochs: 5
53
 
54
  ### Training results
55
 
56
  | Training Loss | Epoch | Step | Validation Loss |
57
  |:-------------:|:-----:|:----:|:---------------:|
58
+ | 1.8904 | 1.0 | 3 | 0.8287 |
59
+ | 1.4592 | 2.0 | 6 | 0.7422 |
60
+ | 1.4592 | 3.0 | 9 | 0.7162 |
61
+ | 1.0477 | 3.4 | 10 | 0.7150 |
62
 
63
 
64
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
- "epoch": 0.8,
3
- "eval_loss": 0.8973897099494934,
4
- "eval_runtime": 0.9334,
5
- "eval_samples": 20,
6
- "eval_samples_per_second": 3.214,
7
- "eval_steps_per_second": 1.071,
8
- "total_flos": 1406258997362688.0,
9
- "train_loss": 1.9418977499008179,
10
- "train_runtime": 22.4337,
11
  "train_samples": 100,
12
- "train_samples_per_second": 0.892,
13
- "train_steps_per_second": 0.089
14
  }
 
1
  {
2
+ "epoch": 3.4,
3
+ "total_flos": 5976600671682560.0,
4
+ "train_loss": 1.2965626955032348,
5
+ "train_runtime": 74.9653,
 
 
 
 
 
6
  "train_samples": 100,
7
+ "train_samples_per_second": 1.334,
8
+ "train_steps_per_second": 0.133
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 0.8,
3
- "total_flos": 1406258997362688.0,
4
- "train_loss": 1.9418977499008179,
5
- "train_runtime": 22.4337,
6
  "train_samples": 100,
7
- "train_samples_per_second": 0.892,
8
- "train_steps_per_second": 0.089
9
  }
 
1
  {
2
+ "epoch": 3.4,
3
+ "total_flos": 5976600671682560.0,
4
+ "train_loss": 1.2965626955032348,
5
+ "train_runtime": 74.9653,
6
  "train_samples": 100,
7
+ "train_samples_per_second": 1.334,
8
+ "train_steps_per_second": 0.133
9
  }
trainer_state.json CHANGED
@@ -1,42 +1,80 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8,
5
  "eval_steps": 500,
6
- "global_step": 2,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.4,
13
- "grad_norm": 1.2139586210250854,
14
  "learning_rate": 0.0002,
15
  "loss": 1.8904,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.8,
20
- "eval_loss": 0.8973897099494934,
21
- "eval_runtime": 0.919,
22
- "eval_samples_per_second": 3.264,
23
- "eval_steps_per_second": 1.088,
24
- "step": 2
25
  },
26
  {
27
- "epoch": 0.8,
28
- "step": 2,
29
- "total_flos": 1406258997362688.0,
30
- "train_loss": 1.9418977499008179,
31
- "train_runtime": 22.4337,
32
- "train_samples_per_second": 0.892,
33
- "train_steps_per_second": 0.089
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  }
35
  ],
36
  "logging_steps": 5,
37
- "max_steps": 2,
38
  "num_input_tokens_seen": 0,
39
- "num_train_epochs": 1,
40
  "save_steps": 1000,
41
  "stateful_callbacks": {
42
  "TrainerControl": {
@@ -50,7 +88,7 @@
50
  "attributes": {}
51
  }
52
  },
53
- "total_flos": 1406258997362688.0,
54
  "train_batch_size": 4,
55
  "trial_name": null,
56
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.4,
5
  "eval_steps": 500,
6
+ "global_step": 10,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.4,
13
+ "grad_norm": 1.2139655351638794,
14
  "learning_rate": 0.0002,
15
  "loss": 1.8904,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 1.0,
20
+ "eval_loss": 0.8287366032600403,
21
+ "eval_runtime": 0.886,
22
+ "eval_samples_per_second": 3.386,
23
+ "eval_steps_per_second": 1.129,
24
+ "step": 3
25
  },
26
  {
27
+ "epoch": 1.8,
28
+ "grad_norm": 0.7967442274093628,
29
+ "learning_rate": 0.00011736481776669306,
30
+ "loss": 1.4592,
31
+ "step": 5
32
+ },
33
+ {
34
+ "epoch": 2.0,
35
+ "eval_loss": 0.7421586513519287,
36
+ "eval_runtime": 0.8866,
37
+ "eval_samples_per_second": 3.384,
38
+ "eval_steps_per_second": 1.128,
39
+ "step": 6
40
+ },
41
+ {
42
+ "epoch": 3.0,
43
+ "eval_loss": 0.7162426114082336,
44
+ "eval_runtime": 0.8865,
45
+ "eval_samples_per_second": 3.384,
46
+ "eval_steps_per_second": 1.128,
47
+ "step": 9
48
+ },
49
+ {
50
+ "epoch": 3.4,
51
+ "grad_norm": 0.5492648482322693,
52
+ "learning_rate": 0.0,
53
+ "loss": 1.0477,
54
+ "step": 10
55
+ },
56
+ {
57
+ "epoch": 3.4,
58
+ "eval_loss": 0.7149809002876282,
59
+ "eval_runtime": 0.9232,
60
+ "eval_samples_per_second": 3.25,
61
+ "eval_steps_per_second": 1.083,
62
+ "step": 10
63
+ },
64
+ {
65
+ "epoch": 3.4,
66
+ "step": 10,
67
+ "total_flos": 5976600671682560.0,
68
+ "train_loss": 1.2965626955032348,
69
+ "train_runtime": 74.9653,
70
+ "train_samples_per_second": 1.334,
71
+ "train_steps_per_second": 0.133
72
  }
73
  ],
74
  "logging_steps": 5,
75
+ "max_steps": 10,
76
  "num_input_tokens_seen": 0,
77
+ "num_train_epochs": 5,
78
  "save_steps": 1000,
79
  "stateful_callbacks": {
80
  "TrainerControl": {
 
88
  "attributes": {}
89
  }
90
  },
91
+ "total_flos": 5976600671682560.0,
92
  "train_batch_size": 4,
93
  "trial_name": null,
94
  "trial_params": null