Upload 10 files
Browse files- all_results.json +7 -0
- final/README.md +21 -0
- final/adapter_config.json +26 -0
- final/adapter_model.bin +3 -0
- final/training_args.bin +3 -0
- race_ft_alpaca_1_quality_2.json +30 -0
- runs/Dec30_17-59-51_u747/events.out.tfevents.1703930449.u747.874524.0 +3 -0
- train.log +45 -0
- train_results.json +7 -0
- trainer_state.json +175 -0
all_results.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"epoch": 1.0,
|
| 3 |
+
"train_loss": 0.4878490357171921,
|
| 4 |
+
"train_runtime": 6234.3462,
|
| 5 |
+
"train_samples_per_second": 0.405,
|
| 6 |
+
"train_steps_per_second": 0.034
|
| 7 |
+
}
|
final/README.md
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
library_name: peft
|
| 3 |
+
---
|
| 4 |
+
## Training procedure
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
The following `bitsandbytes` quantization config was used during training:
|
| 8 |
+
- quant_method: bitsandbytes
|
| 9 |
+
- load_in_8bit: False
|
| 10 |
+
- load_in_4bit: True
|
| 11 |
+
- llm_int8_threshold: 6.0
|
| 12 |
+
- llm_int8_skip_modules: None
|
| 13 |
+
- llm_int8_enable_fp32_cpu_offload: False
|
| 14 |
+
- llm_int8_has_fp16_weight: False
|
| 15 |
+
- bnb_4bit_quant_type: nf4
|
| 16 |
+
- bnb_4bit_use_double_quant: True
|
| 17 |
+
- bnb_4bit_compute_dtype: float16
|
| 18 |
+
### Framework versions
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
- PEFT 0.5.0
|
final/adapter_config.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"auto_mapping": null,
|
| 3 |
+
"base_model_name_or_path": "/data0/maqi/huggingface_models/option2-models/option2-race_ft_alpaca",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"fan_in_fan_out": false,
|
| 6 |
+
"inference_mode": true,
|
| 7 |
+
"init_lora_weights": true,
|
| 8 |
+
"layers_pattern": null,
|
| 9 |
+
"layers_to_transform": null,
|
| 10 |
+
"lora_alpha": 64,
|
| 11 |
+
"lora_dropout": 0.05,
|
| 12 |
+
"modules_to_save": null,
|
| 13 |
+
"peft_type": "LORA",
|
| 14 |
+
"r": 128,
|
| 15 |
+
"revision": null,
|
| 16 |
+
"target_modules": [
|
| 17 |
+
"k_proj",
|
| 18 |
+
"down_proj",
|
| 19 |
+
"o_proj",
|
| 20 |
+
"up_proj",
|
| 21 |
+
"v_proj",
|
| 22 |
+
"gate_proj",
|
| 23 |
+
"q_proj"
|
| 24 |
+
],
|
| 25 |
+
"task_type": "CAUSAL_LM"
|
| 26 |
+
}
|
final/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b95ac580582a8c312bc9322f60dec4156d7ae6b7abde810cf61bf835b6fe5366
|
| 3 |
+
size 1279424269
|
final/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:38f7ee24d5e0d3b7bc1dd6673b48027ffa9a246f24451593591ed366f994c60c
|
| 3 |
+
size 4091
|
race_ft_alpaca_1_quality_2.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"output_dir": "output/option-2/QuALITY/race_ft_alpaca_1_quality_2",
|
| 3 |
+
"model_name_or_path": "/data0/maqi/huggingface_models/option2-models/option2-race_ft_alpaca",
|
| 4 |
+
"train_file": "/data0/maqi/KGLQA-data/datasets/QuALITY/Caption/quality_caption_and_rel_instruct/train.jsonl",
|
| 5 |
+
"num_train_epochs": 1,
|
| 6 |
+
"per_device_train_batch_size": 6,
|
| 7 |
+
"gradient_accumulation_steps": 2,
|
| 8 |
+
"learning_rate": 1e-4,
|
| 9 |
+
"max_seq_length": 2048,
|
| 10 |
+
"logging_steps": 10,
|
| 11 |
+
"save_steps": 100,
|
| 12 |
+
"save_total_limit": 1,
|
| 13 |
+
"lr_scheduler_type": "constant_with_warmup",
|
| 14 |
+
"warmup_ratio": 0.1,
|
| 15 |
+
"lora_rank": 128,
|
| 16 |
+
"lora_alpha": 64,
|
| 17 |
+
"lora_dropout": 0.05,
|
| 18 |
+
|
| 19 |
+
"gradient_checkpointing": true,
|
| 20 |
+
"disable_tqdm": false,
|
| 21 |
+
"optim": "paged_adamw_32bit",
|
| 22 |
+
"seed": 318,
|
| 23 |
+
"fp16": true,
|
| 24 |
+
"report_to": "tensorboard",
|
| 25 |
+
"dataloader_num_workers": 10,
|
| 26 |
+
"save_strategy": "steps",
|
| 27 |
+
"weight_decay": 0,
|
| 28 |
+
"max_grad_norm": 0.3,
|
| 29 |
+
"remove_unused_columns": false
|
| 30 |
+
}
|
runs/Dec30_17-59-51_u747/events.out.tfevents.1703930449.u747.874524.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e99fd7919ea4ab5d608e6fec043cb7e8f1f56cc6fe40e5e2751193b448879718
|
| 3 |
+
size 9420
|
train.log
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 0 |
0%| | 0/210 [00:00<?, ?it/s]
|
| 1 |
0%| | 1/210 [00:33<1:58:19, 33.97s/it]
|
| 2 |
1%| | 2/210 [01:03<1:48:04, 31.18s/it]
|
| 3 |
1%|▏ | 3/210 [01:32<1:44:41, 30.34s/it]
|
| 4 |
2%|▏ | 4/210 [02:01<1:42:54, 29.97s/it]
|
| 5 |
2%|▏ | 5/210 [02:31<1:41:41, 29.76s/it]
|
| 6 |
3%|▎ | 6/210 [03:01<1:41:11, 29.76s/it]
|
| 7 |
3%|▎ | 7/210 [03:30<1:40:18, 29.65s/it]
|
| 8 |
4%|▍ | 8/210 [04:00<1:39:55, 29.68s/it]
|
| 9 |
4%|▍ | 9/210 [04:29<1:39:06, 29.58s/it]
|
| 10 |
5%|▍ | 10/210 [04:58<1:38:23, 29.52s/it]
|
| 11 |
|
|
|
|
| 12 |
5%|▍ | 10/210 [04:59<1:38:23, 29.52s/it]
|
| 13 |
5%|▌ | 11/210 [05:28<1:38:06, 29.58s/it]
|
| 14 |
6%|▌ | 12/210 [05:58<1:37:45, 29.62s/it]
|
| 15 |
6%|▌ | 13/210 [06:28<1:37:21, 29.65s/it]
|
| 16 |
7%|▋ | 14/210 [06:57<1:36:36, 29.57s/it]
|
| 17 |
7%|▋ | 15/210 [07:27<1:36:17, 29.63s/it]
|
| 18 |
8%|▊ | 16/210 [07:57<1:35:56, 29.67s/it]
|
| 19 |
8%|▊ | 17/210 [08:26<1:35:31, 29.70s/it]
|
| 20 |
9%|▊ | 18/210 [08:56<1:35:05, 29.71s/it]
|
| 21 |
9%|▉ | 19/210 [09:27<1:35:15, 29.92s/it]
|
| 22 |
10%|▉ | 20/210 [09:57<1:34:55, 29.97s/it]
|
| 23 |
|
|
|
|
| 24 |
10%|▉ | 20/210 [09:57<1:34:55, 29.97s/it]
|
| 25 |
10%|█ | 21/210 [10:26<1:33:54, 29.81s/it]
|
| 26 |
10%|█ | 22/210 [10:56<1:33:39, 29.89s/it]
|
| 27 |
11%|█ | 23/210 [11:26<1:32:43, 29.75s/it]
|
| 28 |
11%|█▏ | 24/210 [11:55<1:31:56, 29.66s/it]
|
| 29 |
12%|█▏ | 25/210 [12:24<1:31:14, 29.59s/it]
|
| 30 |
12%|█▏ | 26/210 [12:54<1:30:36, 29.55s/it]
|
| 31 |
13%|█▎ | 27/210 [13:24<1:30:16, 29.60s/it]
|
| 32 |
13%|█▎ | 28/210 [13:53<1:29:29, 29.50s/it]
|
| 33 |
14%|█▍ | 29/210 [14:23<1:29:11, 29.57s/it]
|
| 34 |
14%|█▍ | 30/210 [14:53<1:29:08, 29.72s/it]
|
| 35 |
|
|
|
|
| 36 |
14%|█▍ | 30/210 [14:53<1:29:08, 29.72s/it]
|
| 37 |
15%|█▍ | 31/210 [15:22<1:28:25, 29.64s/it]
|
| 38 |
15%|█▌ | 32/210 [15:52<1:28:16, 29.76s/it]
|
| 39 |
16%|█▌ | 33/210 [16:22<1:27:45, 29.75s/it]
|
| 40 |
16%|█▌ | 34/210 [16:51<1:26:58, 29.65s/it]
|
| 41 |
17%|█▋ | 35/210 [17:21<1:26:17, 29.59s/it]
|
| 42 |
17%|█▋ | 36/210 [17:50<1:25:39, 29.54s/it]
|
| 43 |
18%|█▊ | 37/210 [18:20<1:25:38, 29.70s/it]
|
| 44 |
18%|█▊ | 38/210 [18:50<1:24:54, 29.62s/it]
|
| 45 |
19%|█▊ | 39/210 [19:19<1:24:15, 29.56s/it]
|
| 46 |
19%|█▉ | 40/210 [19:49<1:24:10, 29.71s/it]
|
| 47 |
|
|
|
|
| 48 |
19%|█▉ | 40/210 [19:49<1:24:10, 29.71s/it]
|
| 49 |
20%|█▉ | 41/210 [20:19<1:23:43, 29.73s/it]
|
| 50 |
20%|██ | 42/210 [20:48<1:23:00, 29.64s/it]
|
| 51 |
20%|██ | 43/210 [21:18<1:22:36, 29.68s/it]
|
| 52 |
21%|██ | 44/210 [21:48<1:22:11, 29.71s/it]
|
| 53 |
21%|██▏ | 45/210 [22:18<1:22:00, 29.82s/it]
|
| 54 |
22%|██▏ | 46/210 [22:48<1:21:27, 29.80s/it]
|
| 55 |
22%|██▏ | 47/210 [23:18<1:20:56, 29.80s/it]
|
| 56 |
23%|██▎ | 48/210 [23:47<1:20:09, 29.69s/it]
|
| 57 |
23%|██▎ | 49/210 [24:16<1:19:28, 29.62s/it]
|
| 58 |
24%|██▍ | 50/210 [24:46<1:19:05, 29.66s/it]
|
| 59 |
|
|
|
|
| 60 |
24%|██▍ | 50/210 [24:46<1:19:05, 29.66s/it]
|
| 61 |
24%|██▍ | 51/210 [25:16<1:18:25, 29.60s/it]
|
| 62 |
25%|██▍ | 52/210 [25:45<1:17:48, 29.55s/it]
|
| 63 |
25%|██▌ | 53/210 [26:15<1:17:28, 29.61s/it]
|
| 64 |
26%|██▌ | 54/210 [26:44<1:16:50, 29.55s/it]
|
| 65 |
26%|██▌ | 55/210 [27:14<1:16:45, 29.71s/it]
|
| 66 |
27%|██▋ | 56/210 [27:44<1:16:18, 29.73s/it]
|
| 67 |
27%|██▋ | 57/210 [28:14<1:15:50, 29.74s/it]
|
| 68 |
28%|██▊ | 58/210 [28:44<1:15:22, 29.75s/it]
|
| 69 |
28%|██▊ | 59/210 [29:13<1:14:53, 29.76s/it]
|
| 70 |
29%|██▊ | 60/210 [29:43<1:14:09, 29.66s/it]
|
| 71 |
|
|
|
|
| 72 |
29%|██▊ | 60/210 [29:43<1:14:09, 29.66s/it]
|
| 73 |
29%|██▉ | 61/210 [30:12<1:13:31, 29.61s/it]
|
| 74 |
30%|██▉ | 62/210 [30:42<1:12:55, 29.56s/it]
|
| 75 |
30%|███ | 63/210 [31:12<1:12:35, 29.63s/it]
|
| 76 |
30%|███ | 64/210 [31:41<1:11:56, 29.56s/it]
|
| 77 |
31%|███ | 65/210 [32:10<1:11:23, 29.54s/it]
|
| 78 |
31%|███▏ | 66/210 [32:40<1:11:03, 29.61s/it]
|
| 79 |
32%|███▏ | 67/210 [33:10<1:10:27, 29.56s/it]
|
| 80 |
32%|███▏ | 68/210 [33:39<1:09:52, 29.53s/it]
|
| 81 |
33%|███▎ | 69/210 [34:09<1:09:19, 29.50s/it]
|
| 82 |
33%|███▎ | 70/210 [34:38<1:08:48, 29.49s/it]
|
| 83 |
|
|
|
|
| 84 |
33%|███▎ | 70/210 [34:38<1:08:48, 29.49s/it]
|
| 85 |
34%|███▍ | 71/210 [35:07<1:08:18, 29.48s/it]
|
| 86 |
34%|███▍ | 72/210 [35:37<1:07:47, 29.48s/it]
|
| 87 |
35%|███▍ | 73/210 [36:06<1:07:16, 29.46s/it]
|
| 88 |
35%|███▌ | 74/210 [36:36<1:07:00, 29.56s/it]
|
| 89 |
36%|███▌ | 75/210 [37:06<1:06:26, 29.53s/it]
|
| 90 |
36%|███▌ | 76/210 [37:35<1:06:05, 29.59s/it]
|
| 91 |
37%|███▋ | 77/210 [38:05<1:05:41, 29.64s/it]
|
| 92 |
37%|███▋ | 78/210 [38:35<1:05:03, 29.57s/it]
|
| 93 |
38%|███▊ | 79/210 [39:04<1:04:29, 29.54s/it]
|
| 94 |
38%|███▊ | 80/210 [39:33<1:03:55, 29.51s/it]
|
| 95 |
|
|
|
|
| 96 |
38%|███▊ | 80/210 [39:33<1:03:55, 29.51s/it]
|
| 97 |
39%|███▊ | 81/210 [40:03<1:03:34, 29.57s/it]
|
| 98 |
39%|███▉ | 82/210 [40:33<1:03:12, 29.63s/it]
|
| 99 |
40%|███▉ | 83/210 [41:03<1:02:47, 29.67s/it]
|
| 100 |
40%|████ | 84/210 [41:32<1:02:21, 29.70s/it]
|
| 101 |
40%|████ | 85/210 [42:02<1:01:42, 29.62s/it]
|
| 102 |
41%|████ | 86/210 [42:31<1:01:05, 29.56s/it]
|
| 103 |
41%|████▏ | 87/210 [43:01<1:00:32, 29.53s/it]
|
| 104 |
42%|████▏ | 88/210 [43:30<59:59, 29.51s/it]
|
| 105 |
42%|████▏ | 89/210 [44:00<59:38, 29.58s/it]
|
| 106 |
43%|████▎ | 90/210 [44:29<59:04, 29.53s/it]
|
| 107 |
|
|
|
|
| 108 |
43%|████▎ | 90/210 [44:29<59:04, 29.53s/it]
|
| 109 |
43%|████▎ | 91/210 [44:59<58:43, 29.61s/it]
|
| 110 |
44%|████▍ | 92/210 [45:29<58:18, 29.65s/it]
|
| 111 |
44%|████▍ | 93/210 [45:58<57:41, 29.59s/it]
|
| 112 |
45%|████▍ | 94/210 [46:28<57:29, 29.74s/it]
|
| 113 |
45%|████▌ | 95/210 [46:58<57:00, 29.74s/it]
|
| 114 |
46%|████▌ | 96/210 [47:28<56:19, 29.64s/it]
|
| 115 |
46%|████▌ | 97/210 [47:57<55:41, 29.57s/it]
|
| 116 |
47%|████▋ | 98/210 [48:27<55:18, 29.63s/it]
|
| 117 |
47%|████▋ | 99/210 [48:56<54:42, 29.57s/it]
|
| 118 |
48%|████▊ | 100/210 [49:26<54:18, 29.62s/it]
|
| 119 |
|
|
|
|
| 120 |
48%|████▊ | 100/210 [49:26<54:18, 29.62s/it]
|
| 121 |
48%|████▊ | 101/210 [49:59<55:56, 30.80s/it]
|
| 122 |
49%|████▊ | 102/210 [50:29<54:42, 30.39s/it]
|
| 123 |
49%|████▉ | 103/210 [50:59<53:50, 30.20s/it]
|
| 124 |
50%|████▉ | 104/210 [51:28<52:57, 29.98s/it]
|
| 125 |
50%|█████ | 105/210 [51:58<52:20, 29.91s/it]
|
| 126 |
50%|█████ | 106/210 [52:27<51:35, 29.77s/it]
|
| 127 |
51%|█████ | 107/210 [52:57<51:05, 29.77s/it]
|
| 128 |
51%|█████▏ | 108/210 [53:27<50:35, 29.76s/it]
|
| 129 |
52%|█████▏ | 109/210 [53:56<49:54, 29.65s/it]
|
| 130 |
52%|█████▏ | 110/210 [54:26<49:19, 29.59s/it]
|
| 131 |
|
|
|
|
| 132 |
52%|█████▏ | 110/210 [54:26<49:19, 29.59s/it]
|
| 133 |
53%|█████▎ | 111/210 [54:55<48:54, 29.64s/it]
|
| 134 |
53%|█████▎ | 112/210 [55:25<48:16, 29.56s/it]
|
| 135 |
54%|█████▍ | 113/210 [55:54<47:44, 29.53s/it]
|
| 136 |
54%|█████▍ | 114/210 [56:24<47:21, 29.59s/it]
|
| 137 |
55%|█████▍ | 115/210 [56:53<46:46, 29.54s/it]
|
| 138 |
55%|█████▌ | 116/210 [57:24<46:31, 29.70s/it]
|
| 139 |
56%|█████▌ | 117/210 [57:53<46:03, 29.71s/it]
|
| 140 |
56%|█████▌ | 118/210 [58:23<45:25, 29.63s/it]
|
| 141 |
57%|█████▋ | 119/210 [58:52<44:59, 29.66s/it]
|
| 142 |
57%|█████▋ | 120/210 [59:22<44:23, 29.59s/it]
|
| 143 |
|
|
|
|
| 144 |
57%|█████▋ | 120/210 [59:22<44:23, 29.59s/it]
|
| 145 |
58%|█████▊ | 121/210 [59:52<43:55, 29.61s/it]
|
| 146 |
58%|█████▊ | 122/210 [1:00:21<43:29, 29.65s/it]
|
| 147 |
59%|█████▊ | 123/210 [1:00:51<42:53, 29.58s/it]
|
| 148 |
59%|█████▉ | 124/210 [1:01:20<42:19, 29.53s/it]
|
| 149 |
60%|█████▉ | 125/210 [1:01:50<41:56, 29.60s/it]
|
| 150 |
60%|██████ | 126/210 [1:02:19<41:21, 29.54s/it]
|
| 151 |
60%|██████ | 127/210 [1:02:49<40:48, 29.50s/it]
|
| 152 |
61%|██████ | 128/210 [1:03:18<40:19, 29.50s/it]
|
| 153 |
61%|██████▏ | 129/210 [1:03:48<39:55, 29.58s/it]
|
| 154 |
62%|██████▏ | 130/210 [1:04:17<39:22, 29.53s/it]
|
| 155 |
|
|
|
|
| 156 |
62%|██████▏ | 130/210 [1:04:17<39:22, 29.53s/it]
|
| 157 |
62%|██████▏ | 131/210 [1:04:47<38:49, 29.49s/it]
|
| 158 |
63%|██████▎ | 132/210 [1:05:16<38:18, 29.47s/it]
|
| 159 |
63%|██████▎ | 133/210 [1:05:46<37:47, 29.44s/it]
|
| 160 |
64%|██████▍ | 134/210 [1:06:15<37:16, 29.43s/it]
|
| 161 |
64%|██████▍ | 135/210 [1:06:45<36:53, 29.52s/it]
|
| 162 |
65%|██████▍ | 136/210 [1:07:14<36:21, 29.49s/it]
|
| 163 |
65%|██████▌ | 137/210 [1:07:44<35:51, 29.47s/it]
|
| 164 |
66%|██████▌ | 138/210 [1:08:13<35:27, 29.55s/it]
|
| 165 |
66%|██████▌ | 139/210 [1:08:43<35:02, 29.61s/it]
|
| 166 |
67%|██████▋ | 140/210 [1:09:13<34:35, 29.65s/it]
|
| 167 |
|
|
|
|
| 168 |
67%|██████▋ | 140/210 [1:09:13<34:35, 29.65s/it]
|
| 169 |
67%|██████▋ | 141/210 [1:09:42<34:07, 29.67s/it]
|
| 170 |
68%|██████▊ | 142/210 [1:10:12<33:39, 29.69s/it]
|
| 171 |
68%|██████▊ | 143/210 [1:10:42<33:16, 29.80s/it]
|
| 172 |
69%|██████▊ | 144/210 [1:11:12<32:39, 29.69s/it]
|
| 173 |
69%|██████▉ | 145/210 [1:11:41<32:10, 29.70s/it]
|
| 174 |
70%|██████▉ | 146/210 [1:12:11<31:35, 29.62s/it]
|
| 175 |
70%|███████ | 147/210 [1:12:40<31:02, 29.56s/it]
|
| 176 |
70%|███████ | 148/210 [1:13:10<30:30, 29.52s/it]
|
| 177 |
71%|███████ | 149/210 [1:13:39<29:58, 29.49s/it]
|
| 178 |
71%|███████▏ | 150/210 [1:14:09<29:28, 29.47s/it]
|
| 179 |
|
|
|
|
| 180 |
71%|███████▏ | 150/210 [1:14:09<29:28, 29.47s/it]
|
| 181 |
72%|███████▏ | 151/210 [1:14:39<29:08, 29.64s/it]
|
| 182 |
72%|███████▏ | 152/210 [1:15:08<28:35, 29.58s/it]
|
| 183 |
73%|███████▎ | 153/210 [1:15:38<28:14, 29.73s/it]
|
| 184 |
73%|███████▎ | 154/210 [1:16:07<27:39, 29.63s/it]
|
| 185 |
74%|███████▍ | 155/210 [1:16:37<27:10, 29.65s/it]
|
| 186 |
74%|███████▍ | 156/210 [1:17:07<26:37, 29.58s/it]
|
| 187 |
75%|███████▍ | 157/210 [1:17:36<26:04, 29.52s/it]
|
| 188 |
75%|███████▌ | 158/210 [1:18:06<25:36, 29.55s/it]
|
| 189 |
76%|███████▌ | 159/210 [1:18:35<25:09, 29.60s/it]
|
| 190 |
76%|███████▌ | 160/210 [1:19:05<24:42, 29.65s/it]
|
| 191 |
|
|
|
|
| 192 |
76%|███████▌ | 160/210 [1:19:05<24:42, 29.65s/it]
|
| 193 |
77%|███████▋ | 161/210 [1:19:35<24:14, 29.68s/it]
|
| 194 |
77%|███████▋ | 162/210 [1:20:04<23:41, 29.60s/it]
|
| 195 |
78%|███████▊ | 163/210 [1:20:34<23:12, 29.64s/it]
|
| 196 |
78%|███████▊ | 164/210 [1:21:04<22:44, 29.66s/it]
|
| 197 |
79%|███████▊ | 165/210 [1:21:33<22:15, 29.67s/it]
|
| 198 |
79%|███████▉ | 166/210 [1:22:03<21:40, 29.56s/it]
|
| 199 |
80%|███████▉ | 167/210 [1:22:32<21:08, 29.51s/it]
|
| 200 |
80%|████████ | 168/210 [1:23:01<20:38, 29.48s/it]
|
| 201 |
80%|████████ | 169/210 [1:23:31<20:11, 29.56s/it]
|
| 202 |
81%|████████ | 170/210 [1:24:01<19:40, 29.51s/it]
|
| 203 |
|
|
|
|
| 204 |
81%|████████ | 170/210 [1:24:01<19:40, 29.51s/it]
|
| 205 |
81%|████████▏ | 171/210 [1:24:30<19:13, 29.57s/it]
|
| 206 |
82%|████████▏ | 172/210 [1:25:00<18:42, 29.53s/it]
|
| 207 |
82%|████████▏ | 173/210 [1:25:30<18:18, 29.68s/it]
|
| 208 |
83%|████████▎ | 174/210 [1:25:59<17:45, 29.61s/it]
|
| 209 |
83%|████████▎ | 175/210 [1:26:29<17:14, 29.54s/it]
|
| 210 |
84%|████████▍ | 176/210 [1:26:58<16:43, 29.51s/it]
|
| 211 |
84%|████████▍ | 177/210 [1:27:28<16:15, 29.57s/it]
|
| 212 |
85%|████████▍ | 178/210 [1:27:57<15:44, 29.52s/it]
|
| 213 |
85%|████████▌ | 179/210 [1:28:27<15:13, 29.48s/it]
|
| 214 |
86%|████████▌ | 180/210 [1:28:56<14:43, 29.44s/it]
|
| 215 |
|
|
|
|
| 216 |
86%|████████▌ | 180/210 [1:28:56<14:43, 29.44s/it]
|
| 217 |
86%|████████▌ | 181/210 [1:29:26<14:16, 29.52s/it]
|
| 218 |
87%|████████▋ | 182/210 [1:29:55<13:45, 29.48s/it]
|
| 219 |
87%|████████▋ | 183/210 [1:30:25<13:17, 29.55s/it]
|
| 220 |
88%|████████▊ | 184/210 [1:30:54<12:49, 29.60s/it]
|
| 221 |
88%|████████▊ | 185/210 [1:31:24<12:18, 29.54s/it]
|
| 222 |
89%|████████▊ | 186/210 [1:31:53<11:47, 29.50s/it]
|
| 223 |
89%|████████▉ | 187/210 [1:32:23<11:17, 29.47s/it]
|
| 224 |
90%|████████▉ | 188/210 [1:32:52<10:49, 29.54s/it]
|
| 225 |
90%|█████████ | 189/210 [1:33:22<10:21, 29.60s/it]
|
| 226 |
90%|█████████ | 190/210 [1:33:52<09:52, 29.64s/it]
|
| 227 |
|
|
|
|
| 228 |
90%|█████████ | 190/210 [1:33:52<09:52, 29.64s/it]
|
| 229 |
91%|█████████ | 191/210 [1:34:22<09:25, 29.76s/it]
|
| 230 |
91%|█████████▏| 192/210 [1:34:52<08:55, 29.75s/it]
|
| 231 |
92%|█████████▏| 193/210 [1:35:21<08:25, 29.75s/it]
|
| 232 |
92%|█████████▏| 194/210 [1:35:51<07:55, 29.74s/it]
|
| 233 |
93%|█████████▎| 195/210 [1:36:20<07:24, 29.65s/it]
|
| 234 |
93%|█████████▎| 196/210 [1:36:50<06:55, 29.67s/it]
|
| 235 |
94%|█████████▍| 197/210 [1:37:20<06:27, 29.79s/it]
|
| 236 |
94%|█████████▍| 198/210 [1:37:50<05:56, 29.71s/it]
|
| 237 |
95%|█████████▍| 199/210 [1:38:19<05:25, 29.61s/it]
|
| 238 |
95%|█████████▌| 200/210 [1:38:49<04:55, 29.55s/it]
|
| 239 |
|
|
|
|
| 240 |
95%|█████████▌| 200/210 [1:38:49<04:55, 29.55s/it]
|
| 241 |
96%|█████████▌| 201/210 [1:39:22<04:36, 30.70s/it]
|
| 242 |
96%|█████████▌| 202/210 [1:39:51<04:02, 30.31s/it]
|
| 243 |
97%|█████████▋| 203/210 [1:40:21<03:30, 30.05s/it]
|
| 244 |
97%|█████████▋| 204/210 [1:40:50<02:59, 29.86s/it]
|
| 245 |
98%|█████████▊| 205/210 [1:41:20<02:28, 29.73s/it]
|
| 246 |
98%|█████████▊| 206/210 [1:41:49<01:58, 29.64s/it]
|
| 247 |
99%|█████████▊| 207/210 [1:42:19<01:28, 29.58s/it]
|
| 248 |
99%|█████████▉| 208/210 [1:42:48<00:59, 29.53s/it]
|
| 249 |
|
|
|
|
| 250 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model training desc: initialize model training...
|
| 2 |
+
2023-12-30 17:59:51.341 | INFO | __main__:init_components:108 - Initializing components...
|
| 3 |
+
|
| 4 |
+
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
|
| 5 |
+
2023-12-30 18:00:03.415 | INFO | __main__:init_components:155 -
|
| 6 |
+
|
| 7 |
+
2023-12-30 18:00:03.415 | INFO | __main__:init_components:156 - ********************
|
| 8 |
+
2023-12-30 18:00:03.415 | INFO | __main__:init_components:157 - using llama2 model
|
| 9 |
+
2023-12-30 18:00:03.415 | INFO | __main__:init_components:158 - ********************
|
| 10 |
+
2023-12-30 18:00:03.415 | INFO | __main__:init_components:159 -
|
| 11 |
+
|
| 12 |
+
memory footprint of model: 4.024436950683594 GB
|
| 13 |
+
trainable params: 319,815,680 || all params: 7,058,231,296 || trainable%: 4.531102291607305
|
| 14 |
+
2023-12-30 18:00:48.703 | INFO | component.dataset:__init__:14 - Loading data: /data0/maqi/KGLQA-data/datasets/QuALITY/Caption/quality_caption_and_rel_instruct/train.jsonl
|
| 15 |
+
2023-12-30 18:00:48.807 | INFO | component.dataset:__init__:19 - there are 2523 data in dataset
|
| 16 |
+
2023-12-30 18:00:49.225 | INFO | __main__:main:231 - *** starting training ***
|
| 17 |
+
|
| 18 |
0%| | 0/210 [00:00<?, ?it/s]
|
| 19 |
0%| | 1/210 [00:33<1:58:19, 33.97s/it]
|
| 20 |
1%| | 2/210 [01:03<1:48:04, 31.18s/it]
|
| 21 |
1%|▏ | 3/210 [01:32<1:44:41, 30.34s/it]
|
| 22 |
2%|▏ | 4/210 [02:01<1:42:54, 29.97s/it]
|
| 23 |
2%|▏ | 5/210 [02:31<1:41:41, 29.76s/it]
|
| 24 |
3%|▎ | 6/210 [03:01<1:41:11, 29.76s/it]
|
| 25 |
3%|▎ | 7/210 [03:30<1:40:18, 29.65s/it]
|
| 26 |
4%|▍ | 8/210 [04:00<1:39:55, 29.68s/it]
|
| 27 |
4%|▍ | 9/210 [04:29<1:39:06, 29.58s/it]
|
| 28 |
5%|▍ | 10/210 [04:58<1:38:23, 29.52s/it]
|
| 29 |
|
| 30 |
+
|
| 31 |
5%|▍ | 10/210 [04:59<1:38:23, 29.52s/it]
|
| 32 |
5%|▌ | 11/210 [05:28<1:38:06, 29.58s/it]
|
| 33 |
6%|▌ | 12/210 [05:58<1:37:45, 29.62s/it]
|
| 34 |
6%|▌ | 13/210 [06:28<1:37:21, 29.65s/it]
|
| 35 |
7%|▋ | 14/210 [06:57<1:36:36, 29.57s/it]
|
| 36 |
7%|▋ | 15/210 [07:27<1:36:17, 29.63s/it]
|
| 37 |
8%|▊ | 16/210 [07:57<1:35:56, 29.67s/it]
|
| 38 |
8%|▊ | 17/210 [08:26<1:35:31, 29.70s/it]
|
| 39 |
9%|▊ | 18/210 [08:56<1:35:05, 29.71s/it]
|
| 40 |
9%|▉ | 19/210 [09:27<1:35:15, 29.92s/it]
|
| 41 |
10%|▉ | 20/210 [09:57<1:34:55, 29.97s/it]
|
| 42 |
|
| 43 |
+
|
| 44 |
10%|▉ | 20/210 [09:57<1:34:55, 29.97s/it]
|
| 45 |
10%|█ | 21/210 [10:26<1:33:54, 29.81s/it]
|
| 46 |
10%|█ | 22/210 [10:56<1:33:39, 29.89s/it]
|
| 47 |
11%|█ | 23/210 [11:26<1:32:43, 29.75s/it]
|
| 48 |
11%|█▏ | 24/210 [11:55<1:31:56, 29.66s/it]
|
| 49 |
12%|█▏ | 25/210 [12:24<1:31:14, 29.59s/it]
|
| 50 |
12%|█▏ | 26/210 [12:54<1:30:36, 29.55s/it]
|
| 51 |
13%|█▎ | 27/210 [13:24<1:30:16, 29.60s/it]
|
| 52 |
13%|█▎ | 28/210 [13:53<1:29:29, 29.50s/it]
|
| 53 |
14%|█▍ | 29/210 [14:23<1:29:11, 29.57s/it]
|
| 54 |
14%|█▍ | 30/210 [14:53<1:29:08, 29.72s/it]
|
| 55 |
|
| 56 |
+
|
| 57 |
14%|█▍ | 30/210 [14:53<1:29:08, 29.72s/it]
|
| 58 |
15%|█▍ | 31/210 [15:22<1:28:25, 29.64s/it]
|
| 59 |
15%|█▌ | 32/210 [15:52<1:28:16, 29.76s/it]
|
| 60 |
16%|█▌ | 33/210 [16:22<1:27:45, 29.75s/it]
|
| 61 |
16%|█▌ | 34/210 [16:51<1:26:58, 29.65s/it]
|
| 62 |
17%|█▋ | 35/210 [17:21<1:26:17, 29.59s/it]
|
| 63 |
17%|█▋ | 36/210 [17:50<1:25:39, 29.54s/it]
|
| 64 |
18%|█▊ | 37/210 [18:20<1:25:38, 29.70s/it]
|
| 65 |
18%|█▊ | 38/210 [18:50<1:24:54, 29.62s/it]
|
| 66 |
19%|█▊ | 39/210 [19:19<1:24:15, 29.56s/it]
|
| 67 |
19%|█▉ | 40/210 [19:49<1:24:10, 29.71s/it]
|
| 68 |
|
| 69 |
+
|
| 70 |
19%|█▉ | 40/210 [19:49<1:24:10, 29.71s/it]
|
| 71 |
20%|█▉ | 41/210 [20:19<1:23:43, 29.73s/it]
|
| 72 |
20%|██ | 42/210 [20:48<1:23:00, 29.64s/it]
|
| 73 |
20%|██ | 43/210 [21:18<1:22:36, 29.68s/it]
|
| 74 |
21%|██ | 44/210 [21:48<1:22:11, 29.71s/it]
|
| 75 |
21%|██▏ | 45/210 [22:18<1:22:00, 29.82s/it]
|
| 76 |
22%|██▏ | 46/210 [22:48<1:21:27, 29.80s/it]
|
| 77 |
22%|██▏ | 47/210 [23:18<1:20:56, 29.80s/it]
|
| 78 |
23%|██▎ | 48/210 [23:47<1:20:09, 29.69s/it]
|
| 79 |
23%|██▎ | 49/210 [24:16<1:19:28, 29.62s/it]
|
| 80 |
24%|██▍ | 50/210 [24:46<1:19:05, 29.66s/it]
|
| 81 |
|
| 82 |
+
|
| 83 |
24%|██▍ | 50/210 [24:46<1:19:05, 29.66s/it]
|
| 84 |
24%|██▍ | 51/210 [25:16<1:18:25, 29.60s/it]
|
| 85 |
25%|██▍ | 52/210 [25:45<1:17:48, 29.55s/it]
|
| 86 |
25%|██▌ | 53/210 [26:15<1:17:28, 29.61s/it]
|
| 87 |
26%|██▌ | 54/210 [26:44<1:16:50, 29.55s/it]
|
| 88 |
26%|██▌ | 55/210 [27:14<1:16:45, 29.71s/it]
|
| 89 |
27%|██▋ | 56/210 [27:44<1:16:18, 29.73s/it]
|
| 90 |
27%|██▋ | 57/210 [28:14<1:15:50, 29.74s/it]
|
| 91 |
28%|██▊ | 58/210 [28:44<1:15:22, 29.75s/it]
|
| 92 |
28%|██▊ | 59/210 [29:13<1:14:53, 29.76s/it]
|
| 93 |
29%|██▊ | 60/210 [29:43<1:14:09, 29.66s/it]
|
| 94 |
|
| 95 |
+
|
| 96 |
29%|██▊ | 60/210 [29:43<1:14:09, 29.66s/it]
|
| 97 |
29%|██▉ | 61/210 [30:12<1:13:31, 29.61s/it]
|
| 98 |
30%|██▉ | 62/210 [30:42<1:12:55, 29.56s/it]
|
| 99 |
30%|███ | 63/210 [31:12<1:12:35, 29.63s/it]
|
| 100 |
30%|███ | 64/210 [31:41<1:11:56, 29.56s/it]
|
| 101 |
31%|███ | 65/210 [32:10<1:11:23, 29.54s/it]
|
| 102 |
31%|███▏ | 66/210 [32:40<1:11:03, 29.61s/it]
|
| 103 |
32%|███▏ | 67/210 [33:10<1:10:27, 29.56s/it]
|
| 104 |
32%|███▏ | 68/210 [33:39<1:09:52, 29.53s/it]
|
| 105 |
33%|███▎ | 69/210 [34:09<1:09:19, 29.50s/it]
|
| 106 |
33%|███▎ | 70/210 [34:38<1:08:48, 29.49s/it]
|
| 107 |
|
| 108 |
+
|
| 109 |
33%|███▎ | 70/210 [34:38<1:08:48, 29.49s/it]
|
| 110 |
34%|███▍ | 71/210 [35:07<1:08:18, 29.48s/it]
|
| 111 |
34%|███▍ | 72/210 [35:37<1:07:47, 29.48s/it]
|
| 112 |
35%|███▍ | 73/210 [36:06<1:07:16, 29.46s/it]
|
| 113 |
35%|███▌ | 74/210 [36:36<1:07:00, 29.56s/it]
|
| 114 |
36%|███▌ | 75/210 [37:06<1:06:26, 29.53s/it]
|
| 115 |
36%|███▌ | 76/210 [37:35<1:06:05, 29.59s/it]
|
| 116 |
37%|███▋ | 77/210 [38:05<1:05:41, 29.64s/it]
|
| 117 |
37%|███▋ | 78/210 [38:35<1:05:03, 29.57s/it]
|
| 118 |
38%|███▊ | 79/210 [39:04<1:04:29, 29.54s/it]
|
| 119 |
38%|███▊ | 80/210 [39:33<1:03:55, 29.51s/it]
|
| 120 |
|
| 121 |
+
|
| 122 |
38%|███▊ | 80/210 [39:33<1:03:55, 29.51s/it]
|
| 123 |
39%|███▊ | 81/210 [40:03<1:03:34, 29.57s/it]
|
| 124 |
39%|███▉ | 82/210 [40:33<1:03:12, 29.63s/it]
|
| 125 |
40%|███▉ | 83/210 [41:03<1:02:47, 29.67s/it]
|
| 126 |
40%|████ | 84/210 [41:32<1:02:21, 29.70s/it]
|
| 127 |
40%|████ | 85/210 [42:02<1:01:42, 29.62s/it]
|
| 128 |
41%|████ | 86/210 [42:31<1:01:05, 29.56s/it]
|
| 129 |
41%|████▏ | 87/210 [43:01<1:00:32, 29.53s/it]
|
| 130 |
42%|████▏ | 88/210 [43:30<59:59, 29.51s/it]
|
| 131 |
42%|████▏ | 89/210 [44:00<59:38, 29.58s/it]
|
| 132 |
43%|████▎ | 90/210 [44:29<59:04, 29.53s/it]
|
| 133 |
|
| 134 |
+
|
| 135 |
43%|████▎ | 90/210 [44:29<59:04, 29.53s/it]
|
| 136 |
43%|████▎ | 91/210 [44:59<58:43, 29.61s/it]
|
| 137 |
44%|████▍ | 92/210 [45:29<58:18, 29.65s/it]
|
| 138 |
44%|████▍ | 93/210 [45:58<57:41, 29.59s/it]
|
| 139 |
45%|████▍ | 94/210 [46:28<57:29, 29.74s/it]
|
| 140 |
45%|████▌ | 95/210 [46:58<57:00, 29.74s/it]
|
| 141 |
46%|████▌ | 96/210 [47:28<56:19, 29.64s/it]
|
| 142 |
46%|████▌ | 97/210 [47:57<55:41, 29.57s/it]
|
| 143 |
47%|████▋ | 98/210 [48:27<55:18, 29.63s/it]
|
| 144 |
47%|████▋ | 99/210 [48:56<54:42, 29.57s/it]
|
| 145 |
48%|████▊ | 100/210 [49:26<54:18, 29.62s/it]
|
| 146 |
|
| 147 |
+
|
| 148 |
48%|████▊ | 100/210 [49:26<54:18, 29.62s/it]
|
| 149 |
48%|████▊ | 101/210 [49:59<55:56, 30.80s/it]
|
| 150 |
49%|████▊ | 102/210 [50:29<54:42, 30.39s/it]
|
| 151 |
49%|████▉ | 103/210 [50:59<53:50, 30.20s/it]
|
| 152 |
50%|████▉ | 104/210 [51:28<52:57, 29.98s/it]
|
| 153 |
50%|█████ | 105/210 [51:58<52:20, 29.91s/it]
|
| 154 |
50%|█████ | 106/210 [52:27<51:35, 29.77s/it]
|
| 155 |
51%|█████ | 107/210 [52:57<51:05, 29.77s/it]
|
| 156 |
51%|█████▏ | 108/210 [53:27<50:35, 29.76s/it]
|
| 157 |
52%|█████▏ | 109/210 [53:56<49:54, 29.65s/it]
|
| 158 |
52%|█████▏ | 110/210 [54:26<49:19, 29.59s/it]
|
| 159 |
|
| 160 |
+
|
| 161 |
52%|█████▏ | 110/210 [54:26<49:19, 29.59s/it]
|
| 162 |
53%|█████▎ | 111/210 [54:55<48:54, 29.64s/it]
|
| 163 |
53%|█████▎ | 112/210 [55:25<48:16, 29.56s/it]
|
| 164 |
54%|█████▍ | 113/210 [55:54<47:44, 29.53s/it]
|
| 165 |
54%|█████▍ | 114/210 [56:24<47:21, 29.59s/it]
|
| 166 |
55%|█████▍ | 115/210 [56:53<46:46, 29.54s/it]
|
| 167 |
55%|█████▌ | 116/210 [57:24<46:31, 29.70s/it]
|
| 168 |
56%|█████▌ | 117/210 [57:53<46:03, 29.71s/it]
|
| 169 |
56%|█████▌ | 118/210 [58:23<45:25, 29.63s/it]
|
| 170 |
57%|█████▋ | 119/210 [58:52<44:59, 29.66s/it]
|
| 171 |
57%|█████▋ | 120/210 [59:22<44:23, 29.59s/it]
|
| 172 |
|
| 173 |
+
|
| 174 |
57%|█████▋ | 120/210 [59:22<44:23, 29.59s/it]
|
| 175 |
58%|█████▊ | 121/210 [59:52<43:55, 29.61s/it]
|
| 176 |
58%|█████▊ | 122/210 [1:00:21<43:29, 29.65s/it]
|
| 177 |
59%|█████▊ | 123/210 [1:00:51<42:53, 29.58s/it]
|
| 178 |
59%|█████▉ | 124/210 [1:01:20<42:19, 29.53s/it]
|
| 179 |
60%|█████▉ | 125/210 [1:01:50<41:56, 29.60s/it]
|
| 180 |
60%|██████ | 126/210 [1:02:19<41:21, 29.54s/it]
|
| 181 |
60%|██████ | 127/210 [1:02:49<40:48, 29.50s/it]
|
| 182 |
61%|██████ | 128/210 [1:03:18<40:19, 29.50s/it]
|
| 183 |
61%|██████▏ | 129/210 [1:03:48<39:55, 29.58s/it]
|
| 184 |
62%|██████▏ | 130/210 [1:04:17<39:22, 29.53s/it]
|
| 185 |
|
| 186 |
+
|
| 187 |
62%|██████▏ | 130/210 [1:04:17<39:22, 29.53s/it]
|
| 188 |
62%|██████▏ | 131/210 [1:04:47<38:49, 29.49s/it]
|
| 189 |
63%|██████▎ | 132/210 [1:05:16<38:18, 29.47s/it]
|
| 190 |
63%|██████▎ | 133/210 [1:05:46<37:47, 29.44s/it]
|
| 191 |
64%|██████▍ | 134/210 [1:06:15<37:16, 29.43s/it]
|
| 192 |
64%|██████▍ | 135/210 [1:06:45<36:53, 29.52s/it]
|
| 193 |
65%|██████▍ | 136/210 [1:07:14<36:21, 29.49s/it]
|
| 194 |
65%|██████▌ | 137/210 [1:07:44<35:51, 29.47s/it]
|
| 195 |
66%|██████▌ | 138/210 [1:08:13<35:27, 29.55s/it]
|
| 196 |
66%|██████▌ | 139/210 [1:08:43<35:02, 29.61s/it]
|
| 197 |
67%|██████▋ | 140/210 [1:09:13<34:35, 29.65s/it]
|
| 198 |
|
| 199 |
+
|
| 200 |
67%|██████▋ | 140/210 [1:09:13<34:35, 29.65s/it]
|
| 201 |
67%|██████▋ | 141/210 [1:09:42<34:07, 29.67s/it]
|
| 202 |
68%|██████▊ | 142/210 [1:10:12<33:39, 29.69s/it]
|
| 203 |
68%|██████▊ | 143/210 [1:10:42<33:16, 29.80s/it]
|
| 204 |
69%|██████▊ | 144/210 [1:11:12<32:39, 29.69s/it]
|
| 205 |
69%|██████▉ | 145/210 [1:11:41<32:10, 29.70s/it]
|
| 206 |
70%|██████▉ | 146/210 [1:12:11<31:35, 29.62s/it]
|
| 207 |
70%|███████ | 147/210 [1:12:40<31:02, 29.56s/it]
|
| 208 |
70%|███████ | 148/210 [1:13:10<30:30, 29.52s/it]
|
| 209 |
71%|███████ | 149/210 [1:13:39<29:58, 29.49s/it]
|
| 210 |
71%|███████▏ | 150/210 [1:14:09<29:28, 29.47s/it]
|
| 211 |
|
| 212 |
+
|
| 213 |
71%|███████▏ | 150/210 [1:14:09<29:28, 29.47s/it]
|
| 214 |
72%|███████▏ | 151/210 [1:14:39<29:08, 29.64s/it]
|
| 215 |
72%|███████▏ | 152/210 [1:15:08<28:35, 29.58s/it]
|
| 216 |
73%|███████▎ | 153/210 [1:15:38<28:14, 29.73s/it]
|
| 217 |
73%|███████▎ | 154/210 [1:16:07<27:39, 29.63s/it]
|
| 218 |
74%|███████▍ | 155/210 [1:16:37<27:10, 29.65s/it]
|
| 219 |
74%|███████▍ | 156/210 [1:17:07<26:37, 29.58s/it]
|
| 220 |
75%|███████▍ | 157/210 [1:17:36<26:04, 29.52s/it]
|
| 221 |
75%|███████▌ | 158/210 [1:18:06<25:36, 29.55s/it]
|
| 222 |
76%|███████▌ | 159/210 [1:18:35<25:09, 29.60s/it]
|
| 223 |
76%|███████▌ | 160/210 [1:19:05<24:42, 29.65s/it]
|
| 224 |
|
| 225 |
+
|
| 226 |
76%|███████▌ | 160/210 [1:19:05<24:42, 29.65s/it]
|
| 227 |
77%|███████▋ | 161/210 [1:19:35<24:14, 29.68s/it]
|
| 228 |
77%|███████▋ | 162/210 [1:20:04<23:41, 29.60s/it]
|
| 229 |
78%|███████▊ | 163/210 [1:20:34<23:12, 29.64s/it]
|
| 230 |
78%|███████▊ | 164/210 [1:21:04<22:44, 29.66s/it]
|
| 231 |
79%|███████▊ | 165/210 [1:21:33<22:15, 29.67s/it]
|
| 232 |
79%|███████▉ | 166/210 [1:22:03<21:40, 29.56s/it]
|
| 233 |
80%|███████▉ | 167/210 [1:22:32<21:08, 29.51s/it]
|
| 234 |
80%|████████ | 168/210 [1:23:01<20:38, 29.48s/it]
|
| 235 |
80%|████████ | 169/210 [1:23:31<20:11, 29.56s/it]
|
| 236 |
81%|████████ | 170/210 [1:24:01<19:40, 29.51s/it]
|
| 237 |
|
| 238 |
+
|
| 239 |
81%|████████ | 170/210 [1:24:01<19:40, 29.51s/it]
|
| 240 |
81%|████████▏ | 171/210 [1:24:30<19:13, 29.57s/it]
|
| 241 |
82%|████████▏ | 172/210 [1:25:00<18:42, 29.53s/it]
|
| 242 |
82%|████████▏ | 173/210 [1:25:30<18:18, 29.68s/it]
|
| 243 |
83%|████████▎ | 174/210 [1:25:59<17:45, 29.61s/it]
|
| 244 |
83%|████████▎ | 175/210 [1:26:29<17:14, 29.54s/it]
|
| 245 |
84%|████████▍ | 176/210 [1:26:58<16:43, 29.51s/it]
|
| 246 |
84%|████████▍ | 177/210 [1:27:28<16:15, 29.57s/it]
|
| 247 |
85%|████████▍ | 178/210 [1:27:57<15:44, 29.52s/it]
|
| 248 |
85%|████████▌ | 179/210 [1:28:27<15:13, 29.48s/it]
|
| 249 |
86%|████████▌ | 180/210 [1:28:56<14:43, 29.44s/it]
|
| 250 |
|
| 251 |
+
|
| 252 |
86%|████████▌ | 180/210 [1:28:56<14:43, 29.44s/it]
|
| 253 |
86%|████████▌ | 181/210 [1:29:26<14:16, 29.52s/it]
|
| 254 |
87%|████████▋ | 182/210 [1:29:55<13:45, 29.48s/it]
|
| 255 |
87%|████████▋ | 183/210 [1:30:25<13:17, 29.55s/it]
|
| 256 |
88%|████████▊ | 184/210 [1:30:54<12:49, 29.60s/it]
|
| 257 |
88%|████████▊ | 185/210 [1:31:24<12:18, 29.54s/it]
|
| 258 |
89%|████████▊ | 186/210 [1:31:53<11:47, 29.50s/it]
|
| 259 |
89%|████████▉ | 187/210 [1:32:23<11:17, 29.47s/it]
|
| 260 |
90%|████████▉ | 188/210 [1:32:52<10:49, 29.54s/it]
|
| 261 |
90%|█████████ | 189/210 [1:33:22<10:21, 29.60s/it]
|
| 262 |
90%|█████████ | 190/210 [1:33:52<09:52, 29.64s/it]
|
| 263 |
|
| 264 |
+
|
| 265 |
90%|█████████ | 190/210 [1:33:52<09:52, 29.64s/it]
|
| 266 |
91%|█████████ | 191/210 [1:34:22<09:25, 29.76s/it]
|
| 267 |
91%|█████████▏| 192/210 [1:34:52<08:55, 29.75s/it]
|
| 268 |
92%|█████████▏| 193/210 [1:35:21<08:25, 29.75s/it]
|
| 269 |
92%|█████████▏| 194/210 [1:35:51<07:55, 29.74s/it]
|
| 270 |
93%|█████████▎| 195/210 [1:36:20<07:24, 29.65s/it]
|
| 271 |
93%|█████████▎| 196/210 [1:36:50<06:55, 29.67s/it]
|
| 272 |
94%|█████████▍| 197/210 [1:37:20<06:27, 29.79s/it]
|
| 273 |
94%|█████████▍| 198/210 [1:37:50<05:56, 29.71s/it]
|
| 274 |
95%|█████████▍| 199/210 [1:38:19<05:25, 29.61s/it]
|
| 275 |
95%|█████████▌| 200/210 [1:38:49<04:55, 29.55s/it]
|
| 276 |
|
| 277 |
+
|
| 278 |
95%|█████████▌| 200/210 [1:38:49<04:55, 29.55s/it]
|
| 279 |
96%|█████████▌| 201/210 [1:39:22<04:36, 30.70s/it]
|
| 280 |
96%|█████████▌| 202/210 [1:39:51<04:02, 30.31s/it]
|
| 281 |
97%|█████████▋| 203/210 [1:40:21<03:30, 30.05s/it]
|
| 282 |
97%|█████████▋| 204/210 [1:40:50<02:59, 29.86s/it]
|
| 283 |
98%|█████████▊| 205/210 [1:41:20<02:28, 29.73s/it]
|
| 284 |
98%|█████████▊| 206/210 [1:41:49<01:58, 29.64s/it]
|
| 285 |
99%|█████████▊| 207/210 [1:42:19<01:28, 29.58s/it]
|
| 286 |
99%|█████████▉| 208/210 [1:42:48<00:59, 29.53s/it]
|
| 287 |
|
| 288 |
+
|
| 289 |
|
| 290 |
+
|
| 291 |
+
***** train metrics *****
|
| 292 |
+
epoch = 1.0
|
| 293 |
+
train_loss = 0.4878
|
| 294 |
+
train_runtime = 1:43:54.34
|
| 295 |
+
train_samples_per_second = 0.405
|
| 296 |
+
train_steps_per_second = 0.034
|
train_results.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"epoch": 1.0,
|
| 3 |
+
"train_loss": 0.4878490357171921,
|
| 4 |
+
"train_runtime": 6234.3462,
|
| 5 |
+
"train_samples_per_second": 0.405,
|
| 6 |
+
"train_steps_per_second": 0.034
|
| 7 |
+
}
|
trainer_state.json
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_metric": null,
|
| 3 |
+
"best_model_checkpoint": null,
|
| 4 |
+
"epoch": 0.997624703087886,
|
| 5 |
+
"eval_steps": 500,
|
| 6 |
+
"global_step": 210,
|
| 7 |
+
"is_hyper_param_search": false,
|
| 8 |
+
"is_local_process_zero": true,
|
| 9 |
+
"is_world_process_zero": true,
|
| 10 |
+
"log_history": [
|
| 11 |
+
{
|
| 12 |
+
"epoch": 0.05,
|
| 13 |
+
"global_step": 10,
|
| 14 |
+
"learning_rate": 4.761904761904762e-05,
|
| 15 |
+
"loss": 0.5879,
|
| 16 |
+
"step": 10
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"epoch": 0.1,
|
| 20 |
+
"global_step": 20,
|
| 21 |
+
"learning_rate": 9.523809523809524e-05,
|
| 22 |
+
"loss": 0.506,
|
| 23 |
+
"step": 20
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"epoch": 0.14,
|
| 27 |
+
"global_step": 30,
|
| 28 |
+
"learning_rate": 0.0001,
|
| 29 |
+
"loss": 0.5252,
|
| 30 |
+
"step": 30
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"epoch": 0.19,
|
| 34 |
+
"global_step": 40,
|
| 35 |
+
"learning_rate": 0.0001,
|
| 36 |
+
"loss": 0.5572,
|
| 37 |
+
"step": 40
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"epoch": 0.24,
|
| 41 |
+
"global_step": 50,
|
| 42 |
+
"learning_rate": 0.0001,
|
| 43 |
+
"loss": 0.4937,
|
| 44 |
+
"step": 50
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"epoch": 0.29,
|
| 48 |
+
"global_step": 60,
|
| 49 |
+
"learning_rate": 0.0001,
|
| 50 |
+
"loss": 0.4925,
|
| 51 |
+
"step": 60
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"epoch": 0.33,
|
| 55 |
+
"global_step": 70,
|
| 56 |
+
"learning_rate": 0.0001,
|
| 57 |
+
"loss": 0.4309,
|
| 58 |
+
"step": 70
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"epoch": 0.38,
|
| 62 |
+
"global_step": 80,
|
| 63 |
+
"learning_rate": 0.0001,
|
| 64 |
+
"loss": 0.4831,
|
| 65 |
+
"step": 80
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"epoch": 0.43,
|
| 69 |
+
"global_step": 90,
|
| 70 |
+
"learning_rate": 0.0001,
|
| 71 |
+
"loss": 0.4896,
|
| 72 |
+
"step": 90
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"epoch": 0.48,
|
| 76 |
+
"global_step": 100,
|
| 77 |
+
"learning_rate": 0.0001,
|
| 78 |
+
"loss": 0.4257,
|
| 79 |
+
"step": 100
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"epoch": 0.52,
|
| 83 |
+
"global_step": 110,
|
| 84 |
+
"learning_rate": 0.0001,
|
| 85 |
+
"loss": 0.5,
|
| 86 |
+
"step": 110
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"epoch": 0.57,
|
| 90 |
+
"global_step": 120,
|
| 91 |
+
"learning_rate": 0.0001,
|
| 92 |
+
"loss": 0.4954,
|
| 93 |
+
"step": 120
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"epoch": 0.62,
|
| 97 |
+
"global_step": 130,
|
| 98 |
+
"learning_rate": 0.0001,
|
| 99 |
+
"loss": 0.4691,
|
| 100 |
+
"step": 130
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"epoch": 0.67,
|
| 104 |
+
"global_step": 140,
|
| 105 |
+
"learning_rate": 0.0001,
|
| 106 |
+
"loss": 0.4373,
|
| 107 |
+
"step": 140
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"epoch": 0.71,
|
| 111 |
+
"global_step": 150,
|
| 112 |
+
"learning_rate": 0.0001,
|
| 113 |
+
"loss": 0.526,
|
| 114 |
+
"step": 150
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 0.76,
|
| 118 |
+
"global_step": 160,
|
| 119 |
+
"learning_rate": 0.0001,
|
| 120 |
+
"loss": 0.4297,
|
| 121 |
+
"step": 160
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"epoch": 0.81,
|
| 125 |
+
"global_step": 170,
|
| 126 |
+
"learning_rate": 0.0001,
|
| 127 |
+
"loss": 0.4708,
|
| 128 |
+
"step": 170
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"epoch": 0.86,
|
| 132 |
+
"global_step": 180,
|
| 133 |
+
"learning_rate": 0.0001,
|
| 134 |
+
"loss": 0.4872,
|
| 135 |
+
"step": 180
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"epoch": 0.9,
|
| 139 |
+
"global_step": 190,
|
| 140 |
+
"learning_rate": 0.0001,
|
| 141 |
+
"loss": 0.4888,
|
| 142 |
+
"step": 190
|
| 143 |
+
},
|
| 144 |
+
{
|
| 145 |
+
"epoch": 0.95,
|
| 146 |
+
"global_step": 200,
|
| 147 |
+
"learning_rate": 0.0001,
|
| 148 |
+
"loss": 0.4754,
|
| 149 |
+
"step": 200
|
| 150 |
+
},
|
| 151 |
+
{
|
| 152 |
+
"epoch": 1.0,
|
| 153 |
+
"global_step": 210,
|
| 154 |
+
"learning_rate": 0.0001,
|
| 155 |
+
"loss": 0.4733,
|
| 156 |
+
"step": 210
|
| 157 |
+
},
|
| 158 |
+
{
|
| 159 |
+
"epoch": 1.0,
|
| 160 |
+
"step": 210,
|
| 161 |
+
"total_flos": 1.1368810483705446e+17,
|
| 162 |
+
"train_loss": 0.4878490357171921,
|
| 163 |
+
"train_runtime": 6234.3462,
|
| 164 |
+
"train_samples_per_second": 0.405,
|
| 165 |
+
"train_steps_per_second": 0.034
|
| 166 |
+
}
|
| 167 |
+
],
|
| 168 |
+
"logging_steps": 10,
|
| 169 |
+
"max_steps": 210,
|
| 170 |
+
"num_train_epochs": 1,
|
| 171 |
+
"save_steps": 100,
|
| 172 |
+
"total_flos": 1.1368810483705446e+17,
|
| 173 |
+
"trial_name": null,
|
| 174 |
+
"trial_params": null
|
| 175 |
+
}
|