riyadule commited on
Commit
a096952
·
verified ·
1 Parent(s): ad08e26

upload initial files

Browse files
Files changed (4) hide show
  1. ck_report.json +14 -0
  2. hyperparams.json +21 -0
  3. log.out +889 -0
  4. modules.json +14 -0
ck_report.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "training_examples (when pos_num=1 for ranking)": 130556,
3
+ "evaluation_steps": 200,
4
+ "train_batch_size": 16,
5
+ "epoch": 1,
6
+ "total_epochs": 5,
7
+ "steps": 8000,
8
+ "saved_at_total_steps": 8000,
9
+ "steps_per_epoch": 8160,
10
+ "eval_scores_on_dev": {
11
+ "loss": 1.1733529567718506,
12
+ "perplexity": 3.232813835144043
13
+ }
14
+ }
hyperparams.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_select": "distilgpt2",
3
+ "dataset_name": "source_code",
4
+ "per_gpu_train_batch_size": 4,
5
+ "dev_batch_size": 8,
6
+ "num_epochs_train": 5,
7
+ "max_seq_length": 256,
8
+ "lr": 2e-05,
9
+ "warmup_ratio": 0.2,
10
+ "early_stop": 3,
11
+ "scheduler": "warmuplinear",
12
+ "seed": 122,
13
+ "accumulation_steps": 1,
14
+ "n_gpu": 4,
15
+ "visiable_device": "0",
16
+ "evaluation_steps": 200,
17
+ "wandb_project_name": "code_generate",
18
+ "restore_training": false,
19
+ "with_wandb": true,
20
+ "wandb_run_name": "model/distilgpt2_fine_tuned_coder"
21
+ }
log.out ADDED
@@ -0,0 +1,889 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-04-26 02:18:42,687 - trainer - INFO - Use pytorch device: cuda, with gpu_number=4
2
+ 2024-04-26 02:18:42,687 - trainer - INFO - see seed for random, numpy and torch 122
3
+ 2024-04-26 02:18:43,540 - trainer - INFO - module.0.gpt.transformer.wte.weight torch.Size([50259, 768])
4
+ 2024-04-26 02:18:43,540 - trainer - INFO - module.0.gpt.transformer.wpe.weight torch.Size([1024, 768])
5
+ 2024-04-26 02:18:43,542 - trainer - INFO - module.0.gpt.transformer.h.0.ln_1.weight torch.Size([768])
6
+ 2024-04-26 02:18:43,542 - trainer - INFO - module.0.gpt.transformer.h.0.ln_1.bias torch.Size([768])
7
+ 2024-04-26 02:18:43,543 - trainer - INFO - module.0.gpt.transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
8
+ 2024-04-26 02:18:43,543 - trainer - INFO - module.0.gpt.transformer.h.0.attn.c_attn.bias torch.Size([2304])
9
+ 2024-04-26 02:18:43,544 - trainer - INFO - module.0.gpt.transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
10
+ 2024-04-26 02:18:43,544 - trainer - INFO - module.0.gpt.transformer.h.0.attn.c_proj.bias torch.Size([768])
11
+ 2024-04-26 02:18:43,544 - trainer - INFO - module.0.gpt.transformer.h.0.ln_2.weight torch.Size([768])
12
+ 2024-04-26 02:18:43,545 - trainer - INFO - module.0.gpt.transformer.h.0.ln_2.bias torch.Size([768])
13
+ 2024-04-26 02:18:43,545 - trainer - INFO - module.0.gpt.transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
14
+ 2024-04-26 02:18:43,546 - trainer - INFO - module.0.gpt.transformer.h.0.mlp.c_fc.bias torch.Size([3072])
15
+ 2024-04-26 02:18:43,546 - trainer - INFO - module.0.gpt.transformer.h.0.mlp.c_proj.weight torch.Size([3072, 768])
16
+ 2024-04-26 02:18:43,547 - trainer - INFO - module.0.gpt.transformer.h.0.mlp.c_proj.bias torch.Size([768])
17
+ 2024-04-26 02:18:43,547 - trainer - INFO - module.0.gpt.transformer.h.1.ln_1.weight torch.Size([768])
18
+ 2024-04-26 02:18:43,547 - trainer - INFO - module.0.gpt.transformer.h.1.ln_1.bias torch.Size([768])
19
+ 2024-04-26 02:18:43,548 - trainer - INFO - module.0.gpt.transformer.h.1.attn.c_attn.weight torch.Size([768, 2304])
20
+ 2024-04-26 02:18:43,548 - trainer - INFO - module.0.gpt.transformer.h.1.attn.c_attn.bias torch.Size([2304])
21
+ 2024-04-26 02:18:43,549 - trainer - INFO - module.0.gpt.transformer.h.1.attn.c_proj.weight torch.Size([768, 768])
22
+ 2024-04-26 02:18:43,549 - trainer - INFO - module.0.gpt.transformer.h.1.attn.c_proj.bias torch.Size([768])
23
+ 2024-04-26 02:18:43,549 - trainer - INFO - module.0.gpt.transformer.h.1.ln_2.weight torch.Size([768])
24
+ 2024-04-26 02:18:43,550 - trainer - INFO - module.0.gpt.transformer.h.1.ln_2.bias torch.Size([768])
25
+ 2024-04-26 02:18:43,550 - trainer - INFO - module.0.gpt.transformer.h.1.mlp.c_fc.weight torch.Size([768, 3072])
26
+ 2024-04-26 02:18:43,551 - trainer - INFO - module.0.gpt.transformer.h.1.mlp.c_fc.bias torch.Size([3072])
27
+ 2024-04-26 02:18:43,551 - trainer - INFO - module.0.gpt.transformer.h.1.mlp.c_proj.weight torch.Size([3072, 768])
28
+ 2024-04-26 02:18:43,551 - trainer - INFO - module.0.gpt.transformer.h.1.mlp.c_proj.bias torch.Size([768])
29
+ 2024-04-26 02:18:43,552 - trainer - INFO - module.0.gpt.transformer.h.2.ln_1.weight torch.Size([768])
30
+ 2024-04-26 02:18:43,552 - trainer - INFO - module.0.gpt.transformer.h.2.ln_1.bias torch.Size([768])
31
+ 2024-04-26 02:18:43,553 - trainer - INFO - module.0.gpt.transformer.h.2.attn.c_attn.weight torch.Size([768, 2304])
32
+ 2024-04-26 02:18:43,553 - trainer - INFO - module.0.gpt.transformer.h.2.attn.c_attn.bias torch.Size([2304])
33
+ 2024-04-26 02:18:43,554 - trainer - INFO - module.0.gpt.transformer.h.2.attn.c_proj.weight torch.Size([768, 768])
34
+ 2024-04-26 02:18:43,554 - trainer - INFO - module.0.gpt.transformer.h.2.attn.c_proj.bias torch.Size([768])
35
+ 2024-04-26 02:18:43,554 - trainer - INFO - module.0.gpt.transformer.h.2.ln_2.weight torch.Size([768])
36
+ 2024-04-26 02:18:43,555 - trainer - INFO - module.0.gpt.transformer.h.2.ln_2.bias torch.Size([768])
37
+ 2024-04-26 02:18:43,555 - trainer - INFO - module.0.gpt.transformer.h.2.mlp.c_fc.weight torch.Size([768, 3072])
38
+ 2024-04-26 02:18:43,555 - trainer - INFO - module.0.gpt.transformer.h.2.mlp.c_fc.bias torch.Size([3072])
39
+ 2024-04-26 02:18:43,556 - trainer - INFO - module.0.gpt.transformer.h.2.mlp.c_proj.weight torch.Size([3072, 768])
40
+ 2024-04-26 02:18:43,556 - trainer - INFO - module.0.gpt.transformer.h.2.mlp.c_proj.bias torch.Size([768])
41
+ 2024-04-26 02:18:43,557 - trainer - INFO - module.0.gpt.transformer.h.3.ln_1.weight torch.Size([768])
42
+ 2024-04-26 02:18:43,557 - trainer - INFO - module.0.gpt.transformer.h.3.ln_1.bias torch.Size([768])
43
+ 2024-04-26 02:18:43,558 - trainer - INFO - module.0.gpt.transformer.h.3.attn.c_attn.weight torch.Size([768, 2304])
44
+ 2024-04-26 02:18:43,558 - trainer - INFO - module.0.gpt.transformer.h.3.attn.c_attn.bias torch.Size([2304])
45
+ 2024-04-26 02:18:43,559 - trainer - INFO - module.0.gpt.transformer.h.3.attn.c_proj.weight torch.Size([768, 768])
46
+ 2024-04-26 02:18:43,559 - trainer - INFO - module.0.gpt.transformer.h.3.attn.c_proj.bias torch.Size([768])
47
+ 2024-04-26 02:18:43,559 - trainer - INFO - module.0.gpt.transformer.h.3.ln_2.weight torch.Size([768])
48
+ 2024-04-26 02:18:43,560 - trainer - INFO - module.0.gpt.transformer.h.3.ln_2.bias torch.Size([768])
49
+ 2024-04-26 02:18:43,560 - trainer - INFO - module.0.gpt.transformer.h.3.mlp.c_fc.weight torch.Size([768, 3072])
50
+ 2024-04-26 02:18:43,561 - trainer - INFO - module.0.gpt.transformer.h.3.mlp.c_fc.bias torch.Size([3072])
51
+ 2024-04-26 02:18:43,561 - trainer - INFO - module.0.gpt.transformer.h.3.mlp.c_proj.weight torch.Size([3072, 768])
52
+ 2024-04-26 02:18:43,562 - trainer - INFO - module.0.gpt.transformer.h.3.mlp.c_proj.bias torch.Size([768])
53
+ 2024-04-26 02:18:43,562 - trainer - INFO - module.0.gpt.transformer.h.4.ln_1.weight torch.Size([768])
54
+ 2024-04-26 02:18:43,562 - trainer - INFO - module.0.gpt.transformer.h.4.ln_1.bias torch.Size([768])
55
+ 2024-04-26 02:18:43,563 - trainer - INFO - module.0.gpt.transformer.h.4.attn.c_attn.weight torch.Size([768, 2304])
56
+ 2024-04-26 02:18:43,563 - trainer - INFO - module.0.gpt.transformer.h.4.attn.c_attn.bias torch.Size([2304])
57
+ 2024-04-26 02:18:43,564 - trainer - INFO - module.0.gpt.transformer.h.4.attn.c_proj.weight torch.Size([768, 768])
58
+ 2024-04-26 02:18:43,564 - trainer - INFO - module.0.gpt.transformer.h.4.attn.c_proj.bias torch.Size([768])
59
+ 2024-04-26 02:18:43,564 - trainer - INFO - module.0.gpt.transformer.h.4.ln_2.weight torch.Size([768])
60
+ 2024-04-26 02:18:43,565 - trainer - INFO - module.0.gpt.transformer.h.4.ln_2.bias torch.Size([768])
61
+ 2024-04-26 02:18:43,565 - trainer - INFO - module.0.gpt.transformer.h.4.mlp.c_fc.weight torch.Size([768, 3072])
62
+ 2024-04-26 02:18:43,566 - trainer - INFO - module.0.gpt.transformer.h.4.mlp.c_fc.bias torch.Size([3072])
63
+ 2024-04-26 02:18:43,566 - trainer - INFO - module.0.gpt.transformer.h.4.mlp.c_proj.weight torch.Size([3072, 768])
64
+ 2024-04-26 02:18:43,567 - trainer - INFO - module.0.gpt.transformer.h.4.mlp.c_proj.bias torch.Size([768])
65
+ 2024-04-26 02:18:43,567 - trainer - INFO - module.0.gpt.transformer.h.5.ln_1.weight torch.Size([768])
66
+ 2024-04-26 02:18:43,567 - trainer - INFO - module.0.gpt.transformer.h.5.ln_1.bias torch.Size([768])
67
+ 2024-04-26 02:18:43,568 - trainer - INFO - module.0.gpt.transformer.h.5.attn.c_attn.weight torch.Size([768, 2304])
68
+ 2024-04-26 02:18:43,568 - trainer - INFO - module.0.gpt.transformer.h.5.attn.c_attn.bias torch.Size([2304])
69
+ 2024-04-26 02:18:43,569 - trainer - INFO - module.0.gpt.transformer.h.5.attn.c_proj.weight torch.Size([768, 768])
70
+ 2024-04-26 02:18:43,569 - trainer - INFO - module.0.gpt.transformer.h.5.attn.c_proj.bias torch.Size([768])
71
+ 2024-04-26 02:18:43,570 - trainer - INFO - module.0.gpt.transformer.h.5.ln_2.weight torch.Size([768])
72
+ 2024-04-26 02:18:43,570 - trainer - INFO - module.0.gpt.transformer.h.5.ln_2.bias torch.Size([768])
73
+ 2024-04-26 02:18:43,570 - trainer - INFO - module.0.gpt.transformer.h.5.mlp.c_fc.weight torch.Size([768, 3072])
74
+ 2024-04-26 02:18:43,571 - trainer - INFO - module.0.gpt.transformer.h.5.mlp.c_fc.bias torch.Size([3072])
75
+ 2024-04-26 02:18:43,571 - trainer - INFO - module.0.gpt.transformer.h.5.mlp.c_proj.weight torch.Size([3072, 768])
76
+ 2024-04-26 02:18:43,572 - trainer - INFO - module.0.gpt.transformer.h.5.mlp.c_proj.bias torch.Size([768])
77
+ 2024-04-26 02:18:43,572 - trainer - INFO - module.0.gpt.transformer.ln_f.weight torch.Size([768])
78
+ 2024-04-26 02:18:43,573 - trainer - INFO - module.0.gpt.transformer.ln_f.bias torch.Size([768])
79
+ 2024-04-26 02:18:43,573 - trainer - INFO - module.0.gpt.lm_head.weight torch.Size([50259, 768])
80
+ 2024-04-26 02:18:43,573 - trainer - INFO - DataParallel(
81
+ (module): Sequential(
82
+ (0): GPTSingleHead(
83
+ (gpt): GPT2LMHeadModel(
84
+ (transformer): GPT2Model(
85
+ (wte): Embedding(50259, 768)
86
+ (wpe): Embedding(1024, 768)
87
+ (drop): Dropout(p=0.1, inplace=False)
88
+ (h): ModuleList(
89
+ (0-5): 6 x GPT2Block(
90
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
91
+ (attn): GPT2Attention(
92
+ (c_attn): Conv1D()
93
+ (c_proj): Conv1D()
94
+ (attn_dropout): Dropout(p=0.1, inplace=False)
95
+ (resid_dropout): Dropout(p=0.1, inplace=False)
96
+ )
97
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
98
+ (mlp): GPT2MLP(
99
+ (c_fc): Conv1D()
100
+ (c_proj): Conv1D()
101
+ (act): NewGELUActivation()
102
+ (dropout): Dropout(p=0.1, inplace=False)
103
+ )
104
+ )
105
+ )
106
+ (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
107
+ )
108
+ (lm_head): Linear(in_features=768, out_features=50259, bias=False)
109
+ )
110
+ )
111
+ (1): EmptyHeads()
112
+ )
113
+ )
114
+ 2024-04-26 02:18:43,574 - trainer - INFO - Total params: 81914112
115
+ 2024-04-26 02:18:43,574 - trainer - INFO - Trainable params: 81914112
116
+ 2024-04-26 02:18:43,574 - trainer - INFO - Non-trainable params: 0
117
+ 2024-04-26 02:18:43,590 - trainer - INFO - Warmup-steps: 8160
118
+ 2024-04-26 02:18:43,594 - trainer - INFO - ***** Running training *****
119
+ 2024-04-26 02:18:43,594 - trainer - INFO - Num of training examples (actually iterations per epoch for Iterable Dataset) = 130556
120
+ 2024-04-26 02:18:43,594 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
121
+ 2024-04-26 02:18:43,594 - trainer - INFO - Steps per Epoch = 8160 or iterations per epoch = 8160
122
+ 2024-04-26 02:18:43,594 - trainer - INFO - Num of Epochs = 5
123
+ 2024-04-26 02:18:43,594 - trainer - INFO - Best score (perplexity) = -inf
124
+ 2024-04-26 02:18:43,594 - trainer - INFO - Eval every 200 steps or every 200 iterations
125
+ 2024-04-26 02:18:43,594 - trainer - INFO - Early stop = 3
126
+ 2024-04-26 02:18:43,594 - trainer - INFO - Gradient Accumulation steps = 1
127
+ 2024-04-26 02:18:43,594 - trainer - INFO - Total optimization steps = 40800
128
+ 2024-04-26 02:18:43,594 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
129
+ 2024-04-26 02:25:03,634 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
130
+ 2024-04-26 02:25:04,787 - trainer - INFO - Save check-point at epoch=0 step=200
131
+ 2024-04-26 02:25:04,788 - trainer - INFO - ***** Evaluation report *****
132
+ 2024-04-26 02:25:04,788 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
133
+ 2024-04-26 02:25:04,788 - trainer - INFO - Early stop on: perplexity
134
+ 2024-04-26 02:25:04,788 - trainer - INFO - Early stop count = 0/3
135
+ 2024-04-26 02:25:04,788 - trainer - INFO - Eval steps = 200 or (iterations = 200)
136
+ 2024-04-26 02:25:04,788 - trainer - INFO - Best score (perplexity) = -270.8600158691406
137
+ 2024-04-26 02:25:04,788 - trainer - INFO - Gradient Accumulation steps = 1
138
+ 2024-04-26 02:25:04,788 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
139
+ 2024-04-26 02:25:04,788 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
140
+ 2024-04-26 02:25:04,788 - trainer - INFO - Time spent since last evaluation = 0h 6m 21s
141
+ 2024-04-26 02:25:04,788 - trainer - INFO - Epoch = 1/5
142
+ 2024-04-26 02:25:04,788 - trainer - INFO - Steps = 200/40800
143
+ 2024-04-26 02:25:04,788 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
144
+ 2024-04-26 02:25:04,788 - trainer - INFO - dev_loss = 5.601602 || dev_eval_scores = {'perplexity': 270.8600158691406}
145
+ 2024-04-26 02:25:04,789 - trainer - INFO - train_loss = 14.094216346740723
146
+ 2024-04-26 02:25:04,789 - trainer - INFO -
147
+ ********************************************
148
+ 2024-04-26 02:31:25,346 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
149
+ 2024-04-26 02:31:31,186 - trainer - INFO - Save check-point at epoch=0 step=400
150
+ 2024-04-26 02:31:31,187 - trainer - INFO - ***** Evaluation report *****
151
+ 2024-04-26 02:31:31,187 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
152
+ 2024-04-26 02:31:31,187 - trainer - INFO - Early stop on: perplexity
153
+ 2024-04-26 02:31:31,187 - trainer - INFO - Early stop count = 0/3
154
+ 2024-04-26 02:31:31,187 - trainer - INFO - Eval steps = 200 or (iterations = 200)
155
+ 2024-04-26 02:31:31,187 - trainer - INFO - Best score (perplexity) = -10.156302452087402
156
+ 2024-04-26 02:31:31,187 - trainer - INFO - Gradient Accumulation steps = 1
157
+ 2024-04-26 02:31:31,187 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
158
+ 2024-04-26 02:31:31,187 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
159
+ 2024-04-26 02:31:31,187 - trainer - INFO - Time spent since last evaluation = 0h 6m 26s
160
+ 2024-04-26 02:31:31,187 - trainer - INFO - Epoch = 1/5
161
+ 2024-04-26 02:31:31,187 - trainer - INFO - Steps = 400/40800
162
+ 2024-04-26 02:31:31,187 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
163
+ 2024-04-26 02:31:31,187 - trainer - INFO - dev_loss = 2.318094 || dev_eval_scores = {'perplexity': 10.156302452087402}
164
+ 2024-04-26 02:31:31,220 - trainer - INFO - train_loss = 8.5648775100708
165
+ 2024-04-26 02:31:31,220 - trainer - INFO -
166
+ ********************************************
167
+ 2024-04-26 02:37:51,756 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
168
+ 2024-04-26 02:37:57,711 - trainer - INFO - Save check-point at epoch=0 step=600
169
+ 2024-04-26 02:37:57,711 - trainer - INFO - ***** Evaluation report *****
170
+ 2024-04-26 02:37:57,711 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
171
+ 2024-04-26 02:37:57,711 - trainer - INFO - Early stop on: perplexity
172
+ 2024-04-26 02:37:57,711 - trainer - INFO - Early stop count = 0/3
173
+ 2024-04-26 02:37:57,711 - trainer - INFO - Eval steps = 200 or (iterations = 200)
174
+ 2024-04-26 02:37:57,711 - trainer - INFO - Best score (perplexity) = -7.607259750366211
175
+ 2024-04-26 02:37:57,712 - trainer - INFO - Gradient Accumulation steps = 1
176
+ 2024-04-26 02:37:57,712 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
177
+ 2024-04-26 02:37:57,712 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
178
+ 2024-04-26 02:37:57,712 - trainer - INFO - Time spent since last evaluation = 0h 6m 26s
179
+ 2024-04-26 02:37:57,712 - trainer - INFO - Epoch = 1/5
180
+ 2024-04-26 02:37:57,712 - trainer - INFO - Steps = 600/40800
181
+ 2024-04-26 02:37:57,712 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
182
+ 2024-04-26 02:37:57,712 - trainer - INFO - dev_loss = 2.029103 || dev_eval_scores = {'perplexity': 7.607259750366211}
183
+ 2024-04-26 02:37:57,712 - trainer - INFO - train_loss = 6.4544525146484375
184
+ 2024-04-26 02:37:57,712 - trainer - INFO -
185
+ ********************************************
186
+ 2024-04-26 02:44:17,920 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
187
+ 2024-04-26 02:44:23,766 - trainer - INFO - Save check-point at epoch=0 step=800
188
+ 2024-04-26 02:44:23,766 - trainer - INFO - ***** Evaluation report *****
189
+ 2024-04-26 02:44:23,767 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
190
+ 2024-04-26 02:44:23,767 - trainer - INFO - Early stop on: perplexity
191
+ 2024-04-26 02:44:23,767 - trainer - INFO - Early stop count = 0/3
192
+ 2024-04-26 02:44:23,767 - trainer - INFO - Eval steps = 200 or (iterations = 200)
193
+ 2024-04-26 02:44:23,767 - trainer - INFO - Best score (perplexity) = -6.791029453277588
194
+ 2024-04-26 02:44:23,767 - trainer - INFO - Gradient Accumulation steps = 1
195
+ 2024-04-26 02:44:23,767 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
196
+ 2024-04-26 02:44:23,767 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
197
+ 2024-04-26 02:44:23,767 - trainer - INFO - Time spent since last evaluation = 0h 6m 26s
198
+ 2024-04-26 02:44:23,767 - trainer - INFO - Epoch = 1/5
199
+ 2024-04-26 02:44:23,767 - trainer - INFO - Steps = 800/40800
200
+ 2024-04-26 02:44:23,767 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
201
+ 2024-04-26 02:44:23,767 - trainer - INFO - dev_loss = 1.915603 || dev_eval_scores = {'perplexity': 6.791029453277588}
202
+ 2024-04-26 02:44:23,767 - trainer - INFO - train_loss = 5.3493781089782715
203
+ 2024-04-26 02:44:23,768 - trainer - INFO -
204
+ ********************************************
205
+ 2024-04-26 02:50:43,526 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
206
+ 2024-04-26 02:50:49,464 - trainer - INFO - Save check-point at epoch=0 step=1000
207
+ 2024-04-26 02:50:49,464 - trainer - INFO - ***** Evaluation report *****
208
+ 2024-04-26 02:50:49,464 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
209
+ 2024-04-26 02:50:49,464 - trainer - INFO - Early stop on: perplexity
210
+ 2024-04-26 02:50:49,464 - trainer - INFO - Early stop count = 0/3
211
+ 2024-04-26 02:50:49,464 - trainer - INFO - Eval steps = 200 or (iterations = 200)
212
+ 2024-04-26 02:50:49,464 - trainer - INFO - Best score (perplexity) = -6.073063373565674
213
+ 2024-04-26 02:50:49,464 - trainer - INFO - Gradient Accumulation steps = 1
214
+ 2024-04-26 02:50:49,464 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
215
+ 2024-04-26 02:50:49,464 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
216
+ 2024-04-26 02:50:49,464 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
217
+ 2024-04-26 02:50:49,465 - trainer - INFO - Epoch = 1/5
218
+ 2024-04-26 02:50:49,465 - trainer - INFO - Steps = 1000/40800
219
+ 2024-04-26 02:50:49,465 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
220
+ 2024-04-26 02:50:49,465 - trainer - INFO - dev_loss = 1.803863 || dev_eval_scores = {'perplexity': 6.073063373565674}
221
+ 2024-04-26 02:50:49,465 - trainer - INFO - train_loss = 4.66662073135376
222
+ 2024-04-26 02:50:49,465 - trainer - INFO -
223
+ ********************************************
224
+ 2024-04-26 02:57:09,707 - trainer - INFO - ***** Evaluation report *****
225
+ 2024-04-26 02:57:09,707 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
226
+ 2024-04-26 02:57:09,707 - trainer - INFO - Early stop on: perplexity
227
+ 2024-04-26 02:57:09,707 - trainer - INFO - Early stop count = 1/3
228
+ 2024-04-26 02:57:09,707 - trainer - INFO - Eval steps = 200 or (iterations = 200)
229
+ 2024-04-26 02:57:09,707 - trainer - INFO - Best score (perplexity) = -6.073063373565674
230
+ 2024-04-26 02:57:09,708 - trainer - INFO - Gradient Accumulation steps = 1
231
+ 2024-04-26 02:57:09,708 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
232
+ 2024-04-26 02:57:09,708 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
233
+ 2024-04-26 02:57:09,708 - trainer - INFO - Time spent since last evaluation = 0h 6m 20s
234
+ 2024-04-26 02:57:09,708 - trainer - INFO - Epoch = 1/5
235
+ 2024-04-26 02:57:09,708 - trainer - INFO - Steps = 1200/40800
236
+ 2024-04-26 02:57:09,708 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
237
+ 2024-04-26 02:57:09,708 - trainer - INFO - dev_loss = 1.808444 || dev_eval_scores = {'perplexity': 6.100945472717285}
238
+ 2024-04-26 02:57:09,708 - trainer - INFO - train_loss = 4.205338001251221
239
+ 2024-04-26 02:57:09,708 - trainer - INFO -
240
+ ********************************************
241
+ 2024-04-26 03:03:30,335 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
242
+ 2024-04-26 03:03:36,292 - trainer - INFO - Save check-point at epoch=0 step=1400
243
+ 2024-04-26 03:03:36,292 - trainer - INFO - ***** Evaluation report *****
244
+ 2024-04-26 03:03:36,292 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
245
+ 2024-04-26 03:03:36,292 - trainer - INFO - Early stop on: perplexity
246
+ 2024-04-26 03:03:36,292 - trainer - INFO - Early stop count = 0/3
247
+ 2024-04-26 03:03:36,292 - trainer - INFO - Eval steps = 200 or (iterations = 200)
248
+ 2024-04-26 03:03:36,292 - trainer - INFO - Best score (perplexity) = -5.51066780090332
249
+ 2024-04-26 03:03:36,292 - trainer - INFO - Gradient Accumulation steps = 1
250
+ 2024-04-26 03:03:36,292 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
251
+ 2024-04-26 03:03:36,292 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
252
+ 2024-04-26 03:03:36,292 - trainer - INFO - Time spent since last evaluation = 0h 6m 26s
253
+ 2024-04-26 03:03:36,293 - trainer - INFO - Epoch = 1/5
254
+ 2024-04-26 03:03:36,293 - trainer - INFO - Steps = 1400/40800
255
+ 2024-04-26 03:03:36,293 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
256
+ 2024-04-26 03:03:36,293 - trainer - INFO - dev_loss = 1.706686 || dev_eval_scores = {'perplexity': 5.51066780090332}
257
+ 2024-04-26 03:03:36,293 - trainer - INFO - train_loss = 3.8646857738494873
258
+ 2024-04-26 03:03:36,293 - trainer - INFO -
259
+ ********************************************
260
+ 2024-04-26 03:09:56,141 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
261
+ 2024-04-26 03:10:02,087 - trainer - INFO - Save check-point at epoch=0 step=1600
262
+ 2024-04-26 03:10:02,087 - trainer - INFO - ***** Evaluation report *****
263
+ 2024-04-26 03:10:02,088 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
264
+ 2024-04-26 03:10:02,088 - trainer - INFO - Early stop on: perplexity
265
+ 2024-04-26 03:10:02,088 - trainer - INFO - Early stop count = 0/3
266
+ 2024-04-26 03:10:02,088 - trainer - INFO - Eval steps = 200 or (iterations = 200)
267
+ 2024-04-26 03:10:02,088 - trainer - INFO - Best score (perplexity) = -5.361582279205322
268
+ 2024-04-26 03:10:02,088 - trainer - INFO - Gradient Accumulation steps = 1
269
+ 2024-04-26 03:10:02,088 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
270
+ 2024-04-26 03:10:02,088 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
271
+ 2024-04-26 03:10:02,088 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
272
+ 2024-04-26 03:10:02,088 - trainer - INFO - Epoch = 1/5
273
+ 2024-04-26 03:10:02,088 - trainer - INFO - Steps = 1600/40800
274
+ 2024-04-26 03:10:02,088 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
275
+ 2024-04-26 03:10:02,088 - trainer - INFO - dev_loss = 1.679259 || dev_eval_scores = {'perplexity': 5.361582279205322}
276
+ 2024-04-26 03:10:02,088 - trainer - INFO - train_loss = 3.60662579536438
277
+ 2024-04-26 03:10:02,089 - trainer - INFO -
278
+ ********************************************
279
+ 2024-04-26 03:16:22,224 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
280
+ 2024-04-26 03:16:28,030 - trainer - INFO - Save check-point at epoch=0 step=1800
281
+ 2024-04-26 03:16:28,030 - trainer - INFO - ***** Evaluation report *****
282
+ 2024-04-26 03:16:28,030 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
283
+ 2024-04-26 03:16:28,030 - trainer - INFO - Early stop on: perplexity
284
+ 2024-04-26 03:16:28,030 - trainer - INFO - Early stop count = 0/3
285
+ 2024-04-26 03:16:28,030 - trainer - INFO - Eval steps = 200 or (iterations = 200)
286
+ 2024-04-26 03:16:28,030 - trainer - INFO - Best score (perplexity) = -5.1808762550354
287
+ 2024-04-26 03:16:28,031 - trainer - INFO - Gradient Accumulation steps = 1
288
+ 2024-04-26 03:16:28,031 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
289
+ 2024-04-26 03:16:28,031 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
290
+ 2024-04-26 03:16:28,031 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
291
+ 2024-04-26 03:16:28,031 - trainer - INFO - Epoch = 1/5
292
+ 2024-04-26 03:16:28,031 - trainer - INFO - Steps = 1800/40800
293
+ 2024-04-26 03:16:28,031 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
294
+ 2024-04-26 03:16:28,031 - trainer - INFO - dev_loss = 1.644974 || dev_eval_scores = {'perplexity': 5.1808762550354}
295
+ 2024-04-26 03:16:28,031 - trainer - INFO - train_loss = 3.401608943939209
296
+ 2024-04-26 03:16:28,031 - trainer - INFO -
297
+ ********************************************
298
+ 2024-04-26 03:22:47,629 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
299
+ 2024-04-26 03:22:53,548 - trainer - INFO - Save check-point at epoch=0 step=2000
300
+ 2024-04-26 03:22:53,549 - trainer - INFO - ***** Evaluation report *****
301
+ 2024-04-26 03:22:53,549 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
302
+ 2024-04-26 03:22:53,549 - trainer - INFO - Early stop on: perplexity
303
+ 2024-04-26 03:22:53,549 - trainer - INFO - Early stop count = 0/3
304
+ 2024-04-26 03:22:53,549 - trainer - INFO - Eval steps = 200 or (iterations = 200)
305
+ 2024-04-26 03:22:53,549 - trainer - INFO - Best score (perplexity) = -4.970845699310303
306
+ 2024-04-26 03:22:53,549 - trainer - INFO - Gradient Accumulation steps = 1
307
+ 2024-04-26 03:22:53,549 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
308
+ 2024-04-26 03:22:53,549 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
309
+ 2024-04-26 03:22:53,549 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
310
+ 2024-04-26 03:22:53,549 - trainer - INFO - Epoch = 1/5
311
+ 2024-04-26 03:22:53,549 - trainer - INFO - Steps = 2000/40800
312
+ 2024-04-26 03:22:53,549 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
313
+ 2024-04-26 03:22:53,549 - trainer - INFO - dev_loss = 1.603590 || dev_eval_scores = {'perplexity': 4.970845699310303}
314
+ 2024-04-26 03:22:53,550 - trainer - INFO - train_loss = 3.2337915897369385
315
+ 2024-04-26 03:22:53,550 - trainer - INFO -
316
+ ********************************************
317
+ 2024-04-26 03:29:13,045 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
318
+ 2024-04-26 03:29:18,974 - trainer - INFO - Save check-point at epoch=0 step=2200
319
+ 2024-04-26 03:29:18,975 - trainer - INFO - ***** Evaluation report *****
320
+ 2024-04-26 03:29:18,975 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
321
+ 2024-04-26 03:29:18,975 - trainer - INFO - Early stop on: perplexity
322
+ 2024-04-26 03:29:18,975 - trainer - INFO - Early stop count = 0/3
323
+ 2024-04-26 03:29:18,975 - trainer - INFO - Eval steps = 200 or (iterations = 200)
324
+ 2024-04-26 03:29:18,975 - trainer - INFO - Best score (perplexity) = -4.858333587646484
325
+ 2024-04-26 03:29:18,975 - trainer - INFO - Gradient Accumulation steps = 1
326
+ 2024-04-26 03:29:18,975 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
327
+ 2024-04-26 03:29:18,975 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
328
+ 2024-04-26 03:29:18,976 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
329
+ 2024-04-26 03:29:18,976 - trainer - INFO - Epoch = 1/5
330
+ 2024-04-26 03:29:18,976 - trainer - INFO - Steps = 2200/40800
331
+ 2024-04-26 03:29:18,976 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
332
+ 2024-04-26 03:29:18,976 - trainer - INFO - dev_loss = 1.580696 || dev_eval_scores = {'perplexity': 4.858333587646484}
333
+ 2024-04-26 03:29:18,976 - trainer - INFO - train_loss = 3.092155694961548
334
+ 2024-04-26 03:29:18,976 - trainer - INFO -
335
+ ********************************************
336
+ 2024-04-26 03:35:38,899 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
337
+ 2024-04-26 03:35:44,832 - trainer - INFO - Save check-point at epoch=0 step=2400
338
+ 2024-04-26 03:35:44,832 - trainer - INFO - ***** Evaluation report *****
339
+ 2024-04-26 03:35:44,832 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
340
+ 2024-04-26 03:35:44,832 - trainer - INFO - Early stop on: perplexity
341
+ 2024-04-26 03:35:44,832 - trainer - INFO - Early stop count = 0/3
342
+ 2024-04-26 03:35:44,832 - trainer - INFO - Eval steps = 200 or (iterations = 200)
343
+ 2024-04-26 03:35:44,832 - trainer - INFO - Best score (perplexity) = -4.7346601486206055
344
+ 2024-04-26 03:35:44,833 - trainer - INFO - Gradient Accumulation steps = 1
345
+ 2024-04-26 03:35:44,833 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
346
+ 2024-04-26 03:35:44,833 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
347
+ 2024-04-26 03:35:44,833 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
348
+ 2024-04-26 03:35:44,833 - trainer - INFO - Epoch = 1/5
349
+ 2024-04-26 03:35:44,833 - trainer - INFO - Steps = 2400/40800
350
+ 2024-04-26 03:35:44,833 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
351
+ 2024-04-26 03:35:44,833 - trainer - INFO - dev_loss = 1.554910 || dev_eval_scores = {'perplexity': 4.7346601486206055}
352
+ 2024-04-26 03:35:44,833 - trainer - INFO - train_loss = 2.974703311920166
353
+ 2024-04-26 03:35:44,833 - trainer - INFO -
354
+ ********************************************
355
+ 2024-04-26 03:42:04,939 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
356
+ 2024-04-26 03:42:10,876 - trainer - INFO - Save check-point at epoch=0 step=2600
357
+ 2024-04-26 03:42:10,877 - trainer - INFO - ***** Evaluation report *****
358
+ 2024-04-26 03:42:10,877 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
359
+ 2024-04-26 03:42:10,877 - trainer - INFO - Early stop on: perplexity
360
+ 2024-04-26 03:42:10,877 - trainer - INFO - Early stop count = 0/3
361
+ 2024-04-26 03:42:10,877 - trainer - INFO - Eval steps = 200 or (iterations = 200)
362
+ 2024-04-26 03:42:10,877 - trainer - INFO - Best score (perplexity) = -4.624922275543213
363
+ 2024-04-26 03:42:10,877 - trainer - INFO - Gradient Accumulation steps = 1
364
+ 2024-04-26 03:42:10,877 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
365
+ 2024-04-26 03:42:10,877 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
366
+ 2024-04-26 03:42:10,877 - trainer - INFO - Time spent since last evaluation = 0h 6m 26s
367
+ 2024-04-26 03:42:10,877 - trainer - INFO - Epoch = 1/5
368
+ 2024-04-26 03:42:10,877 - trainer - INFO - Steps = 2600/40800
369
+ 2024-04-26 03:42:10,877 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
370
+ 2024-04-26 03:42:10,877 - trainer - INFO - dev_loss = 1.531460 || dev_eval_scores = {'perplexity': 4.624922275543213}
371
+ 2024-04-26 03:42:10,878 - trainer - INFO - train_loss = 2.8716752529144287
372
+ 2024-04-26 03:42:10,878 - trainer - INFO -
373
+ ********************************************
374
+ 2024-04-26 03:48:30,754 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
375
+ 2024-04-26 03:48:36,689 - trainer - INFO - Save check-point at epoch=0 step=2800
376
+ 2024-04-26 03:48:36,690 - trainer - INFO - ***** Evaluation report *****
377
+ 2024-04-26 03:48:36,690 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
378
+ 2024-04-26 03:48:36,690 - trainer - INFO - Early stop on: perplexity
379
+ 2024-04-26 03:48:36,690 - trainer - INFO - Early stop count = 0/3
380
+ 2024-04-26 03:48:36,690 - trainer - INFO - Eval steps = 200 or (iterations = 200)
381
+ 2024-04-26 03:48:36,690 - trainer - INFO - Best score (perplexity) = -4.533045291900635
382
+ 2024-04-26 03:48:36,690 - trainer - INFO - Gradient Accumulation steps = 1
383
+ 2024-04-26 03:48:36,690 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
384
+ 2024-04-26 03:48:36,690 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
385
+ 2024-04-26 03:48:36,690 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
386
+ 2024-04-26 03:48:36,690 - trainer - INFO - Epoch = 1/5
387
+ 2024-04-26 03:48:36,690 - trainer - INFO - Steps = 2800/40800
388
+ 2024-04-26 03:48:36,690 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
389
+ 2024-04-26 03:48:36,691 - trainer - INFO - dev_loss = 1.511394 || dev_eval_scores = {'perplexity': 4.533045291900635}
390
+ 2024-04-26 03:48:36,691 - trainer - INFO - train_loss = 2.781400680541992
391
+ 2024-04-26 03:48:36,691 - trainer - INFO -
392
+ ********************************************
393
+ 2024-04-26 03:54:56,573 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
394
+ 2024-04-26 03:55:02,481 - trainer - INFO - Save check-point at epoch=0 step=3000
395
+ 2024-04-26 03:55:02,482 - trainer - INFO - ***** Evaluation report *****
396
+ 2024-04-26 03:55:02,482 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
397
+ 2024-04-26 03:55:02,482 - trainer - INFO - Early stop on: perplexity
398
+ 2024-04-26 03:55:02,482 - trainer - INFO - Early stop count = 0/3
399
+ 2024-04-26 03:55:02,482 - trainer - INFO - Eval steps = 200 or (iterations = 200)
400
+ 2024-04-26 03:55:02,482 - trainer - INFO - Best score (perplexity) = -4.453883647918701
401
+ 2024-04-26 03:55:02,482 - trainer - INFO - Gradient Accumulation steps = 1
402
+ 2024-04-26 03:55:02,482 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
403
+ 2024-04-26 03:55:02,482 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
404
+ 2024-04-26 03:55:02,482 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
405
+ 2024-04-26 03:55:02,482 - trainer - INFO - Epoch = 1/5
406
+ 2024-04-26 03:55:02,482 - trainer - INFO - Steps = 3000/40800
407
+ 2024-04-26 03:55:02,482 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
408
+ 2024-04-26 03:55:02,482 - trainer - INFO - dev_loss = 1.493776 || dev_eval_scores = {'perplexity': 4.453883647918701}
409
+ 2024-04-26 03:55:02,482 - trainer - INFO - train_loss = 2.702195167541504
410
+ 2024-04-26 03:55:02,483 - trainer - INFO -
411
+ ********************************************
412
+ 2024-04-26 04:01:21,916 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
413
+ 2024-04-26 04:01:27,748 - trainer - INFO - Save check-point at epoch=0 step=3200
414
+ 2024-04-26 04:01:27,748 - trainer - INFO - ***** Evaluation report *****
415
+ 2024-04-26 04:01:27,749 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
416
+ 2024-04-26 04:01:27,749 - trainer - INFO - Early stop on: perplexity
417
+ 2024-04-26 04:01:27,749 - trainer - INFO - Early stop count = 0/3
418
+ 2024-04-26 04:01:27,749 - trainer - INFO - Eval steps = 200 or (iterations = 200)
419
+ 2024-04-26 04:01:27,749 - trainer - INFO - Best score (perplexity) = -4.359768867492676
420
+ 2024-04-26 04:01:27,749 - trainer - INFO - Gradient Accumulation steps = 1
421
+ 2024-04-26 04:01:27,749 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
422
+ 2024-04-26 04:01:27,749 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
423
+ 2024-04-26 04:01:27,749 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
424
+ 2024-04-26 04:01:27,749 - trainer - INFO - Epoch = 1/5
425
+ 2024-04-26 04:01:27,749 - trainer - INFO - Steps = 3200/40800
426
+ 2024-04-26 04:01:27,749 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
427
+ 2024-04-26 04:01:27,749 - trainer - INFO - dev_loss = 1.472419 || dev_eval_scores = {'perplexity': 4.359768867492676}
428
+ 2024-04-26 04:01:27,749 - trainer - INFO - train_loss = 2.6316800117492676
429
+ 2024-04-26 04:01:27,749 - trainer - INFO -
430
+ ********************************************
431
+ 2024-04-26 04:07:47,186 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
432
+ 2024-04-26 04:07:52,996 - trainer - INFO - Save check-point at epoch=0 step=3400
433
+ 2024-04-26 04:07:52,996 - trainer - INFO - ***** Evaluation report *****
434
+ 2024-04-26 04:07:52,996 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
435
+ 2024-04-26 04:07:52,996 - trainer - INFO - Early stop on: perplexity
436
+ 2024-04-26 04:07:52,996 - trainer - INFO - Early stop count = 0/3
437
+ 2024-04-26 04:07:52,996 - trainer - INFO - Eval steps = 200 or (iterations = 200)
438
+ 2024-04-26 04:07:52,996 - trainer - INFO - Best score (perplexity) = -4.2930779457092285
439
+ 2024-04-26 04:07:52,996 - trainer - INFO - Gradient Accumulation steps = 1
440
+ 2024-04-26 04:07:52,996 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
441
+ 2024-04-26 04:07:52,997 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
442
+ 2024-04-26 04:07:52,997 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
443
+ 2024-04-26 04:07:52,997 - trainer - INFO - Epoch = 1/5
444
+ 2024-04-26 04:07:52,997 - trainer - INFO - Steps = 3400/40800
445
+ 2024-04-26 04:07:52,997 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
446
+ 2024-04-26 04:07:52,997 - trainer - INFO - dev_loss = 1.457004 || dev_eval_scores = {'perplexity': 4.2930779457092285}
447
+ 2024-04-26 04:07:52,997 - trainer - INFO - train_loss = 2.5693111419677734
448
+ 2024-04-26 04:07:52,997 - trainer - INFO -
449
+ ********************************************
450
+ 2024-04-26 04:14:12,633 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
451
+ 2024-04-26 04:14:18,310 - trainer - INFO - Save check-point at epoch=0 step=3600
452
+ 2024-04-26 04:14:18,310 - trainer - INFO - ***** Evaluation report *****
453
+ 2024-04-26 04:14:18,310 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
454
+ 2024-04-26 04:14:18,310 - trainer - INFO - Early stop on: perplexity
455
+ 2024-04-26 04:14:18,310 - trainer - INFO - Early stop count = 0/3
456
+ 2024-04-26 04:14:18,310 - trainer - INFO - Eval steps = 200 or (iterations = 200)
457
+ 2024-04-26 04:14:18,310 - trainer - INFO - Best score (perplexity) = -4.221639633178711
458
+ 2024-04-26 04:14:18,310 - trainer - INFO - Gradient Accumulation steps = 1
459
+ 2024-04-26 04:14:18,310 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
460
+ 2024-04-26 04:14:18,310 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
461
+ 2024-04-26 04:14:18,311 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
462
+ 2024-04-26 04:14:18,311 - trainer - INFO - Epoch = 1/5
463
+ 2024-04-26 04:14:18,311 - trainer - INFO - Steps = 3600/40800
464
+ 2024-04-26 04:14:18,311 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
465
+ 2024-04-26 04:14:18,311 - trainer - INFO - dev_loss = 1.440224 || dev_eval_scores = {'perplexity': 4.221639633178711}
466
+ 2024-04-26 04:14:18,311 - trainer - INFO - train_loss = 2.5129594802856445
467
+ 2024-04-26 04:14:18,311 - trainer - INFO -
468
+ ********************************************
469
+ 2024-04-26 04:20:38,684 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
470
+ 2024-04-26 04:20:44,511 - trainer - INFO - Save check-point at epoch=0 step=3800
471
+ 2024-04-26 04:20:44,512 - trainer - INFO - ***** Evaluation report *****
472
+ 2024-04-26 04:20:44,512 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
473
+ 2024-04-26 04:20:44,512 - trainer - INFO - Early stop on: perplexity
474
+ 2024-04-26 04:20:44,512 - trainer - INFO - Early stop count = 0/3
475
+ 2024-04-26 04:20:44,512 - trainer - INFO - Eval steps = 200 or (iterations = 200)
476
+ 2024-04-26 04:20:44,512 - trainer - INFO - Best score (perplexity) = -4.147531986236572
477
+ 2024-04-26 04:20:44,512 - trainer - INFO - Gradient Accumulation steps = 1
478
+ 2024-04-26 04:20:44,512 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
479
+ 2024-04-26 04:20:44,512 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
480
+ 2024-04-26 04:20:44,512 - trainer - INFO - Time spent since last evaluation = 0h 6m 26s
481
+ 2024-04-26 04:20:44,512 - trainer - INFO - Epoch = 1/5
482
+ 2024-04-26 04:20:44,512 - trainer - INFO - Steps = 3800/40800
483
+ 2024-04-26 04:20:44,512 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
484
+ 2024-04-26 04:20:44,513 - trainer - INFO - dev_loss = 1.422513 || dev_eval_scores = {'perplexity': 4.147531986236572}
485
+ 2024-04-26 04:20:44,513 - trainer - INFO - train_loss = 2.460688829421997
486
+ 2024-04-26 04:20:44,513 - trainer - INFO -
487
+ ********************************************
488
+ 2024-04-26 04:27:04,526 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
489
+ 2024-04-26 04:27:10,385 - trainer - INFO - Save check-point at epoch=0 step=4000
490
+ 2024-04-26 04:27:10,386 - trainer - INFO - ***** Evaluation report *****
491
+ 2024-04-26 04:27:10,386 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
492
+ 2024-04-26 04:27:10,386 - trainer - INFO - Early stop on: perplexity
493
+ 2024-04-26 04:27:10,386 - trainer - INFO - Early stop count = 0/3
494
+ 2024-04-26 04:27:10,386 - trainer - INFO - Eval steps = 200 or (iterations = 200)
495
+ 2024-04-26 04:27:10,386 - trainer - INFO - Best score (perplexity) = -4.087435722351074
496
+ 2024-04-26 04:27:10,386 - trainer - INFO - Gradient Accumulation steps = 1
497
+ 2024-04-26 04:27:10,386 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
498
+ 2024-04-26 04:27:10,386 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
499
+ 2024-04-26 04:27:10,386 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
500
+ 2024-04-26 04:27:10,386 - trainer - INFO - Epoch = 1/5
501
+ 2024-04-26 04:27:10,386 - trainer - INFO - Steps = 4000/40800
502
+ 2024-04-26 04:27:10,386 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
503
+ 2024-04-26 04:27:10,386 - trainer - INFO - dev_loss = 1.407918 || dev_eval_scores = {'perplexity': 4.087435722351074}
504
+ 2024-04-26 04:27:10,387 - trainer - INFO - train_loss = 2.4136698246002197
505
+ 2024-04-26 04:27:10,387 - trainer - INFO -
506
+ ********************************************
507
+ 2024-04-26 04:33:30,165 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
508
+ 2024-04-26 04:33:36,001 - trainer - INFO - Save check-point at epoch=0 step=4200
509
+ 2024-04-26 04:33:36,001 - trainer - INFO - ***** Evaluation report *****
510
+ 2024-04-26 04:33:36,001 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
511
+ 2024-04-26 04:33:36,001 - trainer - INFO - Early stop on: perplexity
512
+ 2024-04-26 04:33:36,001 - trainer - INFO - Early stop count = 0/3
513
+ 2024-04-26 04:33:36,001 - trainer - INFO - Eval steps = 200 or (iterations = 200)
514
+ 2024-04-26 04:33:36,001 - trainer - INFO - Best score (perplexity) = -4.028451442718506
515
+ 2024-04-26 04:33:36,002 - trainer - INFO - Gradient Accumulation steps = 1
516
+ 2024-04-26 04:33:36,002 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
517
+ 2024-04-26 04:33:36,002 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
518
+ 2024-04-26 04:33:36,002 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
519
+ 2024-04-26 04:33:36,002 - trainer - INFO - Epoch = 1/5
520
+ 2024-04-26 04:33:36,002 - trainer - INFO - Steps = 4200/40800
521
+ 2024-04-26 04:33:36,002 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
522
+ 2024-04-26 04:33:36,002 - trainer - INFO - dev_loss = 1.393382 || dev_eval_scores = {'perplexity': 4.028451442718506}
523
+ 2024-04-26 04:33:36,002 - trainer - INFO - train_loss = 2.3706307411193848
524
+ 2024-04-26 04:33:36,002 - trainer - INFO -
525
+ ********************************************
526
+ 2024-04-26 04:39:55,706 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
527
+ 2024-04-26 04:40:01,545 - trainer - INFO - Save check-point at epoch=0 step=4400
528
+ 2024-04-26 04:40:01,545 - trainer - INFO - ***** Evaluation report *****
529
+ 2024-04-26 04:40:01,545 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
530
+ 2024-04-26 04:40:01,545 - trainer - INFO - Early stop on: perplexity
531
+ 2024-04-26 04:40:01,545 - trainer - INFO - Early stop count = 0/3
532
+ 2024-04-26 04:40:01,545 - trainer - INFO - Eval steps = 200 or (iterations = 200)
533
+ 2024-04-26 04:40:01,545 - trainer - INFO - Best score (perplexity) = -3.976846694946289
534
+ 2024-04-26 04:40:01,546 - trainer - INFO - Gradient Accumulation steps = 1
535
+ 2024-04-26 04:40:01,546 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
536
+ 2024-04-26 04:40:01,546 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
537
+ 2024-04-26 04:40:01,546 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
538
+ 2024-04-26 04:40:01,546 - trainer - INFO - Epoch = 1/5
539
+ 2024-04-26 04:40:01,546 - trainer - INFO - Steps = 4400/40800
540
+ 2024-04-26 04:40:01,546 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
541
+ 2024-04-26 04:40:01,546 - trainer - INFO - dev_loss = 1.380489 || dev_eval_scores = {'perplexity': 3.976846694946289}
542
+ 2024-04-26 04:40:01,546 - trainer - INFO - train_loss = 2.330047369003296
543
+ 2024-04-26 04:40:01,546 - trainer - INFO -
544
+ ********************************************
545
+ 2024-04-26 04:46:21,905 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
546
+ 2024-04-26 04:46:27,763 - trainer - INFO - Save check-point at epoch=0 step=4600
547
+ 2024-04-26 04:46:27,764 - trainer - INFO - ***** Evaluation report *****
548
+ 2024-04-26 04:46:27,764 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
549
+ 2024-04-26 04:46:27,764 - trainer - INFO - Early stop on: perplexity
550
+ 2024-04-26 04:46:27,764 - trainer - INFO - Early stop count = 0/3
551
+ 2024-04-26 04:46:27,764 - trainer - INFO - Eval steps = 200 or (iterations = 200)
552
+ 2024-04-26 04:46:27,764 - trainer - INFO - Best score (perplexity) = -3.920635461807251
553
+ 2024-04-26 04:46:27,764 - trainer - INFO - Gradient Accumulation steps = 1
554
+ 2024-04-26 04:46:27,764 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
555
+ 2024-04-26 04:46:27,764 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
556
+ 2024-04-26 04:46:27,764 - trainer - INFO - Time spent since last evaluation = 0h 6m 26s
557
+ 2024-04-26 04:46:27,764 - trainer - INFO - Epoch = 1/5
558
+ 2024-04-26 04:46:27,764 - trainer - INFO - Steps = 4600/40800
559
+ 2024-04-26 04:46:27,764 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
560
+ 2024-04-26 04:46:27,764 - trainer - INFO - dev_loss = 1.366254 || dev_eval_scores = {'perplexity': 3.920635461807251}
561
+ 2024-04-26 04:46:27,765 - trainer - INFO - train_loss = 2.2929983139038086
562
+ 2024-04-26 04:46:27,765 - trainer - INFO -
563
+ ********************************************
564
+ 2024-04-26 04:52:47,547 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
565
+ 2024-04-26 04:52:53,122 - trainer - INFO - Save check-point at epoch=0 step=4800
566
+ 2024-04-26 04:52:53,122 - trainer - INFO - ***** Evaluation report *****
567
+ 2024-04-26 04:52:53,122 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
568
+ 2024-04-26 04:52:53,122 - trainer - INFO - Early stop on: perplexity
569
+ 2024-04-26 04:52:53,122 - trainer - INFO - Early stop count = 0/3
570
+ 2024-04-26 04:52:53,122 - trainer - INFO - Eval steps = 200 or (iterations = 200)
571
+ 2024-04-26 04:52:53,122 - trainer - INFO - Best score (perplexity) = -3.866814613342285
572
+ 2024-04-26 04:52:53,122 - trainer - INFO - Gradient Accumulation steps = 1
573
+ 2024-04-26 04:52:53,122 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
574
+ 2024-04-26 04:52:53,122 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
575
+ 2024-04-26 04:52:53,123 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
576
+ 2024-04-26 04:52:53,123 - trainer - INFO - Epoch = 1/5
577
+ 2024-04-26 04:52:53,123 - trainer - INFO - Steps = 4800/40800
578
+ 2024-04-26 04:52:53,123 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
579
+ 2024-04-26 04:52:53,123 - trainer - INFO - dev_loss = 1.352431 || dev_eval_scores = {'perplexity': 3.866814613342285}
580
+ 2024-04-26 04:52:53,123 - trainer - INFO - train_loss = 2.2579383850097656
581
+ 2024-04-26 04:52:53,123 - trainer - INFO -
582
+ ********************************************
583
+ 2024-04-26 04:59:12,707 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
584
+ 2024-04-26 04:59:18,585 - trainer - INFO - Save check-point at epoch=0 step=5000
585
+ 2024-04-26 04:59:18,586 - trainer - INFO - ***** Evaluation report *****
586
+ 2024-04-26 04:59:18,586 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
587
+ 2024-04-26 04:59:18,586 - trainer - INFO - Early stop on: perplexity
588
+ 2024-04-26 04:59:18,586 - trainer - INFO - Early stop count = 0/3
589
+ 2024-04-26 04:59:18,586 - trainer - INFO - Eval steps = 200 or (iterations = 200)
590
+ 2024-04-26 04:59:18,586 - trainer - INFO - Best score (perplexity) = -3.827284574508667
591
+ 2024-04-26 04:59:18,586 - trainer - INFO - Gradient Accumulation steps = 1
592
+ 2024-04-26 04:59:18,586 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
593
+ 2024-04-26 04:59:18,586 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
594
+ 2024-04-26 04:59:18,586 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
595
+ 2024-04-26 04:59:18,586 - trainer - INFO - Epoch = 1/5
596
+ 2024-04-26 04:59:18,586 - trainer - INFO - Steps = 5000/40800
597
+ 2024-04-26 04:59:18,586 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
598
+ 2024-04-26 04:59:18,586 - trainer - INFO - dev_loss = 1.342156 || dev_eval_scores = {'perplexity': 3.827284574508667}
599
+ 2024-04-26 04:59:18,587 - trainer - INFO - train_loss = 2.225395679473877
600
+ 2024-04-26 04:59:18,587 - trainer - INFO -
601
+ ********************************************
602
+ 2024-04-26 05:05:39,819 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
603
+ 2024-04-26 05:05:45,329 - trainer - INFO - Save check-point at epoch=0 step=5200
604
+ 2024-04-26 05:05:45,330 - trainer - INFO - ***** Evaluation report *****
605
+ 2024-04-26 05:05:45,330 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
606
+ 2024-04-26 05:05:45,330 - trainer - INFO - Early stop on: perplexity
607
+ 2024-04-26 05:05:45,330 - trainer - INFO - Early stop count = 0/3
608
+ 2024-04-26 05:05:45,330 - trainer - INFO - Eval steps = 200 or (iterations = 200)
609
+ 2024-04-26 05:05:45,330 - trainer - INFO - Best score (perplexity) = -3.7697432041168213
610
+ 2024-04-26 05:05:45,330 - trainer - INFO - Gradient Accumulation steps = 1
611
+ 2024-04-26 05:05:45,330 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
612
+ 2024-04-26 05:05:45,330 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
613
+ 2024-04-26 05:05:45,330 - trainer - INFO - Time spent since last evaluation = 0h 6m 26s
614
+ 2024-04-26 05:05:45,330 - trainer - INFO - Epoch = 1/5
615
+ 2024-04-26 05:05:45,330 - trainer - INFO - Steps = 5200/40800
616
+ 2024-04-26 05:05:45,330 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
617
+ 2024-04-26 05:05:45,330 - trainer - INFO - dev_loss = 1.327007 || dev_eval_scores = {'perplexity': 3.7697432041168213}
618
+ 2024-04-26 05:05:45,331 - trainer - INFO - train_loss = 2.194683790206909
619
+ 2024-04-26 05:05:45,331 - trainer - INFO -
620
+ ********************************************
621
+ 2024-04-26 05:12:05,019 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
622
+ 2024-04-26 05:12:10,879 - trainer - INFO - Save check-point at epoch=0 step=5400
623
+ 2024-04-26 05:12:10,879 - trainer - INFO - ***** Evaluation report *****
624
+ 2024-04-26 05:12:10,879 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
625
+ 2024-04-26 05:12:10,879 - trainer - INFO - Early stop on: perplexity
626
+ 2024-04-26 05:12:10,879 - trainer - INFO - Early stop count = 0/3
627
+ 2024-04-26 05:12:10,880 - trainer - INFO - Eval steps = 200 or (iterations = 200)
628
+ 2024-04-26 05:12:10,880 - trainer - INFO - Best score (perplexity) = -3.732077121734619
629
+ 2024-04-26 05:12:10,880 - trainer - INFO - Gradient Accumulation steps = 1
630
+ 2024-04-26 05:12:10,880 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
631
+ 2024-04-26 05:12:10,880 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
632
+ 2024-04-26 05:12:10,880 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
633
+ 2024-04-26 05:12:10,880 - trainer - INFO - Epoch = 1/5
634
+ 2024-04-26 05:12:10,880 - trainer - INFO - Steps = 5400/40800
635
+ 2024-04-26 05:12:10,880 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
636
+ 2024-04-26 05:12:10,880 - trainer - INFO - dev_loss = 1.316965 || dev_eval_scores = {'perplexity': 3.732077121734619}
637
+ 2024-04-26 05:12:10,880 - trainer - INFO - train_loss = 2.16521954536438
638
+ 2024-04-26 05:12:10,880 - trainer - INFO -
639
+ ********************************************
640
+ 2024-04-26 05:18:31,741 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
641
+ 2024-04-26 05:18:37,604 - trainer - INFO - Save check-point at epoch=0 step=5600
642
+ 2024-04-26 05:18:37,605 - trainer - INFO - ***** Evaluation report *****
643
+ 2024-04-26 05:18:37,605 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
644
+ 2024-04-26 05:18:37,605 - trainer - INFO - Early stop on: perplexity
645
+ 2024-04-26 05:18:37,605 - trainer - INFO - Early stop count = 0/3
646
+ 2024-04-26 05:18:37,605 - trainer - INFO - Eval steps = 200 or (iterations = 200)
647
+ 2024-04-26 05:18:37,605 - trainer - INFO - Best score (perplexity) = -3.6822173595428467
648
+ 2024-04-26 05:18:37,605 - trainer - INFO - Gradient Accumulation steps = 1
649
+ 2024-04-26 05:18:37,605 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
650
+ 2024-04-26 05:18:37,605 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
651
+ 2024-04-26 05:18:37,605 - trainer - INFO - Time spent since last evaluation = 0h 6m 26s
652
+ 2024-04-26 05:18:37,605 - trainer - INFO - Epoch = 1/5
653
+ 2024-04-26 05:18:37,605 - trainer - INFO - Steps = 5600/40800
654
+ 2024-04-26 05:18:37,605 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
655
+ 2024-04-26 05:18:37,605 - trainer - INFO - dev_loss = 1.303515 || dev_eval_scores = {'perplexity': 3.6822173595428467}
656
+ 2024-04-26 05:18:37,606 - trainer - INFO - train_loss = 2.1381325721740723
657
+ 2024-04-26 05:18:37,606 - trainer - INFO -
658
+ ********************************************
659
+ 2024-04-26 05:24:57,533 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
660
+ 2024-04-26 05:25:03,410 - trainer - INFO - Save check-point at epoch=0 step=5800
661
+ 2024-04-26 05:25:03,411 - trainer - INFO - ***** Evaluation report *****
662
+ 2024-04-26 05:25:03,411 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
663
+ 2024-04-26 05:25:03,411 - trainer - INFO - Early stop on: perplexity
664
+ 2024-04-26 05:25:03,411 - trainer - INFO - Early stop count = 0/3
665
+ 2024-04-26 05:25:03,411 - trainer - INFO - Eval steps = 200 or (iterations = 200)
666
+ 2024-04-26 05:25:03,411 - trainer - INFO - Best score (perplexity) = -3.641592264175415
667
+ 2024-04-26 05:25:03,411 - trainer - INFO - Gradient Accumulation steps = 1
668
+ 2024-04-26 05:25:03,411 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
669
+ 2024-04-26 05:25:03,411 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
670
+ 2024-04-26 05:25:03,411 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
671
+ 2024-04-26 05:25:03,411 - trainer - INFO - Epoch = 1/5
672
+ 2024-04-26 05:25:03,411 - trainer - INFO - Steps = 5800/40800
673
+ 2024-04-26 05:25:03,411 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
674
+ 2024-04-26 05:25:03,411 - trainer - INFO - dev_loss = 1.292421 || dev_eval_scores = {'perplexity': 3.641592264175415}
675
+ 2024-04-26 05:25:03,412 - trainer - INFO - train_loss = 2.113192319869995
676
+ 2024-04-26 05:25:03,412 - trainer - INFO -
677
+ ********************************************
678
+ 2024-04-26 05:31:23,054 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
679
+ 2024-04-26 05:31:28,936 - trainer - INFO - Save check-point at epoch=0 step=6000
680
+ 2024-04-26 05:31:28,937 - trainer - INFO - ***** Evaluation report *****
681
+ 2024-04-26 05:31:28,937 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
682
+ 2024-04-26 05:31:28,937 - trainer - INFO - Early stop on: perplexity
683
+ 2024-04-26 05:31:28,937 - trainer - INFO - Early stop count = 0/3
684
+ 2024-04-26 05:31:28,937 - trainer - INFO - Eval steps = 200 or (iterations = 200)
685
+ 2024-04-26 05:31:28,937 - trainer - INFO - Best score (perplexity) = -3.602872133255005
686
+ 2024-04-26 05:31:28,937 - trainer - INFO - Gradient Accumulation steps = 1
687
+ 2024-04-26 05:31:28,937 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
688
+ 2024-04-26 05:31:28,937 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
689
+ 2024-04-26 05:31:28,937 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
690
+ 2024-04-26 05:31:28,937 - trainer - INFO - Epoch = 1/5
691
+ 2024-04-26 05:31:28,937 - trainer - INFO - Steps = 6000/40800
692
+ 2024-04-26 05:31:28,937 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
693
+ 2024-04-26 05:31:28,938 - trainer - INFO - dev_loss = 1.281731 || dev_eval_scores = {'perplexity': 3.602872133255005}
694
+ 2024-04-26 05:31:28,938 - trainer - INFO - train_loss = 2.0891873836517334
695
+ 2024-04-26 05:31:28,938 - trainer - INFO -
696
+ ********************************************
697
+ 2024-04-26 05:37:49,590 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
698
+ 2024-04-26 05:37:55,449 - trainer - INFO - Save check-point at epoch=0 step=6200
699
+ 2024-04-26 05:37:55,449 - trainer - INFO - ***** Evaluation report *****
700
+ 2024-04-26 05:37:55,450 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
701
+ 2024-04-26 05:37:55,450 - trainer - INFO - Early stop on: perplexity
702
+ 2024-04-26 05:37:55,450 - trainer - INFO - Early stop count = 0/3
703
+ 2024-04-26 05:37:55,450 - trainer - INFO - Eval steps = 200 or (iterations = 200)
704
+ 2024-04-26 05:37:55,450 - trainer - INFO - Best score (perplexity) = -3.5650696754455566
705
+ 2024-04-26 05:37:55,450 - trainer - INFO - Gradient Accumulation steps = 1
706
+ 2024-04-26 05:37:55,450 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
707
+ 2024-04-26 05:37:55,450 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
708
+ 2024-04-26 05:37:55,450 - trainer - INFO - Time spent since last evaluation = 0h 6m 26s
709
+ 2024-04-26 05:37:55,450 - trainer - INFO - Epoch = 1/5
710
+ 2024-04-26 05:37:55,450 - trainer - INFO - Steps = 6200/40800
711
+ 2024-04-26 05:37:55,450 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
712
+ 2024-04-26 05:37:55,450 - trainer - INFO - dev_loss = 1.271184 || dev_eval_scores = {'perplexity': 3.5650696754455566}
713
+ 2024-04-26 05:37:55,450 - trainer - INFO - train_loss = 2.066126585006714
714
+ 2024-04-26 05:37:55,451 - trainer - INFO -
715
+ ********************************************
716
+ 2024-04-26 05:44:15,283 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
717
+ 2024-04-26 05:44:21,132 - trainer - INFO - Save check-point at epoch=0 step=6400
718
+ 2024-04-26 05:44:21,133 - trainer - INFO - ***** Evaluation report *****
719
+ 2024-04-26 05:44:21,133 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
720
+ 2024-04-26 05:44:21,133 - trainer - INFO - Early stop on: perplexity
721
+ 2024-04-26 05:44:21,133 - trainer - INFO - Early stop count = 0/3
722
+ 2024-04-26 05:44:21,133 - trainer - INFO - Eval steps = 200 or (iterations = 200)
723
+ 2024-04-26 05:44:21,133 - trainer - INFO - Best score (perplexity) = -3.517021894454956
724
+ 2024-04-26 05:44:21,133 - trainer - INFO - Gradient Accumulation steps = 1
725
+ 2024-04-26 05:44:21,133 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
726
+ 2024-04-26 05:44:21,133 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
727
+ 2024-04-26 05:44:21,133 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
728
+ 2024-04-26 05:44:21,133 - trainer - INFO - Epoch = 1/5
729
+ 2024-04-26 05:44:21,133 - trainer - INFO - Steps = 6400/40800
730
+ 2024-04-26 05:44:21,133 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
731
+ 2024-04-26 05:44:21,133 - trainer - INFO - dev_loss = 1.257615 || dev_eval_scores = {'perplexity': 3.517021894454956}
732
+ 2024-04-26 05:44:21,134 - trainer - INFO - train_loss = 2.0438156127929688
733
+ 2024-04-26 05:44:21,134 - trainer - INFO -
734
+ ********************************************
735
+ 2024-04-26 05:50:41,206 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
736
+ 2024-04-26 05:50:47,135 - trainer - INFO - Save check-point at epoch=0 step=6600
737
+ 2024-04-26 05:50:47,135 - trainer - INFO - ***** Evaluation report *****
738
+ 2024-04-26 05:50:47,135 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
739
+ 2024-04-26 05:50:47,135 - trainer - INFO - Early stop on: perplexity
740
+ 2024-04-26 05:50:47,135 - trainer - INFO - Early stop count = 0/3
741
+ 2024-04-26 05:50:47,135 - trainer - INFO - Eval steps = 200 or (iterations = 200)
742
+ 2024-04-26 05:50:47,135 - trainer - INFO - Best score (perplexity) = -3.4847798347473145
743
+ 2024-04-26 05:50:47,135 - trainer - INFO - Gradient Accumulation steps = 1
744
+ 2024-04-26 05:50:47,135 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
745
+ 2024-04-26 05:50:47,136 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
746
+ 2024-04-26 05:50:47,136 - trainer - INFO - Time spent since last evaluation = 0h 6m 26s
747
+ 2024-04-26 05:50:47,136 - trainer - INFO - Epoch = 1/5
748
+ 2024-04-26 05:50:47,136 - trainer - INFO - Steps = 6600/40800
749
+ 2024-04-26 05:50:47,136 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
750
+ 2024-04-26 05:50:47,136 - trainer - INFO - dev_loss = 1.248405 || dev_eval_scores = {'perplexity': 3.4847798347473145}
751
+ 2024-04-26 05:50:47,136 - trainer - INFO - train_loss = 2.022505283355713
752
+ 2024-04-26 05:50:47,136 - trainer - INFO -
753
+ ********************************************
754
+ 2024-04-26 05:57:06,888 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
755
+ 2024-04-26 05:57:12,755 - trainer - INFO - Save check-point at epoch=0 step=6800
756
+ 2024-04-26 05:57:12,755 - trainer - INFO - ***** Evaluation report *****
757
+ 2024-04-26 05:57:12,756 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
758
+ 2024-04-26 05:57:12,756 - trainer - INFO - Early stop on: perplexity
759
+ 2024-04-26 05:57:12,756 - trainer - INFO - Early stop count = 0/3
760
+ 2024-04-26 05:57:12,756 - trainer - INFO - Eval steps = 200 or (iterations = 200)
761
+ 2024-04-26 05:57:12,756 - trainer - INFO - Best score (perplexity) = -3.441448450088501
762
+ 2024-04-26 05:57:12,756 - trainer - INFO - Gradient Accumulation steps = 1
763
+ 2024-04-26 05:57:12,756 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
764
+ 2024-04-26 05:57:12,756 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
765
+ 2024-04-26 05:57:12,756 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
766
+ 2024-04-26 05:57:12,756 - trainer - INFO - Epoch = 1/5
767
+ 2024-04-26 05:57:12,756 - trainer - INFO - Steps = 6800/40800
768
+ 2024-04-26 05:57:12,756 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
769
+ 2024-04-26 05:57:12,756 - trainer - INFO - dev_loss = 1.235892 || dev_eval_scores = {'perplexity': 3.441448450088501}
770
+ 2024-04-26 05:57:12,756 - trainer - INFO - train_loss = 2.0026967525482178
771
+ 2024-04-26 05:57:12,757 - trainer - INFO -
772
+ ********************************************
773
+ 2024-04-26 06:03:32,820 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
774
+ 2024-04-26 06:03:38,713 - trainer - INFO - Save check-point at epoch=0 step=7000
775
+ 2024-04-26 06:03:38,714 - trainer - INFO - ***** Evaluation report *****
776
+ 2024-04-26 06:03:38,714 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
777
+ 2024-04-26 06:03:38,714 - trainer - INFO - Early stop on: perplexity
778
+ 2024-04-26 06:03:38,714 - trainer - INFO - Early stop count = 0/3
779
+ 2024-04-26 06:03:38,714 - trainer - INFO - Eval steps = 200 or (iterations = 200)
780
+ 2024-04-26 06:03:38,714 - trainer - INFO - Best score (perplexity) = -3.3976998329162598
781
+ 2024-04-26 06:03:38,714 - trainer - INFO - Gradient Accumulation steps = 1
782
+ 2024-04-26 06:03:38,714 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
783
+ 2024-04-26 06:03:38,714 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
784
+ 2024-04-26 06:03:38,714 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
785
+ 2024-04-26 06:03:38,714 - trainer - INFO - Epoch = 1/5
786
+ 2024-04-26 06:03:38,714 - trainer - INFO - Steps = 7000/40800
787
+ 2024-04-26 06:03:38,714 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
788
+ 2024-04-26 06:03:38,715 - trainer - INFO - dev_loss = 1.223099 || dev_eval_scores = {'perplexity': 3.3976998329162598}
789
+ 2024-04-26 06:03:38,715 - trainer - INFO - train_loss = 1.983184576034546
790
+ 2024-04-26 06:03:38,715 - trainer - INFO -
791
+ ********************************************
792
+ 2024-04-26 06:09:59,334 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
793
+ 2024-04-26 06:10:05,217 - trainer - INFO - Save check-point at epoch=0 step=7200
794
+ 2024-04-26 06:10:05,217 - trainer - INFO - ***** Evaluation report *****
795
+ 2024-04-26 06:10:05,217 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
796
+ 2024-04-26 06:10:05,217 - trainer - INFO - Early stop on: perplexity
797
+ 2024-04-26 06:10:05,217 - trainer - INFO - Early stop count = 0/3
798
+ 2024-04-26 06:10:05,217 - trainer - INFO - Eval steps = 200 or (iterations = 200)
799
+ 2024-04-26 06:10:05,217 - trainer - INFO - Best score (perplexity) = -3.3713600635528564
800
+ 2024-04-26 06:10:05,217 - trainer - INFO - Gradient Accumulation steps = 1
801
+ 2024-04-26 06:10:05,217 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
802
+ 2024-04-26 06:10:05,218 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
803
+ 2024-04-26 06:10:05,218 - trainer - INFO - Time spent since last evaluation = 0h 6m 26s
804
+ 2024-04-26 06:10:05,218 - trainer - INFO - Epoch = 1/5
805
+ 2024-04-26 06:10:05,218 - trainer - INFO - Steps = 7200/40800
806
+ 2024-04-26 06:10:05,218 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
807
+ 2024-04-26 06:10:05,218 - trainer - INFO - dev_loss = 1.215316 || dev_eval_scores = {'perplexity': 3.3713600635528564}
808
+ 2024-04-26 06:10:05,218 - trainer - INFO - train_loss = 1.9642337560653687
809
+ 2024-04-26 06:10:05,218 - trainer - INFO -
810
+ ********************************************
811
+ 2024-04-26 06:16:24,736 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
812
+ 2024-04-26 06:16:30,613 - trainer - INFO - Save check-point at epoch=0 step=7400
813
+ 2024-04-26 06:16:30,613 - trainer - INFO - ***** Evaluation report *****
814
+ 2024-04-26 06:16:30,613 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
815
+ 2024-04-26 06:16:30,613 - trainer - INFO - Early stop on: perplexity
816
+ 2024-04-26 06:16:30,613 - trainer - INFO - Early stop count = 0/3
817
+ 2024-04-26 06:16:30,613 - trainer - INFO - Eval steps = 200 or (iterations = 200)
818
+ 2024-04-26 06:16:30,613 - trainer - INFO - Best score (perplexity) = -3.334381341934204
819
+ 2024-04-26 06:16:30,613 - trainer - INFO - Gradient Accumulation steps = 1
820
+ 2024-04-26 06:16:30,613 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
821
+ 2024-04-26 06:16:30,613 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
822
+ 2024-04-26 06:16:30,613 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
823
+ 2024-04-26 06:16:30,613 - trainer - INFO - Epoch = 1/5
824
+ 2024-04-26 06:16:30,614 - trainer - INFO - Steps = 7400/40800
825
+ 2024-04-26 06:16:30,614 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
826
+ 2024-04-26 06:16:30,614 - trainer - INFO - dev_loss = 1.204287 || dev_eval_scores = {'perplexity': 3.334381341934204}
827
+ 2024-04-26 06:16:30,614 - trainer - INFO - train_loss = 1.9464004039764404
828
+ 2024-04-26 06:16:30,614 - trainer - INFO -
829
+ ********************************************
830
+ 2024-04-26 06:22:50,090 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
831
+ 2024-04-26 06:22:55,903 - trainer - INFO - Save check-point at epoch=0 step=7600
832
+ 2024-04-26 06:22:55,903 - trainer - INFO - ***** Evaluation report *****
833
+ 2024-04-26 06:22:55,903 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
834
+ 2024-04-26 06:22:55,903 - trainer - INFO - Early stop on: perplexity
835
+ 2024-04-26 06:22:55,903 - trainer - INFO - Early stop count = 0/3
836
+ 2024-04-26 06:22:55,903 - trainer - INFO - Eval steps = 200 or (iterations = 200)
837
+ 2024-04-26 06:22:55,903 - trainer - INFO - Best score (perplexity) = -3.299593448638916
838
+ 2024-04-26 06:22:55,903 - trainer - INFO - Gradient Accumulation steps = 1
839
+ 2024-04-26 06:22:55,903 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
840
+ 2024-04-26 06:22:55,903 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
841
+ 2024-04-26 06:22:55,903 - trainer - INFO - Time spent since last evaluation = 0h 6m 25s
842
+ 2024-04-26 06:22:55,903 - trainer - INFO - Epoch = 1/5
843
+ 2024-04-26 06:22:55,904 - trainer - INFO - Steps = 7600/40800
844
+ 2024-04-26 06:22:55,904 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
845
+ 2024-04-26 06:22:55,904 - trainer - INFO - dev_loss = 1.193799 || dev_eval_scores = {'perplexity': 3.299593448638916}
846
+ 2024-04-26 06:22:55,904 - trainer - INFO - train_loss = 1.9291884899139404
847
+ 2024-04-26 06:22:55,904 - trainer - INFO -
848
+ ********************************************
849
+ 2024-04-26 06:29:16,451 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
850
+ 2024-04-26 06:29:22,361 - trainer - INFO - Save check-point at epoch=0 step=7800
851
+ 2024-04-26 06:29:22,361 - trainer - INFO - ***** Evaluation report *****
852
+ 2024-04-26 06:29:22,361 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
853
+ 2024-04-26 06:29:22,361 - trainer - INFO - Early stop on: perplexity
854
+ 2024-04-26 06:29:22,361 - trainer - INFO - Early stop count = 0/3
855
+ 2024-04-26 06:29:22,362 - trainer - INFO - Eval steps = 200 or (iterations = 200)
856
+ 2024-04-26 06:29:22,362 - trainer - INFO - Best score (perplexity) = -3.2615699768066406
857
+ 2024-04-26 06:29:22,362 - trainer - INFO - Gradient Accumulation steps = 1
858
+ 2024-04-26 06:29:22,362 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
859
+ 2024-04-26 06:29:22,362 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
860
+ 2024-04-26 06:29:22,362 - trainer - INFO - Time spent since last evaluation = 0h 6m 26s
861
+ 2024-04-26 06:29:22,362 - trainer - INFO - Epoch = 1/5
862
+ 2024-04-26 06:29:22,362 - trainer - INFO - Steps = 7800/40800
863
+ 2024-04-26 06:29:22,362 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
864
+ 2024-04-26 06:29:22,362 - trainer - INFO - dev_loss = 1.182209 || dev_eval_scores = {'perplexity': 3.2615699768066406}
865
+ 2024-04-26 06:29:22,362 - trainer - INFO - train_loss = 1.9121544361114502
866
+ 2024-04-26 06:29:22,362 - trainer - INFO -
867
+ ********************************************
868
+ 2024-04-26 06:35:42,699 - trainer - INFO - Save model to tmp/model/distilgpt2_fine_tuned_coder
869
+ 2024-04-26 06:35:48,617 - trainer - INFO - Save check-point at epoch=0 step=8000
870
+ 2024-04-26 06:35:48,617 - trainer - INFO - ***** Evaluation report *****
871
+ 2024-04-26 06:35:48,618 - trainer - INFO - Output path (short): tmp/model/distilgpt2_fine_tuned_coder
872
+ 2024-04-26 06:35:48,618 - trainer - INFO - Early stop on: perplexity
873
+ 2024-04-26 06:35:48,618 - trainer - INFO - Early stop count = 0/3
874
+ 2024-04-26 06:35:48,618 - trainer - INFO - Eval steps = 200 or (iterations = 200)
875
+ 2024-04-26 06:35:48,618 - trainer - INFO - Best score (perplexity) = -3.232813835144043
876
+ 2024-04-26 06:35:48,618 - trainer - INFO - Gradient Accumulation steps = 1
877
+ 2024-04-26 06:35:48,618 - trainer - INFO - Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = 130556
878
+ 2024-04-26 06:35:48,618 - trainer - INFO - Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = 14507
879
+ 2024-04-26 06:35:48,618 - trainer - INFO - Time spent since last evaluation = 0h 6m 26s
880
+ 2024-04-26 06:35:48,618 - trainer - INFO - Epoch = 1/5
881
+ 2024-04-26 06:35:48,618 - trainer - INFO - Steps = 8000/40800
882
+ 2024-04-26 06:35:48,618 - trainer - INFO - Instantaneous batch size per GPU = 4 and n_gpu = 4 so the input batch size = 16
883
+ 2024-04-26 06:35:48,618 - trainer - INFO - dev_loss = 1.173353 || dev_eval_scores = {'perplexity': 3.232813835144043}
884
+ 2024-04-26 06:35:48,618 - trainer - INFO - train_loss = 1.8961435556411743
885
+ 2024-04-26 06:35:48,619 - trainer - INFO -
886
+ ********************************************
887
+ 2024-04-26 06:37:54,340 - trainer - INFO - epoch 1 ends, 4 epoches left
888
+ 2024-04-26 06:37:54,862 - trainer - INFO -
889
+ global_average_loss=1.8839226961135864,global_steps=8160 on training set
modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "0_GPTSingleHead",
6
+ "type": "model"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_EmptyHeads",
12
+ "type": "model"
13
+ }
14
+ ]