soughtlin committed
Commit bb8080b · verified · 1 Parent(s): 6d1eefc

Upload 9 files

transformer/GQA/best_model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:19a03700f4a6dcb8daeccacc4e6722a721dd37e345f3002e212d078141a222ac
+ size 115478760
transformer/GQA/config.yaml ADDED
@@ -0,0 +1,44 @@
+ # ------------- Tokenizer -----------------
+ tokenizer: tokenizer.NltkTokenizer  # custom tokenizer
+
+ # ------------- Model architecture (tuned for 100K data & 8 GB VRAM) ----------------
+ model:
+   type: transformer
+   enc_layers: 3        # number of Transformer encoder layers
+   dec_layers: 3        # number of Transformer decoder layers
+   emb_size: 256        # embedding / hidden dimension
+   nhead: 4             # attention heads; each head is 64-dim (256/4)
+   ffn_dim: 1024        # feed-forward hidden dimension
+   dropout: 0.05        # dropout probability
+   # new attention parameters
+   attn_type: gqa       # code path: GroupedQueryAttention
+   num_kv_heads: 2      # 2 query heads share 1 KV head (4/2 = 2)
+
+ # ------------- Training hyperparameters ----------------
+ train:
+   batch_size: 64
+   epochs: 15           # smaller batch size, more epochs
+   lr: 0.0005
+   weight_decay: 0.0001
+   lr_step: 8           # smaller batch size, more epochs, milder LR decay
+   lr_gamma: 0.5        # decay factor
+   save_dir: runs/train/Nltk_100k
+   num_workers: 0
+
+ # ------------- Data paths ----------------
+ data:
+   raw_train: data/train_100k.jsonl   # input files unchanged
+   raw_val: data/valid.jsonl
+   raw_test: data/test.jsonl
+
+   processed_dir: data/processed_nltk_100k
+   train_processed: data/processed_nltk_100k/train.jsonl
+   val_processed: data/processed_nltk_100k/val.jsonl
+   test_processed: data/processed_nltk_100k/test.jsonl
+
+   src_vocab: data/processed_nltk_100k/src_vocab.pkl
+   tgt_vocab: data/processed_nltk_100k/tgt_vocab.pkl
+   min_freq: 2
+
+ # ------------- Other --------------------
+ seed: 3407  # fixed random seed for reproducibility
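
Note on the attn_type / num_kv_heads pair above: with emb_size 256 and nhead 4, each head is 64-dimensional, so num_kv_heads: 2 gives K/V projections of width 2 × 64 = 128, exactly the out_features printed for k_proj/v_proj in the GQA log below. A minimal sketch of grouped-query attention under these settings (illustrative only; class and variable names here are not taken from this repository):

import torch
import torch.nn as nn
import torch.nn.functional as F

class GQASketch(nn.Module):
    """Grouped-query attention: nhead query heads share num_kv_heads K/V heads."""
    def __init__(self, emb_size=256, nhead=4, num_kv_heads=2, dropout=0.05):
        super().__init__()
        assert emb_size % nhead == 0 and nhead % num_kv_heads == 0
        self.nhead, self.num_kv_heads = nhead, num_kv_heads
        self.head_dim = emb_size // nhead                                             # 64
        self.q_proj = nn.Linear(emb_size, emb_size, bias=False)                       # 256 -> 256
        self.k_proj = nn.Linear(emb_size, num_kv_heads * self.head_dim, bias=False)   # 256 -> 128
        self.v_proj = nn.Linear(emb_size, num_kv_heads * self.head_dim, bias=False)   # 256 -> 128
        self.out_proj = nn.Linear(emb_size, emb_size)                                 # 256 -> 256, bias=True
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        B, T, _ = x.shape
        q = self.q_proj(x).view(B, T, self.nhead, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)
        # Duplicate each KV head for the query heads that share it (4 / 2 = 2 here).
        k = k.repeat_interleave(self.nhead // self.num_kv_heads, dim=1)
        v = v.repeat_interleave(self.nhead // self.num_kv_heads, dim=1)
        scores = q @ k.transpose(-2, -1) / self.head_dim ** 0.5
        if mask is not None:
            scores = scores.masked_fill(mask, float("-inf"))
        attn = self.dropout(F.softmax(scores, dim=-1))
        return self.out_proj((attn @ v).transpose(1, 2).reshape(B, T, -1))

x = torch.randn(2, 10, 256)
print(GQASketch()(x).shape)  # torch.Size([2, 10, 256])

With num_kv_heads equal to nhead this reduces to standard multi-head attention, and with num_kv_heads: 1 it becomes multi-query attention, which is how the MQA config below reuses the GroupedQueryAttention code path.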
transformer/GQA/log.txt ADDED
@@ -0,0 +1,94 @@
+ [Info] Logging started. Output will be saved to runs\train\Nltk_100k\transformer\20251226_1433\log.txt
+ [Info] Model Architecture: TRANSFORMER
+ Seq2SeqTransformer(
+   (encoder): Encoder(
+     (embed): Embedding(34122, 256, padding_idx=0)
+     (pe): PositionalEncoding()
+     (layers): ModuleList(
+       (0-2): 3 x EncoderLayer(
+         (self_attn): GroupedQueryAttention(
+           (q_proj): Linear(in_features=256, out_features=256, bias=False)
+           (k_proj): Linear(in_features=256, out_features=128, bias=False)
+           (v_proj): Linear(in_features=256, out_features=128, bias=False)
+           (out_proj): Linear(in_features=256, out_features=256, bias=True)
+           (dropout): Dropout(p=0.05, inplace=False)
+         )
+         (ffn): FeedForward(
+           (linear1): Linear(in_features=256, out_features=1024, bias=True)
+           (activation): ReLU()
+           (dropout): Dropout(p=0.05, inplace=False)
+           (linear2): Linear(in_features=1024, out_features=256, bias=True)
+         )
+         (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+         (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+         (dropout): Dropout(p=0.05, inplace=False)
+       )
+     )
+     (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+   )
+   (decoder): Decoder(
+     (embed): Embedding(28647, 256, padding_idx=0)
+     (pe): PositionalEncoding()
+     (layers): ModuleList(
+       (0-2): 3 x DecoderLayer(
+         (self_attn): GroupedQueryAttention(
+           (q_proj): Linear(in_features=256, out_features=256, bias=False)
+           (k_proj): Linear(in_features=256, out_features=128, bias=False)
+           (v_proj): Linear(in_features=256, out_features=128, bias=False)
+           (out_proj): Linear(in_features=256, out_features=256, bias=True)
+           (dropout): Dropout(p=0.05, inplace=False)
+         )
+         (cross_attn): MultiHeadAttention(
+           (q_proj): Linear(in_features=256, out_features=256, bias=False)
+           (k_proj): Linear(in_features=256, out_features=256, bias=False)
+           (v_proj): Linear(in_features=256, out_features=256, bias=False)
+           (out_proj): Linear(in_features=256, out_features=256, bias=True)
+           (dropout): Dropout(p=0.05, inplace=False)
+         )
+         (ffn): FeedForward(
+           (linear1): Linear(in_features=256, out_features=1024, bias=True)
+           (activation): ReLU()
+           (dropout): Dropout(p=0.05, inplace=False)
+           (linear2): Linear(in_features=1024, out_features=256, bias=True)
+         )
+         (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+         (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+         (norm3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+         (dropout): Dropout(p=0.05, inplace=False)
+       )
+     )
+     (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+   )
+   (proj): Linear(in_features=256, out_features=28647, bias=False)
+ )
+ [Info] Total Parameters: 28,532,992
+ [Info] Trainable Parameters: 28,532,992
+ [Info] Model Size (approx): 108.84 MB
+ ===== Epoch 1/10 =====
+ Epoch 01 | Time: 2m28s | train loss: 6.1915 | val loss: 6.0763
+ New best model saved to runs\train\Nltk_100k\transformer\20251226_1433\best_model.pt, best epoch: 1
+ ===== Epoch 2/10 =====
+ Epoch 02 | Time: 2m24s | train loss: 5.3152 | val loss: 5.6670
+ New best model saved to runs\train\Nltk_100k\transformer\20251226_1433\best_model.pt, best epoch: 2
+ ===== Epoch 3/10 =====
+ Epoch 03 | Time: 2m26s | train loss: 4.7037 | val loss: 5.3431
+ New best model saved to runs\train\Nltk_100k\transformer\20251226_1433\best_model.pt, best epoch: 3
+ ===== Epoch 4/10 =====
+ Epoch 04 | Time: 2m25s | train loss: 4.2414 | val loss: 5.1238
+ New best model saved to runs\train\Nltk_100k\transformer\20251226_1433\best_model.pt, best epoch: 4
+ ===== Epoch 5/10 =====
+ Epoch 05 | Time: 2m26s | train loss: 3.9414 | val loss: 5.0703
+ New best model saved to runs\train\Nltk_100k\transformer\20251226_1433\best_model.pt, best epoch: 5
+ ===== Epoch 6/10 =====
+ Epoch 06 | Time: 2m26s | train loss: 3.7310 | val loss: 5.0379
+ New best model saved to runs\train\Nltk_100k\transformer\20251226_1433\best_model.pt, best epoch: 6
+ ===== Epoch 7/10 =====
+ Epoch 07 | Time: 2m26s | train loss: 3.5739 | val loss: 5.0417
+ ===== Epoch 8/10 =====
+ Epoch 08 | Time: 2m26s | train loss: 3.4508 | val loss: 5.0557
+ ===== Epoch 9/10 =====
+ Epoch 09 | Time: 2m25s | train loss: 3.1906 | val loss: 5.0585
+ ===== Epoch 10/10 =====
+ Epoch 10 | Time: 2m26s | train loss: 3.0900 | val loss: 5.1495
+
+ Training finished! Total Epochs: 10 Total Time: 24m22s
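
Sanity check on the logged size: 28,532,992 float32 parameters × 4 bytes ≈ 108.84 MB (MiB), which is exactly what "Model Size (approx)" reports. The best_model.pt pointer above lists 115,478,760 bytes; the difference is presumably checkpoint framing and any extra state saved alongside the raw weights. A one-liner check:

params = 28_532_992            # [Info] Total Parameters from the log above
print(params * 4 / 1024**2)    # ~108.84 (fp32 = 4 bytes per parameter)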
transformer/MHA/best_model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:93644665a99dbe03250abee7b3798d4108f8f483479abcceae91e6030b8b784b
+ size 117051688
transformer/MHA/config_nltk_transformer_100k.yaml ADDED
@@ -0,0 +1,44 @@
+ # ------------- Tokenizer -----------------
+ tokenizer: tokenizer.NltkTokenizer  # custom tokenizer
+
+ # ------------- Model architecture (tuned for 100K data & 8 GB VRAM) ----------------
+ model:
+   type: transformer
+   enc_layers: 3        # number of Transformer encoder layers
+   dec_layers: 3        # number of Transformer decoder layers
+   emb_size: 256        # embedding / hidden dimension
+   nhead: 4             # attention heads; each head is 64-dim (256/4)
+   ffn_dim: 1024        # feed-forward hidden dimension
+   dropout: 0.05        # best
+   # new attention parameters
+   attn_type: mha       # options: 'mha' (default), 'mqa', 'gqa', 'sparse'
+   num_kv_heads: 4      # used by MQA/GQA; equals nhead (4) for MHA, 1 for MQA, 2 for GQA
+
+ # ------------- Training hyperparameters ----------------
+ train:
+   batch_size: 64
+   epochs: 10
+   lr: 0.0005
+   weight_decay: 0.0001
+   lr_step: 8           # smaller batch size, more epochs, milder LR decay
+   lr_gamma: 0.5        # decay factor
+   save_dir: runs/train/Nltk_100k
+   num_workers: 0
+
+ # ------------- Data paths ----------------
+ data:
+   raw_train: data/train_100k.jsonl   # input files unchanged
+   raw_val: data/valid.jsonl
+   raw_test: data/test.jsonl
+
+   processed_dir: data/processed_nltk_100k
+   train_processed: data/processed_nltk_100k/train.jsonl
+   val_processed: data/processed_nltk_100k/val.jsonl
+   test_processed: data/processed_nltk_100k/test.jsonl
+
+   src_vocab: data/processed_nltk_100k/src_vocab.pkl
+   tgt_vocab: data/processed_nltk_100k/tgt_vocab.pkl
+   min_freq: 2
+
+ # ------------- Other --------------------
+ seed: 3407  # fixed random seed for reproducibility
transformer/MHA/log.txt ADDED
@@ -0,0 +1,95 @@
+ [Info] Logging started. Output will be saved to runs\train\Nltk_100k\transformer\20251227_1329\log.txt
+ [Info] Model Architecture: TRANSFORMER
+ Seq2SeqTransformer(
+   (encoder): Encoder(
+     (embed): Embedding(34122, 256, padding_idx=0)
+     (pe): PositionalEncoding()
+     (layers): ModuleList(
+       (0-2): 3 x EncoderLayer(
+         (self_attn): MultiHeadAttention(
+           (q_proj): Linear(in_features=256, out_features=256, bias=False)
+           (k_proj): Linear(in_features=256, out_features=256, bias=False)
+           (v_proj): Linear(in_features=256, out_features=256, bias=False)
+           (out_proj): Linear(in_features=256, out_features=256, bias=True)
+           (dropout): Dropout(p=0.05, inplace=False)
+         )
+         (ffn): FeedForward(
+           (linear1): Linear(in_features=256, out_features=1024, bias=True)
+           (activation): ReLU()
+           (dropout): Dropout(p=0.05, inplace=False)
+           (linear2): Linear(in_features=1024, out_features=256, bias=True)
+         )
+         (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+         (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+         (dropout): Dropout(p=0.05, inplace=False)
+       )
+     )
+     (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+   )
+   (decoder): Decoder(
+     (embed): Embedding(28647, 256, padding_idx=0)
+     (pe): PositionalEncoding()
+     (layers): ModuleList(
+       (0-2): 3 x DecoderLayer(
+         (self_attn): MultiHeadAttention(
+           (q_proj): Linear(in_features=256, out_features=256, bias=False)
+           (k_proj): Linear(in_features=256, out_features=256, bias=False)
+           (v_proj): Linear(in_features=256, out_features=256, bias=False)
+           (out_proj): Linear(in_features=256, out_features=256, bias=True)
+           (dropout): Dropout(p=0.05, inplace=False)
+         )
+         (cross_attn): MultiHeadAttention(
+           (q_proj): Linear(in_features=256, out_features=256, bias=False)
+           (k_proj): Linear(in_features=256, out_features=256, bias=False)
+           (v_proj): Linear(in_features=256, out_features=256, bias=False)
+           (out_proj): Linear(in_features=256, out_features=256, bias=True)
+           (dropout): Dropout(p=0.05, inplace=False)
+         )
+         (ffn): FeedForward(
+           (linear1): Linear(in_features=256, out_features=1024, bias=True)
+           (activation): ReLU()
+           (dropout): Dropout(p=0.05, inplace=False)
+           (linear2): Linear(in_features=1024, out_features=256, bias=True)
+         )
+         (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+         (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+         (norm3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+         (dropout): Dropout(p=0.05, inplace=False)
+       )
+     )
+     (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+   )
+   (proj): Linear(in_features=256, out_features=28647, bias=False)
+ )
+ [Info] Total Parameters: 28,926,208
+ [Info] Trainable Parameters: 28,926,208
+ [Info] Model Size (approx): 110.34 MB
+ ===== Epoch 1/10 =====
+ Epoch 01 | Time: 2m21s | train loss: 6.1264 | val loss: 6.0301
+ New best model saved to runs\train\Nltk_100k\transformer\20251227_1329\best_model.pt, best epoch: 1
+ ===== Epoch 2/10 =====
+ Epoch 02 | Time: 2m8s | train loss: 5.1221 | val loss: 5.4822
+ New best model saved to runs\train\Nltk_100k\transformer\20251227_1329\best_model.pt, best epoch: 2
+ ===== Epoch 3/10 =====
+ Epoch 03 | Time: 2m9s | train loss: 4.4576 | val loss: 5.1111
+ New best model saved to runs\train\Nltk_100k\transformer\20251227_1329\best_model.pt, best epoch: 3
+ ===== Epoch 4/10 =====
+ Epoch 04 | Time: 2m7s | train loss: 4.0223 | val loss: 5.0020
+ New best model saved to runs\train\Nltk_100k\transformer\20251227_1329\best_model.pt, best epoch: 4
+ ===== Epoch 5/10 =====
+ Epoch 05 | Time: 2m7s | train loss: 3.7541 | val loss: 4.9490
+ New best model saved to runs\train\Nltk_100k\transformer\20251227_1329\best_model.pt, best epoch: 5
+ ===== Epoch 6/10 =====
+ Epoch 06 | Time: 2m7s | train loss: 3.5682 | val loss: 4.9301
+ New best model saved to runs\train\Nltk_100k\transformer\20251227_1329\best_model.pt, best epoch: 6
+ ===== Epoch 7/10 =====
+ Epoch 07 | Time: 2m7s | train loss: 3.4254 | val loss: 4.9166
+ New best model saved to runs\train\Nltk_100k\transformer\20251227_1329\best_model.pt, best epoch: 7
+ ===== Epoch 8/10 =====
+ Epoch 08 | Time: 2m8s | train loss: 3.3132 | val loss: 4.9628
+ ===== Epoch 9/10 =====
+ Epoch 09 | Time: 2m7s | train loss: 3.0561 | val loss: 4.9778
+ ===== Epoch 10/10 =====
+ Epoch 10 | Time: 2m7s | train loss: 2.9551 | val loss: 5.0848
+
+ Training finished! Total Epochs: 10 Total Time: 21m35s
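
Comparing this run with the GQA run above: the only structural difference is the width of k_proj/v_proj in the six self-attention blocks (3 encoder + 3 decoder layers), 256 for MHA versus 128 for GQA, while cross-attention stays full MHA in both. That accounts exactly for the gap in the two logged parameter counts; a quick check:

emb, kv_gqa = 256, 128                      # k_proj/v_proj out_features: MHA vs. GQA
saved = (3 + 3) * 2 * emb * (emb - kv_gqa)  # 6 self-attention blocks, 2 projections each
print(saved)                                # 393216
print(28_926_208 - 28_532_992)              # 393216 -> matches the two logs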
transformer/MQA/best_model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:44390b04f4e756dc99e1983207bd16fc58046e4aadd4ecaf890b96dea9c98c9f
+ size 114692328
transformer/MQA/config.yaml ADDED
@@ -0,0 +1,43 @@
+ # ------------- Tokenizer -----------------
+ tokenizer: tokenizer.NltkTokenizer  # custom tokenizer
+
+ # ------------- Model architecture (tuned for 100K data & 8 GB VRAM) ----------------
+ model:
+   type: transformer
+   enc_layers: 3        # number of Transformer encoder layers
+   dec_layers: 3        # number of Transformer decoder layers
+   emb_size: 256        # embedding / hidden dimension
+   nhead: 4             # attention heads; each head is 64-dim (256/4)
+   ffn_dim: 1024        # feed-forward hidden dimension
+   dropout: 0.05        # dropout probability
+   # new attention parameters
+   attn_type: mqa       # code path: GroupedQueryAttention
+   num_kv_heads: 1      # only 1 KV head
+ # ------------- Training hyperparameters ----------------
+ train:
+   batch_size: 64
+   epochs: 10           # smaller batch size, more epochs
+   lr: 0.0005
+   weight_decay: 0.0001
+   lr_step: 8           # smaller batch size, more epochs, milder LR decay
+   lr_gamma: 0.5        # decay factor
+   save_dir: runs/train/Nltk_100k
+   num_workers: 0
+
+ # ------------- Data paths ----------------
+ data:
+   raw_train: data/train_100k.jsonl   # input files unchanged
+   raw_val: data/valid.jsonl
+   raw_test: data/test.jsonl
+
+   processed_dir: data/processed_nltk_100k
+   train_processed: data/processed_nltk_100k/train.jsonl
+   val_processed: data/processed_nltk_100k/val.jsonl
+   test_processed: data/processed_nltk_100k/test.jsonl
+
+   src_vocab: data/processed_nltk_100k/src_vocab.pkl
+   tgt_vocab: data/processed_nltk_100k/tgt_vocab.pkl
+   min_freq: 2
+
+ # ------------- Other --------------------
+ seed: 3407  # fixed random seed for reproducibility
transformer/MQA/log.txt ADDED
@@ -0,0 +1,96 @@
+ [Info] Logging started. Output will be saved to runs\train\Nltk_100k\transformer\20251226_1357\log.txt
+ [Info] Model Architecture: TRANSFORMER
+ Seq2SeqTransformer(
+   (encoder): Encoder(
+     (embed): Embedding(34122, 256, padding_idx=0)
+     (pe): PositionalEncoding()
+     (layers): ModuleList(
+       (0-2): 3 x EncoderLayer(
+         (self_attn): GroupedQueryAttention(
+           (q_proj): Linear(in_features=256, out_features=256, bias=False)
+           (k_proj): Linear(in_features=256, out_features=64, bias=False)
+           (v_proj): Linear(in_features=256, out_features=64, bias=False)
+           (out_proj): Linear(in_features=256, out_features=256, bias=True)
+           (dropout): Dropout(p=0.05, inplace=False)
+         )
+         (ffn): FeedForward(
+           (linear1): Linear(in_features=256, out_features=1024, bias=True)
+           (activation): ReLU()
+           (dropout): Dropout(p=0.05, inplace=False)
+           (linear2): Linear(in_features=1024, out_features=256, bias=True)
+         )
+         (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+         (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+         (dropout): Dropout(p=0.05, inplace=False)
+       )
+     )
+     (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+   )
+   (decoder): Decoder(
+     (embed): Embedding(28647, 256, padding_idx=0)
+     (pe): PositionalEncoding()
+     (layers): ModuleList(
+       (0-2): 3 x DecoderLayer(
+         (self_attn): GroupedQueryAttention(
+           (q_proj): Linear(in_features=256, out_features=256, bias=False)
+           (k_proj): Linear(in_features=256, out_features=64, bias=False)
+           (v_proj): Linear(in_features=256, out_features=64, bias=False)
+           (out_proj): Linear(in_features=256, out_features=256, bias=True)
+           (dropout): Dropout(p=0.05, inplace=False)
+         )
+         (cross_attn): MultiHeadAttention(
+           (q_proj): Linear(in_features=256, out_features=256, bias=False)
+           (k_proj): Linear(in_features=256, out_features=256, bias=False)
+           (v_proj): Linear(in_features=256, out_features=256, bias=False)
+           (out_proj): Linear(in_features=256, out_features=256, bias=True)
+           (dropout): Dropout(p=0.05, inplace=False)
+         )
+         (ffn): FeedForward(
+           (linear1): Linear(in_features=256, out_features=1024, bias=True)
+           (activation): ReLU()
+           (dropout): Dropout(p=0.05, inplace=False)
+           (linear2): Linear(in_features=1024, out_features=256, bias=True)
+         )
+         (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+         (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+         (norm3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+         (dropout): Dropout(p=0.05, inplace=False)
+       )
+     )
+     (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+   )
+   (proj): Linear(in_features=256, out_features=28647, bias=False)
+ )
+ [Info] Total Parameters: 28,336,384
+ [Info] Trainable Parameters: 28,336,384
+ [Info] Model Size (approx): 108.09 MB
+ ===== Epoch 1/10 =====
+ Epoch 01 | Time: 3m34s | train loss: 6.2330 | val loss: 6.0593
+ New best model saved to runs\train\Nltk_100k\transformer\20251226_1357\best_model.pt, best epoch: 1
+ ===== Epoch 2/10 =====
+ Epoch 02 | Time: 3m37s | train loss: 5.2751 | val loss: 5.6802
+ New best model saved to runs\train\Nltk_100k\transformer\20251226_1357\best_model.pt, best epoch: 2
+ ===== Epoch 3/10 =====
+ Epoch 03 | Time: 3m28s | train loss: 4.7900 | val loss: 5.4412
+ New best model saved to runs\train\Nltk_100k\transformer\20251226_1357\best_model.pt, best epoch: 3
+ ===== Epoch 4/10 =====
+ Epoch 04 | Time: 3m32s | train loss: 4.4403 | val loss: 5.2774
+ New best model saved to runs\train\Nltk_100k\transformer\20251226_1357\best_model.pt, best epoch: 4
+ ===== Epoch 5/10 =====
+ Epoch 05 | Time: 3m34s | train loss: 4.1581 | val loss: 5.2082
+ New best model saved to runs\train\Nltk_100k\transformer\20251226_1357\best_model.pt, best epoch: 5
+ ===== Epoch 6/10 =====
+ Epoch 06 | Time: 3m33s | train loss: 3.9421 | val loss: 5.1301
+ New best model saved to runs\train\Nltk_100k\transformer\20251226_1357\best_model.pt, best epoch: 6
+ ===== Epoch 7/10 =====
+ Epoch 07 | Time: 3m36s | train loss: 3.7749 | val loss: 5.0828
+ New best model saved to runs\train\Nltk_100k\transformer\20251226_1357\best_model.pt, best epoch: 7
+ ===== Epoch 8/10 =====
+ Epoch 08 | Time: 3m33s | train loss: 3.6390 | val loss: 5.0811
+ New best model saved to runs\train\Nltk_100k\transformer\20251226_1357\best_model.pt, best epoch: 8
+ ===== Epoch 9/10 =====
+ Epoch 09 | Time: 3m37s | train loss: 3.3762 | val loss: 5.0981
+ ===== Epoch 10/10 =====
+ Epoch 10 | Time: 3m32s | train loss: 3.2740 | val loss: 5.1790
+
+ Training finished! Total Epochs: 10 Total Time: 35m43s
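
Across the three runs the pattern is consistent: with head_dim 64, the K/V projection width is num_kv_heads × 64 (256 for MHA, 128 for GQA, 64 for MQA), and each reduction removes 6 × 2 × 256 × Δ weights relative to full MHA, where Δ is the drop in projection width. The same widths set the per-token, per-layer self-attention KV-cache footprint at inference time, which is the usual practical motivation for GQA/MQA. A consistency check against the three logged parameter totals:

emb, nhead = 256, 4
head_dim = emb // nhead                                       # 64
totals = {"mha": 28_926_208, "gqa": 28_532_992, "mqa": 28_336_384}  # from the logs above
for name, kv_heads in [("mha", 4), ("gqa", 2), ("mqa", 1)]:
    kv_dim = kv_heads * head_dim                              # k_proj/v_proj out_features in the logs
    saved = (3 + 3) * 2 * emb * (emb - kv_dim)                # weights removed vs. full MHA
    print(name, kv_dim, saved, totals["mha"] - totals[name])  # last two columns agree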