Upload 9 files

- transformer/GQA/best_model.pt +3 -0
- transformer/GQA/config.yaml +44 -0
- transformer/GQA/log.txt +94 -0
- transformer/MHA/best_model.pt +3 -0
- transformer/MHA/config_nltk_transformer_100k.yaml +44 -0
- transformer/MHA/log.txt +95 -0
- transformer/MQA/best_model.pt +3 -0
- transformer/MQA/config.yaml +43 -0
- transformer/MQA/log.txt +96 -0
transformer/GQA/best_model.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19a03700f4a6dcb8daeccacc4e6722a721dd37e345f3002e212d078141a222ac
+size 115478760
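Note: each best_model.pt in this upload is a Git LFS pointer (version/oid/size), not the weights themselves. After cloning, fetch the actual checkpoints (~110 MB each) with `git lfs install` followed by `git lfs pull`.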
transformer/GQA/config.yaml
ADDED
@@ -0,0 +1,44 @@
+# ------------- Tokenizer -----------------
+tokenizer: tokenizer.NltkTokenizer  # custom tokenizer
+
+# ------------- Model architecture (tuned for 100K samples & 8 GB VRAM) ----------------
+model:
+  type: transformer
+  enc_layers: 3        # number of Transformer encoder layers
+  dec_layers: 3        # number of Transformer decoder layers
+  emb_size: 256        # embedding / hidden dimension
+  nhead: 4             # number of attention heads; each head is 64-dim (256/4)
+  ffn_dim: 1024        # feed-forward hidden dimension
+  dropout: 0.05        # dropout probability
+  # new parameters for attention
+  attn_type: gqa       # the code routes this to GroupedQueryAttention
+  num_kv_heads: 2      # every 2 query heads share 1 KV head (4/2 = 2)
+
+# ------------- Training hyperparameters ----------------
+train:
+  batch_size: 64
+  epochs: 15           # smaller batch size, so more epochs
+  lr: 0.0005
+  weight_decay: 0.0001
+  lr_step: 8           # smaller batch size, more epochs, so milder LR decay
+  lr_gamma: 0.5        # decay factor
+  save_dir: runs/train/Nltk_100k
+  num_workers: 0
+
+# ------------- Data paths ----------------
+data:
+  raw_train: data/train_100k.jsonl  # input files unchanged
+  raw_val: data/valid.jsonl
+  raw_test: data/test.jsonl
+
+  processed_dir: data/processed_nltk_100k
+  train_processed: data/processed_nltk_100k/train.jsonl
+  val_processed: data/processed_nltk_100k/val.jsonl
+  test_processed: data/processed_nltk_100k/test.jsonl
+
+  src_vocab: data/processed_nltk_100k/src_vocab.pkl
+  tgt_vocab: data/processed_nltk_100k/tgt_vocab.pkl
+  min_freq: 2
+
+# ------------- Miscellaneous --------------------
+seed: 3407  # fixed random seed for reproducibility
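Note: the GroupedQueryAttention module this config selects is not part of this upload. Below is a minimal PyTorch sketch of the mechanism, assuming the standard GQA formulation; the class and argument names come from the config and log, but the implementation details are an assumption, not the repo's code. With nhead=4 and num_kv_heads=2, every 2 query heads share one 64-dim KV head, so k_proj/v_proj project 256 -> 128, matching the shapes printed in log.txt below.

import torch
import torch.nn as nn

class GroupedQueryAttention(nn.Module):
    """Sketch: nhead query heads attend over num_kv_heads shared K/V heads."""
    def __init__(self, emb_size=256, nhead=4, num_kv_heads=2, dropout=0.05):
        super().__init__()
        assert nhead % num_kv_heads == 0, "nhead must be divisible by num_kv_heads"
        self.nhead = nhead
        self.num_kv_heads = num_kv_heads
        self.head_dim = emb_size // nhead
        self.q_proj = nn.Linear(emb_size, nhead * self.head_dim, bias=False)
        self.k_proj = nn.Linear(emb_size, num_kv_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(emb_size, num_kv_heads * self.head_dim, bias=False)
        self.out_proj = nn.Linear(emb_size, emb_size, bias=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key_value, mask=None):
        B, Tq, _ = query.shape
        Tk = key_value.shape[1]
        group = self.nhead // self.num_kv_heads  # query heads per KV head
        q = self.q_proj(query).view(B, Tq, self.nhead, self.head_dim).transpose(1, 2)
        k = self.k_proj(key_value).view(B, Tk, self.num_kv_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(key_value).view(B, Tk, self.num_kv_heads, self.head_dim).transpose(1, 2)
        # Broadcast each KV head across its group of query heads.
        k = k.repeat_interleave(group, dim=1)
        v = v.repeat_interleave(group, dim=1)
        scores = (q @ k.transpose(-2, -1)) / self.head_dim ** 0.5
        if mask is not None:
            scores = scores.masked_fill(mask, float("-inf"))
        out = self.dropout(scores.softmax(dim=-1)) @ v
        return self.out_proj(out.transpose(1, 2).reshape(B, Tq, -1))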
transformer/GQA/log.txt
ADDED
@@ -0,0 +1,94 @@
+[Info] Logging started. Output will be saved to runs\train\Nltk_100k\transformer\20251226_1433\log.txt
+[Info] Model Architecture: TRANSFORMER
+Seq2SeqTransformer(
+  (encoder): Encoder(
+    (embed): Embedding(34122, 256, padding_idx=0)
+    (pe): PositionalEncoding()
+    (layers): ModuleList(
+      (0-2): 3 x EncoderLayer(
+        (self_attn): GroupedQueryAttention(
+          (q_proj): Linear(in_features=256, out_features=256, bias=False)
+          (k_proj): Linear(in_features=256, out_features=128, bias=False)
+          (v_proj): Linear(in_features=256, out_features=128, bias=False)
+          (out_proj): Linear(in_features=256, out_features=256, bias=True)
+          (dropout): Dropout(p=0.05, inplace=False)
+        )
+        (ffn): FeedForward(
+          (linear1): Linear(in_features=256, out_features=1024, bias=True)
+          (activation): ReLU()
+          (dropout): Dropout(p=0.05, inplace=False)
+          (linear2): Linear(in_features=1024, out_features=256, bias=True)
+        )
+        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+        (dropout): Dropout(p=0.05, inplace=False)
+      )
+    )
+    (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+  )
+  (decoder): Decoder(
+    (embed): Embedding(28647, 256, padding_idx=0)
+    (pe): PositionalEncoding()
+    (layers): ModuleList(
+      (0-2): 3 x DecoderLayer(
+        (self_attn): GroupedQueryAttention(
+          (q_proj): Linear(in_features=256, out_features=256, bias=False)
+          (k_proj): Linear(in_features=256, out_features=128, bias=False)
+          (v_proj): Linear(in_features=256, out_features=128, bias=False)
+          (out_proj): Linear(in_features=256, out_features=256, bias=True)
+          (dropout): Dropout(p=0.05, inplace=False)
+        )
+        (cross_attn): MultiHeadAttention(
+          (q_proj): Linear(in_features=256, out_features=256, bias=False)
+          (k_proj): Linear(in_features=256, out_features=256, bias=False)
+          (v_proj): Linear(in_features=256, out_features=256, bias=False)
+          (out_proj): Linear(in_features=256, out_features=256, bias=True)
+          (dropout): Dropout(p=0.05, inplace=False)
+        )
+        (ffn): FeedForward(
+          (linear1): Linear(in_features=256, out_features=1024, bias=True)
+          (activation): ReLU()
+          (dropout): Dropout(p=0.05, inplace=False)
+          (linear2): Linear(in_features=1024, out_features=256, bias=True)
+        )
+        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+        (norm3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+        (dropout): Dropout(p=0.05, inplace=False)
+      )
+    )
+    (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+  )
+  (proj): Linear(in_features=256, out_features=28647, bias=False)
+)
+[Info] Total Parameters: 28,532,992
+[Info] Trainable Parameters: 28,532,992
+[Info] Model Size (approx): 108.84 MB
+===== Epoch 1/10 =====
+Epoch 01 | Time: 2m28s | train loss: 6.1915 | val loss: 6.0763
+New best model saved to runs\train\Nltk_100k\transformer\20251226_1433\best_model.pt, best epoch: 1
+===== Epoch 2/10 =====
+Epoch 02 | Time: 2m24s | train loss: 5.3152 | val loss: 5.6670
+New best model saved to runs\train\Nltk_100k\transformer\20251226_1433\best_model.pt, best epoch: 2
+===== Epoch 3/10 =====
+Epoch 03 | Time: 2m26s | train loss: 4.7037 | val loss: 5.3431
+New best model saved to runs\train\Nltk_100k\transformer\20251226_1433\best_model.pt, best epoch: 3
+===== Epoch 4/10 =====
+Epoch 04 | Time: 2m25s | train loss: 4.2414 | val loss: 5.1238
+New best model saved to runs\train\Nltk_100k\transformer\20251226_1433\best_model.pt, best epoch: 4
+===== Epoch 5/10 =====
+Epoch 05 | Time: 2m26s | train loss: 3.9414 | val loss: 5.0703
+New best model saved to runs\train\Nltk_100k\transformer\20251226_1433\best_model.pt, best epoch: 5
+===== Epoch 6/10 =====
+Epoch 06 | Time: 2m26s | train loss: 3.7310 | val loss: 5.0379
+New best model saved to runs\train\Nltk_100k\transformer\20251226_1433\best_model.pt, best epoch: 6
+===== Epoch 7/10 =====
+Epoch 07 | Time: 2m26s | train loss: 3.5739 | val loss: 5.0417
+===== Epoch 8/10 =====
+Epoch 08 | Time: 2m26s | train loss: 3.4508 | val loss: 5.0557
+===== Epoch 9/10 =====
+Epoch 09 | Time: 2m25s | train loss: 3.1906 | val loss: 5.0585
+===== Epoch 10/10 =====
+Epoch 10 | Time: 2m26s | train loss: 3.0900 | val loss: 5.1495
+
+Training finished! Total Epochs: 10 Total Time: 24m22s
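Note: the parameter total in this log can be re-derived from the printed module shapes; a quick check in plain Python:

# Re-derive [Info] Total Parameters: 28,532,992 from the shapes above.
emb_enc = 34122 * 256                           # encoder embedding
emb_dec = 28647 * 256                           # decoder embedding
proj    = 256 * 28647                           # output projection (bias=False)
gqa = 256*256 + 2*(256*128) + (256*256 + 256)   # q/k/v/out projections (+ out bias)
mha = 3*(256*256) + (256*256 + 256)             # full-width cross-attention
ffn = (256*1024 + 1024) + (1024*256 + 256)
ln  = 2 * 256                                   # LayerNorm weight + bias
enc = 3 * (gqa + ffn + 2*ln) + ln               # 3 layers + final norm
dec = 3 * (gqa + mha + ffn + 3*ln) + ln
print(emb_enc + emb_dec + proj + enc + dec)     # 28532992

At fp32 that is 28,532,992 x 4 bytes, about 108.84 MB, matching the reported model size.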
transformer/MHA/best_model.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:93644665a99dbe03250abee7b3798d4108f8f483479abcceae91e6030b8b784b
+size 117051688
transformer/MHA/config_nltk_transformer_100k.yaml
ADDED
@@ -0,0 +1,44 @@
+# ------------- Tokenizer -----------------
+tokenizer: tokenizer.NltkTokenizer  # custom tokenizer
+
+# ------------- Model architecture (tuned for 100K samples & 8 GB VRAM) ----------------
+model:
+  type: transformer
+  enc_layers: 3        # number of Transformer encoder layers
+  dec_layers: 3        # number of Transformer decoder layers
+  emb_size: 256        # embedding / hidden dimension
+  nhead: 4             # number of attention heads; each head is 64-dim (256/4)
+  ffn_dim: 1024        # feed-forward hidden dimension
+  dropout: 0.05        # best
+  # new parameters for attention
+  attn_type: mha       # options: 'mha' (default), 'mqa', 'gqa', 'sparse'
+  num_kv_heads: 4      # used by MQA/GQA; equals nhead (4) for MHA, 1 for MQA, 2 for GQA
+
+# ------------- Training hyperparameters ----------------
+train:
+  batch_size: 64
+  epochs: 10
+  lr: 0.0005
+  weight_decay: 0.0001
+  lr_step: 8           # smaller batch size, more epochs, so milder LR decay
+  lr_gamma: 0.5        # decay factor
+  save_dir: runs/train/Nltk_100k
+  num_workers: 0
+
+# ------------- Data paths ----------------
+data:
+  raw_train: data/train_100k.jsonl  # input files unchanged
+  raw_val: data/valid.jsonl
+  raw_test: data/test.jsonl
+
+  processed_dir: data/processed_nltk_100k
+  train_processed: data/processed_nltk_100k/train.jsonl
+  val_processed: data/processed_nltk_100k/val.jsonl
+  test_processed: data/processed_nltk_100k/test.jsonl
+
+  src_vocab: data/processed_nltk_100k/src_vocab.pkl
+  tgt_vocab: data/processed_nltk_100k/tgt_vocab.pkl
+  min_freq: 2
+
+# ------------- Miscellaneous --------------------
+seed: 3407  # fixed random seed for reproducibility
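Note: the three configs differ only in attn_type and num_kv_heads. The dispatch below is a hypothetical sketch of how attn_type could be routed; the repo's actual factory is not in this upload, and per the logs it keeps a separate MultiHeadAttention class. Here MHA is expressed as the num_kv_heads == nhead special case of the GroupedQueryAttention sketch above, and 'sparse' is listed in the comment but not covered.

def build_self_attention(model_cfg):
    # model_cfg is the 'model' section of the YAML; the function name is hypothetical.
    kv_heads = {"mha": model_cfg["nhead"],           # one KV head per query head
                "mqa": 1,                            # a single shared KV head
                "gqa": model_cfg["num_kv_heads"]}[model_cfg["attn_type"]]
    return GroupedQueryAttention(emb_size=model_cfg["emb_size"],
                                 nhead=model_cfg["nhead"],
                                 num_kv_heads=kv_heads,
                                 dropout=model_cfg["dropout"])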
transformer/MHA/log.txt
ADDED
@@ -0,0 +1,95 @@
+[Info] Logging started. Output will be saved to runs\train\Nltk_100k\transformer\20251227_1329\log.txt
+[Info] Model Architecture: TRANSFORMER
+Seq2SeqTransformer(
+  (encoder): Encoder(
+    (embed): Embedding(34122, 256, padding_idx=0)
+    (pe): PositionalEncoding()
+    (layers): ModuleList(
+      (0-2): 3 x EncoderLayer(
+        (self_attn): MultiHeadAttention(
+          (q_proj): Linear(in_features=256, out_features=256, bias=False)
+          (k_proj): Linear(in_features=256, out_features=256, bias=False)
+          (v_proj): Linear(in_features=256, out_features=256, bias=False)
+          (out_proj): Linear(in_features=256, out_features=256, bias=True)
+          (dropout): Dropout(p=0.05, inplace=False)
+        )
+        (ffn): FeedForward(
+          (linear1): Linear(in_features=256, out_features=1024, bias=True)
+          (activation): ReLU()
+          (dropout): Dropout(p=0.05, inplace=False)
+          (linear2): Linear(in_features=1024, out_features=256, bias=True)
+        )
+        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+        (dropout): Dropout(p=0.05, inplace=False)
+      )
+    )
+    (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+  )
+  (decoder): Decoder(
+    (embed): Embedding(28647, 256, padding_idx=0)
+    (pe): PositionalEncoding()
+    (layers): ModuleList(
+      (0-2): 3 x DecoderLayer(
+        (self_attn): MultiHeadAttention(
+          (q_proj): Linear(in_features=256, out_features=256, bias=False)
+          (k_proj): Linear(in_features=256, out_features=256, bias=False)
+          (v_proj): Linear(in_features=256, out_features=256, bias=False)
+          (out_proj): Linear(in_features=256, out_features=256, bias=True)
+          (dropout): Dropout(p=0.05, inplace=False)
+        )
+        (cross_attn): MultiHeadAttention(
+          (q_proj): Linear(in_features=256, out_features=256, bias=False)
+          (k_proj): Linear(in_features=256, out_features=256, bias=False)
+          (v_proj): Linear(in_features=256, out_features=256, bias=False)
+          (out_proj): Linear(in_features=256, out_features=256, bias=True)
+          (dropout): Dropout(p=0.05, inplace=False)
+        )
+        (ffn): FeedForward(
+          (linear1): Linear(in_features=256, out_features=1024, bias=True)
+          (activation): ReLU()
+          (dropout): Dropout(p=0.05, inplace=False)
+          (linear2): Linear(in_features=1024, out_features=256, bias=True)
+        )
+        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+        (norm3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+        (dropout): Dropout(p=0.05, inplace=False)
+      )
+    )
+    (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+  )
+  (proj): Linear(in_features=256, out_features=28647, bias=False)
+)
+[Info] Total Parameters: 28,926,208
+[Info] Trainable Parameters: 28,926,208
+[Info] Model Size (approx): 110.34 MB
+===== Epoch 1/10 =====
+Epoch 01 | Time: 2m21s | train loss: 6.1264 | val loss: 6.0301
+New best model saved to runs\train\Nltk_100k\transformer\20251227_1329\best_model.pt, best epoch: 1
+===== Epoch 2/10 =====
+Epoch 02 | Time: 2m8s | train loss: 5.1221 | val loss: 5.4822
+New best model saved to runs\train\Nltk_100k\transformer\20251227_1329\best_model.pt, best epoch: 2
+===== Epoch 3/10 =====
+Epoch 03 | Time: 2m9s | train loss: 4.4576 | val loss: 5.1111
+New best model saved to runs\train\Nltk_100k\transformer\20251227_1329\best_model.pt, best epoch: 3
+===== Epoch 4/10 =====
+Epoch 04 | Time: 2m7s | train loss: 4.0223 | val loss: 5.0020
+New best model saved to runs\train\Nltk_100k\transformer\20251227_1329\best_model.pt, best epoch: 4
+===== Epoch 5/10 =====
+Epoch 05 | Time: 2m7s | train loss: 3.7541 | val loss: 4.9490
+New best model saved to runs\train\Nltk_100k\transformer\20251227_1329\best_model.pt, best epoch: 5
+===== Epoch 6/10 =====
+Epoch 06 | Time: 2m7s | train loss: 3.5682 | val loss: 4.9301
+New best model saved to runs\train\Nltk_100k\transformer\20251227_1329\best_model.pt, best epoch: 6
+===== Epoch 7/10 =====
+Epoch 07 | Time: 2m7s | train loss: 3.4254 | val loss: 4.9166
+New best model saved to runs\train\Nltk_100k\transformer\20251227_1329\best_model.pt, best epoch: 7
+===== Epoch 8/10 =====
+Epoch 08 | Time: 2m8s | train loss: 3.3132 | val loss: 4.9628
+===== Epoch 9/10 =====
+Epoch 09 | Time: 2m7s | train loss: 3.0561 | val loss: 4.9778
+===== Epoch 10/10 =====
+Epoch 10 | Time: 2m7s | train loss: 2.9551 | val loss: 5.0848
+
+Training finished! Total Epochs: 10 Total Time: 21m35s
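Note: relative to this MHA baseline (28,926,208 parameters), GQA and MQA shrink only k_proj and v_proj in the six self-attention modules (3 encoder + 3 decoder); cross-attention stays full MHA. That is also where their main practical benefit lies, a proportionally smaller KV cache at inference time. The parameter deltas check out exactly:

# k_proj + v_proj parameters per self-attention module: 2 * 256 * (num_kv_heads * 64)
mha_kv = 2 * 256 * 256   # 131,072
gqa_kv = 2 * 256 * 128   #  65,536
mqa_kv = 2 * 256 * 64    #  32,768
print(28_926_208 - 6 * (mha_kv - gqa_kv))   # 28532992 -> the GQA log's total
print(28_926_208 - 6 * (mha_kv - mqa_kv))   # 28336384 -> the MQA log's total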
transformer/MQA/best_model.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44390b04f4e756dc99e1983207bd16fc58046e4aadd4ecaf890b96dea9c98c9f
+size 114692328
transformer/MQA/config.yaml
ADDED
@@ -0,0 +1,43 @@
+# ------------- Tokenizer -----------------
+tokenizer: tokenizer.NltkTokenizer  # custom tokenizer
+
+# ------------- Model architecture (tuned for 100K samples & 8 GB VRAM) ----------------
+model:
+  type: transformer
+  enc_layers: 3        # number of Transformer encoder layers
+  dec_layers: 3        # number of Transformer decoder layers
+  emb_size: 256        # embedding / hidden dimension
+  nhead: 4             # number of attention heads; each head is 64-dim (256/4)
+  ffn_dim: 1024        # feed-forward hidden dimension
+  dropout: 0.05        # dropout probability
+  # new parameters for attention
+  attn_type: mqa       # the code routes this to GroupedQueryAttention
+  num_kv_heads: 1      # a single KV head
+# ------------- Training hyperparameters ----------------
+train:
+  batch_size: 64
+  epochs: 10           # smaller batch size, so more epochs
+  lr: 0.0005
+  weight_decay: 0.0001
+  lr_step: 8           # smaller batch size, more epochs, so milder LR decay
+  lr_gamma: 0.5        # decay factor
+  save_dir: runs/train/Nltk_100k
+  num_workers: 0
+
+# ------------- Data paths ----------------
+data:
+  raw_train: data/train_100k.jsonl  # input files unchanged
+  raw_val: data/valid.jsonl
+  raw_test: data/test.jsonl
+
+  processed_dir: data/processed_nltk_100k
+  train_processed: data/processed_nltk_100k/train.jsonl
+  val_processed: data/processed_nltk_100k/val.jsonl
+  test_processed: data/processed_nltk_100k/test.jsonl
+
+  src_vocab: data/processed_nltk_100k/src_vocab.pkl
+  tgt_vocab: data/processed_nltk_100k/tgt_vocab.pkl
+  min_freq: 2
+
+# ------------- Miscellaneous --------------------
+seed: 3407  # fixed random seed for reproducibility
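Note: instantiating the GroupedQueryAttention sketch from above with num_kv_heads=1 reproduces the MQA projection shapes printed in the log below:

mqa = GroupedQueryAttention(emb_size=256, nhead=4, num_kv_heads=1)
print(mqa.q_proj)   # Linear(in_features=256, out_features=256, bias=False)
print(mqa.k_proj)   # Linear(in_features=256, out_features=64, bias=False)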
transformer/MQA/log.txt
ADDED
@@ -0,0 +1,96 @@
+[Info] Logging started. Output will be saved to runs\train\Nltk_100k\transformer\20251226_1357\log.txt
+[Info] Model Architecture: TRANSFORMER
+Seq2SeqTransformer(
+  (encoder): Encoder(
+    (embed): Embedding(34122, 256, padding_idx=0)
+    (pe): PositionalEncoding()
+    (layers): ModuleList(
+      (0-2): 3 x EncoderLayer(
+        (self_attn): GroupedQueryAttention(
+          (q_proj): Linear(in_features=256, out_features=256, bias=False)
+          (k_proj): Linear(in_features=256, out_features=64, bias=False)
+          (v_proj): Linear(in_features=256, out_features=64, bias=False)
+          (out_proj): Linear(in_features=256, out_features=256, bias=True)
+          (dropout): Dropout(p=0.05, inplace=False)
+        )
+        (ffn): FeedForward(
+          (linear1): Linear(in_features=256, out_features=1024, bias=True)
+          (activation): ReLU()
+          (dropout): Dropout(p=0.05, inplace=False)
+          (linear2): Linear(in_features=1024, out_features=256, bias=True)
+        )
+        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+        (dropout): Dropout(p=0.05, inplace=False)
+      )
+    )
+    (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+  )
+  (decoder): Decoder(
+    (embed): Embedding(28647, 256, padding_idx=0)
+    (pe): PositionalEncoding()
+    (layers): ModuleList(
+      (0-2): 3 x DecoderLayer(
+        (self_attn): GroupedQueryAttention(
+          (q_proj): Linear(in_features=256, out_features=256, bias=False)
+          (k_proj): Linear(in_features=256, out_features=64, bias=False)
+          (v_proj): Linear(in_features=256, out_features=64, bias=False)
+          (out_proj): Linear(in_features=256, out_features=256, bias=True)
+          (dropout): Dropout(p=0.05, inplace=False)
+        )
+        (cross_attn): MultiHeadAttention(
+          (q_proj): Linear(in_features=256, out_features=256, bias=False)
+          (k_proj): Linear(in_features=256, out_features=256, bias=False)
+          (v_proj): Linear(in_features=256, out_features=256, bias=False)
+          (out_proj): Linear(in_features=256, out_features=256, bias=True)
+          (dropout): Dropout(p=0.05, inplace=False)
+        )
+        (ffn): FeedForward(
+          (linear1): Linear(in_features=256, out_features=1024, bias=True)
+          (activation): ReLU()
+          (dropout): Dropout(p=0.05, inplace=False)
+          (linear2): Linear(in_features=1024, out_features=256, bias=True)
+        )
+        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+        (norm3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+        (dropout): Dropout(p=0.05, inplace=False)
+      )
+    )
+    (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
+  )
+  (proj): Linear(in_features=256, out_features=28647, bias=False)
+)
+[Info] Total Parameters: 28,336,384
+[Info] Trainable Parameters: 28,336,384
+[Info] Model Size (approx): 108.09 MB
+===== Epoch 1/10 =====
+Epoch 01 | Time: 3m34s | train loss: 6.2330 | val loss: 6.0593
+New best model saved to runs\train\Nltk_100k\transformer\20251226_1357\best_model.pt, best epoch: 1
+===== Epoch 2/10 =====
+Epoch 02 | Time: 3m37s | train loss: 5.2751 | val loss: 5.6802
+New best model saved to runs\train\Nltk_100k\transformer\20251226_1357\best_model.pt, best epoch: 2
+===== Epoch 3/10 =====
+Epoch 03 | Time: 3m28s | train loss: 4.7900 | val loss: 5.4412
+New best model saved to runs\train\Nltk_100k\transformer\20251226_1357\best_model.pt, best epoch: 3
+===== Epoch 4/10 =====
+Epoch 04 | Time: 3m32s | train loss: 4.4403 | val loss: 5.2774
+New best model saved to runs\train\Nltk_100k\transformer\20251226_1357\best_model.pt, best epoch: 4
+===== Epoch 5/10 =====
+Epoch 05 | Time: 3m34s | train loss: 4.1581 | val loss: 5.2082
+New best model saved to runs\train\Nltk_100k\transformer\20251226_1357\best_model.pt, best epoch: 5
+===== Epoch 6/10 =====
+Epoch 06 | Time: 3m33s | train loss: 3.9421 | val loss: 5.1301
+New best model saved to runs\train\Nltk_100k\transformer\20251226_1357\best_model.pt, best epoch: 6
+===== Epoch 7/10 =====
+Epoch 07 | Time: 3m36s | train loss: 3.7749 | val loss: 5.0828
+New best model saved to runs\train\Nltk_100k\transformer\20251226_1357\best_model.pt, best epoch: 7
+===== Epoch 8/10 =====
+Epoch 08 | Time: 3m33s | train loss: 3.6390 | val loss: 5.0811
+New best model saved to runs\train\Nltk_100k\transformer\20251226_1357\best_model.pt, best epoch: 8
+===== Epoch 9/10 =====
+Epoch 09 | Time: 3m37s | train loss: 3.3762 | val loss: 5.0981
+===== Epoch 10/10 =====
+Epoch 10 | Time: 3m32s | train loss: 3.2740 | val loss: 5.1790
+
+Training finished! Total Epochs: 10 Total Time: 35m43s
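Note: summarizing the three runs above:

variant | total params | best val loss (epoch) | total train time
MHA     | 28,926,208   | 4.9166 (epoch 7)      | 21m35s
GQA     | 28,532,992   | 5.0379 (epoch 6)      | 24m22s
MQA     | 28,336,384   | 5.0811 (epoch 8)      | 35m43s

Validation loss degrades modestly as KV heads are shared, and in this implementation the GQA/MQA runs were slower per epoch than MHA, suggesting the KV-head broadcasting overhead outweighs the small parameter savings at this scale during training.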