import copy
import torch
import torch.nn as nn
from Embedding import Embedding
from MultiHeadAttention import MultiHeadAttention
from DiffMultiHeadAttention import DiffMultiHeadAttention
from Encoder import PositionWiseFeedForward,EncoderLayer,Encoder
from Generator import Projector,Generator
def make_model(vocab_size, embedding_dim, key_dim, head_number, position_information_type,
               enable_affine, enable_talking_head, use_diff, self_attention_block_size,
               feed_forward_dim, enable_layer_norm, deep, dropout_rate, enable_el_cache):
    """Assemble a decoder-style Transformer language generator.

    Builds Embedding -> stacked EncoderLayers (self-attention + feed-forward)
    -> Projector, wraps them in a Generator, and Xavier-initializes all
    2-D (matrix) parameters.

    Args:
        vocab_size: size of the token vocabulary (also the output size).
        embedding_dim: model/embedding dimension.
        key_dim: per-head key dimension for attention.
        head_number: number of attention heads.
        position_information_type: positional-encoding scheme passed through
            to Embedding and the attention module.
        enable_affine: whether affine parameters are enabled in sub-modules.
        enable_talking_head: enable talking-heads attention variant.
        use_diff: select DiffMultiHeadAttention; currently unsupported
            together with EL-Attention and must be False.
        self_attention_block_size: attention blocking size; currently
            incompatible with EL-Attention and must be 0.
        feed_forward_dim: hidden size of the position-wise feed-forward net.
        enable_layer_norm: enable layer normalization in encoder layers.
        deep: number of stacked encoder layers.
        dropout_rate: dropout probability used throughout.
        enable_el_cache: enable the EL-Attention cache in attention modules.

    Returns:
        A Generator model ready for training.

    Raises:
        AssertionError: if use_diff is True or self_attention_block_size != 0
            (NOTE: asserts are stripped under ``python -O``; consider raising
            ValueError if this validation must survive optimized runs).
    """
    # Embedding layer
    embedding = Embedding(
        vocab_size = vocab_size,
        embedding_dim = embedding_dim,
        enable_affine = enable_affine,
        position_information_type = position_information_type,
        dropout_rate = dropout_rate
    )
    # Multi-head self-attention layer
    if use_diff:
        # Differential attention is not yet integrated with EL-Attention,
        # so this branch always rejects. The assignment below is therefore
        # unreachable dead code, kept only to document the intended wiring.
        assert use_diff == False, "差分注意力暂未完成EL-Attention的集成,use_diff 应当为 False "
        Attention = DiffMultiHeadAttention
    else:
        Attention = MultiHeadAttention
    # EL-Attention is not yet compatible with attention blocking.
    assert self_attention_block_size == 0, "暂不兼容EL-Attention与注意力分块,self_attention_block_size 应当为 0 "
    multi_head_attention = Attention(
        embedding_dim = embedding_dim,
        key_dim = key_dim,
        head_number = head_number,
        position_information_type = position_information_type,
        enable_affine = enable_affine,
        enable_talking_head = enable_talking_head,
        self_attention_block_size = self_attention_block_size,
        dropout_rate = dropout_rate,
        enable_el_cache = enable_el_cache
    )
    # Position-wise feed-forward (information-mixing) network
    position_wise_feed_forward = PositionWiseFeedForward(
        embedding_dim = embedding_dim,
        feed_forward_dim = feed_forward_dim,
        enable_affine = enable_affine
    )
    # Single encoder layer; deep-copied so each stacked layer owns its weights
    encoder_layer = EncoderLayer(
        multi_head_attention = copy.deepcopy(multi_head_attention),
        mask_future = True,  # causal self-attention: always mask future tokens
        position_wise_feed_forward = copy.deepcopy(position_wise_feed_forward),
        enable_layer_norm = enable_layer_norm,
        dropout_rate = dropout_rate
    )
    # Stack `deep` independent copies of the layer into the encoder
    encoder_layers = nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(deep)])
    encoder = Encoder(encoder_layers = encoder_layers)
    # Output projector (embedding_dim -> vocabulary logits)
    projector = Projector(
        embedding_dim = embedding_dim,
        vocab_out_size = vocab_size,
        enable_affine = enable_affine
    )
    # The generator model itself
    model = Generator(
        embedding = embedding,
        encoder = encoder,
        projector = projector
    )
    # Parameter initialization: only matrix-shaped (2-D) parameters are
    # Xavier-initialized; biases and affine parameters keep their defaults.
    for p in model.parameters():
        if p.dim() == 2:
            nn.init.xavier_uniform_(p)
    return model