import torch
import torch.nn as nn


| class LuminaLite22BOptimizedAutoregressiveTransformer(nn.Module): |
| def __init__(self): |
| super().__init__() |
| self.layers = nn.Sequential( |
| nn.Embedding(**{"num_embeddings":50272,"embedding_dim":2048}), |
| nn.TransformerBlock(**{"layers":28,"hidden_size":2048,"num_attention_heads":16,"num_key_value_heads":4,"head_dim":128,"mlp_intermediate_size":5504,"normalization":"RMSNorm","activation":"SwiGLU","positional_encoding":"RoPE (Rotary Positional Embeddings)","attention_type":"Grouped-Query Attention (GQA)"}), |
| nn.Linear(**{"in_features":2048,"out_features":5504,"description":"SwiGLU Gate and Up Projection"}), |
| nn.Linear(**{"in_features":5504,"out_features":2048,"description":"MLP Down Projection"}), |
| nn.RMSNorm(**{"normalized_shape":2048,"eps":0.00001}), |
| nn.Linear(**{"in_features":2048,"out_features":50272,"bias":false,"description":"Language Modeling Head"}) |
| ) |
|
|
| def forward(self, x): |
| return self.layers(x) |