# itriedcoding's picture
# Add model.py
# 26868a6 verified
import torch
import torch.nn as nn
class LuminaLite22BOptimizedAutoregressiveTransformer(nn.Module):
def __init__(self):
super().__init__()
self.layers = nn.Sequential(
nn.Embedding(**{"num_embeddings":50272,"embedding_dim":2048}),
nn.TransformerBlock(**{"layers":28,"hidden_size":2048,"num_attention_heads":16,"num_key_value_heads":4,"head_dim":128,"mlp_intermediate_size":5504,"normalization":"RMSNorm","activation":"SwiGLU","positional_encoding":"RoPE (Rotary Positional Embeddings)","attention_type":"Grouped-Query Attention (GQA)"}),
nn.Linear(**{"in_features":2048,"out_features":5504,"description":"SwiGLU Gate and Up Projection"}),
nn.Linear(**{"in_features":5504,"out_features":2048,"description":"MLP Down Projection"}),
nn.RMSNorm(**{"normalized_shape":2048,"eps":0.00001}),
nn.Linear(**{"in_features":2048,"out_features":50272,"bias":false,"description":"Language Modeling Head"})
)
def forward(self, x):
return self.layers(x)