| { |
| "name": "Lumina-Lite-2.2B Optimized Autoregressive Transformer", |
| "layers": [ |
| { |
| "type": "Embedding", |
| "params": { |
| "num_embeddings": 50272, |
| "embedding_dim": 2048 |
| } |
| }, |
| { |
| "type": "TransformerBlock", |
| "params": { |
| "layers": 28, |
| "hidden_size": 2048, |
| "num_attention_heads": 16, |
| "num_key_value_heads": 4, |
| "head_dim": 128, |
| "mlp_intermediate_size": 5504, |
| "normalization": "RMSNorm", |
| "activation": "SwiGLU", |
| "positional_encoding": "RoPE (Rotary Positional Embeddings)", |
| "attention_type": "Grouped-Query Attention (GQA)" |
| } |
| }, |
| { |
| "type": "Linear", |
| "params": { |
| "in_features": 2048, |
| "out_features": 5504, |
| "description": "SwiGLU Gate and Up Projections (each 2048 -> 5504)" |
| } |
| }, |
| { |
| "type": "Linear", |
| "params": { |
| "in_features": 5504, |
| "out_features": 2048, |
| "description": "MLP Down Projection" |
| } |
| }, |
| { |
| "type": "RMSNorm", |
| "params": { |
| "normalized_shape": 2048, |
| "eps": 0.00001 |
| } |
| }, |
| { |
| "type": "Linear", |
| "params": { |
| "in_features": 2048, |
| "out_features": 50272, |
| "bias": false, |
| "description": "Language Modeling Head" |
| } |
| } |
| ], |
| "explanation": "Lumina-Lite-2.2B utilizes a 'deep-and-narrow' topology to optimize for high throughput within a 6GB VRAM limit. The implementation of Grouped-Query Attention (GQA), in which 16 query heads share 4 key/value heads, reduces the KV cache footprint by a factor of 4 compared to multi-head attention, enabling longer context windows and larger batch sizes on consumer hardware. SwiGLU activation and RMSNorm improve gradient flow and training stability relative to standard GeLU/LayerNorm, while RoPE encodes token positions as rotations of the query and key vectors, so attention scores depend on relative token offsets rather than on learned absolute position embeddings. The architecture is specifically designed to facilitate 4-bit quantization, making it highly efficient for deployment with NF4 via bitsandbytes or with GPTQ via AutoGPTQ." |
| } |