File size: 2,036 Bytes
616b8e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
{
  "name": "Lumina-Lite-2.2B Optimized Autoregressive Transformer",
  "layers": [
    {
      "type": "Embedding",
      "params": {
        "num_embeddings": 50272,
        "embedding_dim": 2048
      }
    },
    {
      "type": "TransformerBlock",
      "params": {
        "layers": 28,
        "hidden_size": 2048,
        "num_attention_heads": 16,
        "num_key_value_heads": 4,
        "head_dim": 128,
        "mlp_intermediate_size": 5504,
        "normalization": "RMSNorm",
        "activation": "SwiGLU",
        "positional_encoding": "RoPE (Rotary Positional Embeddings)",
        "attention_type": "Grouped-Query Attention (GQA)"
      }
    },
    {
      "type": "Linear",
      "params": {
        "in_features": 2048,
        "out_features": 5504,
        "description": "SwiGLU Gate and Up Projection"
      }
    },
    {
      "type": "Linear",
      "params": {
        "in_features": 5504,
        "out_features": 2048,
        "description": "MLP Down Projection"
      }
    },
    {
      "type": "RMSNorm",
      "params": {
        "normalized_shape": 2048,
        "eps": 0.00001
      }
    },
    {
      "type": "Linear",
      "params": {
        "in_features": 2048,
        "out_features": 50272,
        "bias": false,
        "description": "Language Modeling Head"
      }
    }
  ],
  "explanation": "Lumina-Lite-2.2B utilizes a 'deep-and-narrow' topology to optimize for high-throughput within a 6GB VRAM limit. The implementation of Grouped-Query Attention (GQA) reduces the KV cache footprint by a factor of 4 compared to multi-head attention, enabling longer context windows and larger batch sizes on consumer hardware. SwiGLU activation and RMSNorm provide superior gradient flow and training stability over standard GeLU/LayerNorm, while RoPE ensures the model maintains spatial awareness of tokens without absolute position bias. The architecture is specifically designed to facilitate 4-bit NF4 quantization, making it highly efficient for deployment via bitsandbytes or AutoGPTQ."
}