itriedcoding
/

Lumina-Lite-2-2B-Optimized-Autoregressive-Transformer

Model card Files Files and versions

Lumina-Lite-2-2B-Optimized-Autoregressive-Transformer / config.json

itriedcoding's picture

Add config.json

616b8e9 verified about 1 month ago

history blame contribute delete

2.04 kB

	{
	"name": "Lumina-Lite-2.2B Optimized Autoregressive Transformer",
	"layers": [
	{
	"type": "Embedding",
	"params": {
	"num_embeddings": 50272,
	"embedding_dim": 2048
	}
	},
	{
	"type": "TransformerBlock",
	"params": {
	"layers": 28,
	"hidden_size": 2048,
	"num_attention_heads": 16,
	"num_key_value_heads": 4,
	"head_dim": 128,
	"mlp_intermediate_size": 5504,
	"normalization": "RMSNorm",
	"activation": "SwiGLU",
	"positional_encoding": "RoPE (Rotary Positional Embeddings)",
	"attention_type": "Grouped-Query Attention (GQA)"
	}
	},
	{
	"type": "Linear",
	"params": {
	"in_features": 2048,
	"out_features": 5504,
	"description": "SwiGLU Gate and Up Projection"
	}
	},
	{
	"type": "Linear",
	"params": {
	"in_features": 5504,
	"out_features": 2048,
	"description": "MLP Down Projection"
	}
	},
	{
	"type": "RMSNorm",
	"params": {
	"normalized_shape": 2048,
	"eps": 0.00001
	}
	},
	{
	"type": "Linear",
	"params": {
	"in_features": 2048,
	"out_features": 50272,
	"bias": false,
	"description": "Language Modeling Head"
	}
	}
	],
	"explanation": "Lumina-Lite-2.2B utilizes a 'deep-and-narrow' topology to optimize for high throughput within a 6GB VRAM limit. The implementation of Grouped-Query Attention (GQA) reduces the KV cache footprint by a factor of 4 compared to multi-head attention, enabling longer context windows and larger batch sizes on consumer hardware. SwiGLU activation and RMSNorm provide superior gradient flow and training stability over standard GELU/LayerNorm, while RoPE ensures the model maintains awareness of relative token positions without absolute position bias. The architecture is specifically designed to facilitate 4-bit NF4 quantization, making it highly efficient for deployment via bitsandbytes or AutoGPTQ."
	}