{
  "name": "HFMT-8 (High-Fidelity Multimodal Transformer)",
  "layers": [
    {
      "type": "Conv2d",
      "params": {
        "in_channels": 3,
        "out_channels": 1152,
        "kernel_size": 14,
        "stride": 14,
        "note": "SigLIP-style Patch Embedding for high-resolution input"
      }
    },
    {
      "type": "TransformerBlock",
      "params": {
        "hidden_size": 1152,
        "num_heads": 16,
        "mlp_ratio": 4,
        "activation": "GELU",
        "note": "SigLIP SO400M Vision Encoder Backbone"
      }
    },
    {
      "type": "Conv2d",
      "params": {
        "in_channels": 1152,
        "out_channels": 1152,
        "kernel_size": 2,
        "stride": 2,
| "note": "Adaptive Patch-Merging for 50% Visual Token Reduction" |
      }
    },
    {
      "type": "Linear",
      "params": {
        "in_features": 1152,
        "out_features": 4096,
        "note": "Cross-Modal Projection Bridge to LLM Latent Space"
      }
    },
    {
      "type": "TransformerBlock",
      "params": {
        "hidden_size": 4096,
        "num_attention_heads": 32,
| "num_key_value_groups": 8, |
| "attention_type": "Grouped-Query Attention (GQA)", |
| "positional_encoding": "RoPE (Rotary)", |
| "note": "Llama-3 Decoder Block with 4-bit NF4 Quantization Support" |
| } |
| }, |
| { |
| "type": "Linear", |
| "params": { |
| "in_features": 4096, |
| "out_features": 14336, |
| "activation": "SwiGLU", |
| "note": "Gated Linear Unit for Enhanced Representational Capacity" |
| } |
| }, |
| { |
| "type": "RMSNorm", |
| "params": { |
| "normalized_shape": 4096, |
| "eps": 0.00001, |
| "note": "Pre-block Normalization for Numerical Stability" |
| } |
| }, |
| { |
| "type": "Linear", |
| "params": { |
| "in_features": 4096, |
| "out_features": 128256, |
| "bias": false, |
| "note": "Language Modeling Head (Uncensored Configuration)" |
      }
    }
  ],
| "explanation": "The HFMT-8 architecture is designed to balance multimodal reasoning with extreme memory efficiency for 8GB VRAM environments. By utilizing SigLIP for vision, we achieve better zero-shot alignment than CLIP with fewer parameters. The 'C-Abstractor' via patch-merging reduces visual tokens significantly, preventing KV-cache explosion during multimodal tasks. The LLM backbone utilizes Grouped-Query Attention (GQA) to minimize the memory footprint of the attention mechanism by a factor of 4, and the transition to 4-bit NF4 quantization ensures the 8B parameter model fits comfortably within 4.5GB, leaving ample room for the visual buffer and context window." |
}
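
As a rough illustration of how the vision side of this configuration could be wired up, here is a minimal PyTorch sketch. It is not part of the spec: the `VisionTower` class name is invented, the single `nn.TransformerEncoderLayer` stands in for the full SigLIP SO400M encoder stack, and the 378x378 input resolution is only an example.

```python
import torch
import torch.nn as nn

class VisionTower(nn.Module):
    """Sketch of the HFMT-8 vision path: SigLIP-style patch embedding,
    one stand-in encoder block, 2x2 patch merging, and the projector."""

    def __init__(self, hidden=1152, llm_hidden=4096, heads=16, mlp_ratio=4):
        super().__init__()
        # 14x14 patch embedding: 3-channel image -> 1152-dim patch tokens
        self.patch_embed = nn.Conv2d(3, hidden, kernel_size=14, stride=14)
        # Stand-in for the SigLIP encoder stack (the real model has many such blocks)
        self.encoder = nn.TransformerEncoderLayer(
            d_model=hidden, nhead=heads,
            dim_feedforward=hidden * mlp_ratio,
            activation="gelu", batch_first=True, norm_first=True,
        )
        # 2x2 strided conv merges each 2x2 patch neighbourhood -> 4x fewer tokens
        self.patch_merger = nn.Conv2d(hidden, hidden, kernel_size=2, stride=2)
        # Cross-modal projection into the LLM's 4096-dim latent space
        self.projector = nn.Linear(hidden, llm_hidden)

    def forward(self, images: torch.Tensor) -> torch.Tensor:
        x = self.patch_embed(images)                # (B, 1152, H/14, W/14)
        b, c, h, w = x.shape
        tokens = x.flatten(2).transpose(1, 2)       # (B, N, 1152)
        tokens = self.encoder(tokens)               # encoder block(s)
        grid = tokens.transpose(1, 2).reshape(b, c, h, w)
        merged = self.patch_merger(grid)            # (B, 1152, H/28, W/28)
        merged = merged.flatten(2).transpose(1, 2)  # (B, N/4, 1152)
        return self.projector(merged)               # (B, N/4, 4096)

# Example: a 378x378 image -> 27x27 = 729 patches -> 169 merged tokens
# (27 is odd, so the 2x2 merge floors to a 13x13 grid).
tokens = VisionTower()(torch.randn(1, 3, 378, 378))
print(tokens.shape)  # torch.Size([1, 169, 4096])
```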
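
The memory claims in the explanation can be sanity-checked with back-of-the-envelope arithmetic. The layer count, head dimension, and sequence length below are assumptions chosen to match a Llama-3-8B-shaped decoder, not values taken from the configuration above.

```python
# Rough VRAM budget for the HFMT-8 decoder on an 8 GB card (assumptions noted inline).
GiB = 1024 ** 3

# Weights: ~8B parameters at 4-bit NF4 is ~0.5 bytes/param, plus a small
# overhead for per-block quantization constants (~0.05 bytes/param assumed).
params = 8.0e9
weights_gib = params * (0.5 + 0.05) / GiB            # ~4.1 GiB

# KV cache: 2 (K and V) * layers * kv_heads * head_dim * seq_len * 2 bytes (fp16).
# With GQA only the 8 KV heads are cached, not all 32 query heads -> 4x smaller.
layers, head_dim, seq_len = 32, 128, 8192            # assumed Llama-3-8B-like shapes
kv_heads, query_heads = 8, 32
kv_cache_gib = 2 * layers * kv_heads * head_dim * seq_len * 2 / GiB      # ~1.0 GiB
mha_cache_gib = 2 * layers * query_heads * head_dim * seq_len * 2 / GiB  # ~4.0 GiB

print(f"NF4 weights:         ~{weights_gib:.1f} GiB")
print(f"KV cache (GQA):      ~{kv_cache_gib:.1f} GiB at {seq_len} tokens")
print(f"KV cache (full MHA): ~{mha_cache_gib:.1f} GiB -> GQA saves 4x")
```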
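
If the decoder blocks are realized as a stock Llama-3 checkpoint, the "4-bit NF4 Quantization Support" note maps onto the standard bitsandbytes path in transformers. A sketch follows; the checkpoint name is purely illustrative, and any Llama-3-architecture decoder with GQA and RoPE fits the block described above.

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# NF4 quantization config matching the "4-bit NF4 Quantization Support" note.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",           # NormalFloat4 data type
    bnb_4bit_use_double_quant=True,      # also quantize the quantization constants
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",  # illustrative checkpoint name
    quantization_config=bnb_config,
    device_map="auto",
)
print(f"Quantized footprint: {model.get_memory_footprint() / 1024**3:.1f} GiB")
```

`get_memory_footprint()` reports the size of the quantized weights, which should land close to the weight estimate in the previous sketch.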