Add model.py
Browse files
model.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
|
| 4 |
+
class HFMT8HighFidelityMultimodalTransformer(nn.Module):
|
| 5 |
+
def __init__(self):
|
| 6 |
+
super().__init__()
|
| 7 |
+
self.layers = nn.Sequential(
|
| 8 |
+
nn.Conv2d(**{"in_channels":3,"out_channels":1152,"kernel_size":14,"stride":14,"note":"SigLIP-style Patch Embedding for high-resolution input"}),
|
| 9 |
+
nn.TransformerBlock(**{"hidden_size":1152,"num_heads":16,"mlp_ratio":4,"activation":"GELU","note":"SigLIP SO400M Vision Encoder Backbone"}),
|
| 10 |
+
nn.Conv2d(**{"in_channels":1152,"out_channels":1152,"kernel_size":2,"stride":2,"note":"Adaptive Patch-Merging for 50% Visual Token Reduction"}),
|
| 11 |
+
nn.Linear(**{"in_features":1152,"out_features":4096,"note":"Cross-Modal Projection Bridge to LLM Latent Space"}),
|
| 12 |
+
nn.TransformerBlock(**{"hidden_size":4096,"num_attention_heads":32,"num_key_value_groups":8,"attention_type":"Grouped-Query Attention (GQA)","positional_encoding":"RoPE (Rotary)","note":"Llama-3 Decoder Block with 4-bit NF4 Quantization Support"}),
|
| 13 |
+
nn.Linear(**{"in_features":4096,"out_features":14336,"activation":"SwiGLU","note":"Gated Linear Unit for Enhanced Representational Capacity"}),
|
| 14 |
+
nn.RMSNorm(**{"normalized_shape":4096,"eps":0.00001,"note":"Pre-block Normalization for Numerical Stability"}),
|
| 15 |
+
nn.Linear(**{"in_features":4096,"out_features":128256,"bias":false,"note":"Language Modeling Head (Uncensored Configuration)"})
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
def forward(self, x):
|
| 19 |
+
return self.layers(x)
|