itriedcoding committed on
Commit
00617eb
·
verified ·
1 Parent(s): d94065f

Add config.json

Browse files
Files changed (1) hide show
  1. config.json +81 -0
config.json ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "HFMT-8 (High-Fidelity Multimodal Transformer)",
3
+ "layers": [
4
+ {
5
+ "type": "Conv2d",
6
+ "params": {
7
+ "in_channels": 3,
8
+ "out_channels": 1152,
9
+ "kernel_size": 14,
10
+ "stride": 14,
11
+ "note": "SigLIP-style Patch Embedding for high-resolution input"
12
+ }
13
+ },
14
+ {
15
+ "type": "TransformerBlock",
16
+ "params": {
17
+ "hidden_size": 1152,
18
+ "num_heads": 16,
19
+ "mlp_ratio": 4,
20
+ "activation": "GELU",
21
+ "note": "SigLIP SO400M Vision Encoder Backbone"
22
+ }
23
+ },
24
+ {
25
+ "type": "Conv2d",
26
+ "params": {
27
+ "in_channels": 1152,
28
+ "out_channels": 1152,
29
+ "kernel_size": 2,
30
+ "stride": 2,
31
+ "note": "Adaptive Patch-Merging (2x2, stride 2) for 75% Visual Token Reduction"
32
+ }
33
+ },
34
+ {
35
+ "type": "Linear",
36
+ "params": {
37
+ "in_features": 1152,
38
+ "out_features": 4096,
39
+ "note": "Cross-Modal Projection Bridge to LLM Latent Space"
40
+ }
41
+ },
42
+ {
43
+ "type": "TransformerBlock",
44
+ "params": {
45
+ "hidden_size": 4096,
46
+ "num_attention_heads": 32,
47
+ "num_key_value_groups": 8,
48
+ "attention_type": "Grouped-Query Attention (GQA)",
49
+ "positional_encoding": "RoPE (Rotary)",
50
+ "note": "Llama-3 Decoder Block with 4-bit NF4 Quantization Support"
51
+ }
52
+ },
53
+ {
54
+ "type": "Linear",
55
+ "params": {
56
+ "in_features": 4096,
57
+ "out_features": 14336,
58
+ "activation": "SwiGLU",
59
+ "note": "Gated Linear Unit for Enhanced Representational Capacity"
60
+ }
61
+ },
62
+ {
63
+ "type": "RMSNorm",
64
+ "params": {
65
+ "normalized_shape": 4096,
66
+ "eps": 0.00001,
67
+ "note": "Pre-block Normalization for Numerical Stability"
68
+ }
69
+ },
70
+ {
71
+ "type": "Linear",
72
+ "params": {
73
+ "in_features": 4096,
74
+ "out_features": 128256,
75
+ "bias": false,
76
+ "note": "Language Modeling Head (Uncensored Configuration)"
77
+ }
78
+ }
79
+ ],
80
+ "explanation": "The HFMT-8 architecture is designed to balance multimodal reasoning with extreme memory efficiency for 8GB VRAM environments. By utilizing SigLIP for vision, we achieve better zero-shot alignment than CLIP with fewer parameters. The 'C-Abstractor' via patch-merging reduces visual tokens significantly, preventing KV-cache explosion during multimodal tasks. The LLM backbone utilizes Grouped-Query Attention (GQA), in which 32 query heads share 8 key/value groups, to reduce the KV-cache memory footprint of the attention mechanism by a factor of 4, and the transition to 4-bit NF4 quantization ensures the weights of the 8B-parameter model fit comfortably within 4.5GB, leaving ample room for the visual buffer and context window."
81
+ }