Spaces:
Running
Running
Upload 9 files
Browse files
- minimind-3v/config.json +3 -2
- minimind-3v/model_vlm.py +4 -10
- minimind-3v/pytorch_model.bin +3 -0
minimind-3v/config.json
CHANGED
|
@@ -38,5 +38,6 @@
|
|
| 38 |
"router_aux_loss_coef": 0.0005,
|
| 39 |
"transformers_version": "4.57.6",
|
| 40 |
"use_moe": false,
|
| 41 |
-
"vocab_size": 6400
|
| 42 |
-
|
|
|
|
|
|
| 38 |
"router_aux_loss_coef": 0.0005,
|
| 39 |
"transformers_version": "4.57.6",
|
| 40 |
"use_moe": false,
|
| 41 |
+
"vocab_size": 6400,
|
| 42 |
+
"tie_word_embeddings": true
|
| 43 |
+
}
|
minimind-3v/model_vlm.py
CHANGED
|
@@ -21,28 +21,22 @@ class VLMConfig(MiniMindConfig):
|
|
| 21 |
super().__init__(**kwargs)
|
| 22 |
|
| 23 |
class MMVisionProjector(nn.Module):
|
| 24 |
-
def __init__(self, in_dim, out_dim, source_tokens=
|
| 25 |
super().__init__()
|
| 26 |
-
self.target_tokens = target_tokens
|
| 27 |
-
self.merge = source_tokens // target_tokens
|
| 28 |
self.mlp = nn.Sequential(
|
| 29 |
-
nn.LayerNorm(in_dim
|
| 30 |
-
nn.Linear(in_dim
|
| 31 |
nn.GELU(),
|
| 32 |
nn.Linear(out_dim, out_dim),
|
| 33 |
)
|
| 34 |
def forward(self, x):
|
| 35 |
-
b, n, d = x.shape
|
| 36 |
-
side = int(n ** 0.5)
|
| 37 |
-
s = int(self.merge ** 0.5)
|
| 38 |
-
x = x.view(b, side // s, s, side // s, s, d).permute(0, 1, 3, 2, 4, 5).reshape(b, self.target_tokens, d * self.merge)
|
| 39 |
return self.mlp(x)
|
| 40 |
|
| 41 |
# 继承自语言模型
|
| 42 |
class MiniMindVLM(MiniMindForCausalLM):
|
| 43 |
config_class = VLMConfig
|
| 44 |
|
| 45 |
-
def __init__(self, config: VLMConfig = None, vision_model_path="./model/siglip2-base-
|
| 46 |
self.config = config or VLMConfig()
|
| 47 |
super().__init__(self.config)
|
| 48 |
self.vision_encoder, self.processor = self.__class__.get_vision_model(vision_model_path)
|
|
|
|
| 21 |
super().__init__(**kwargs)
|
| 22 |
|
| 23 |
class MMVisionProjector(nn.Module):
|
| 24 |
+
def __init__(self, in_dim, out_dim, source_tokens=64, target_tokens=64):
|
| 25 |
super().__init__()
|
|
|
|
|
|
|
| 26 |
self.mlp = nn.Sequential(
|
| 27 |
+
nn.LayerNorm(in_dim),
|
| 28 |
+
nn.Linear(in_dim, out_dim),
|
| 29 |
nn.GELU(),
|
| 30 |
nn.Linear(out_dim, out_dim),
|
| 31 |
)
|
| 32 |
def forward(self, x):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
return self.mlp(x)
|
| 34 |
|
| 35 |
# 继承自语言模型
|
| 36 |
class MiniMindVLM(MiniMindForCausalLM):
|
| 37 |
config_class = VLMConfig
|
| 38 |
|
| 39 |
+
def __init__(self, config: VLMConfig = None, vision_model_path="./model/siglip2-base-p32-256-ve"):
|
| 40 |
self.config = config or VLMConfig()
|
| 41 |
super().__init__(self.config)
|
| 42 |
self.vision_encoder, self.processor = self.__class__.get_vision_model(vision_model_path)
|
minimind-3v/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:baf6028706a526a5430de8ebf0c243a95ab8b318d668d5f4d91c1f5a1e55a760
|
| 3 |
+
size 130220750
|