Spaces:

jingyaogong
/

MiniMind-V

Running

jingyaogong committed on Apr 20

Commit

86cffcc

verified ·

1 Parent(s): 2860e7c

Upload 9 files

Files changed (3) hide show

minimind-3v/config.json CHANGED Viewed

@@ -38,5 +38,6 @@
   "router_aux_loss_coef": 0.0005,
   "transformers_version": "4.57.6",
   "use_moe": false,
-  "vocab_size": 6400
-}

   "router_aux_loss_coef": 0.0005,
   "transformers_version": "4.57.6",
   "use_moe": false,
+  "vocab_size": 6400,
+  "tie_word_embeddings": true
+}

minimind-3v/model_vlm.py CHANGED Viewed

@@ -21,28 +21,22 @@ class VLMConfig(MiniMindConfig):
         super().__init__(**kwargs)
 class MMVisionProjector(nn.Module):
-    def __init__(self, in_dim, out_dim, source_tokens=256, target_tokens=64):
         super().__init__()
-        self.target_tokens = target_tokens
-        self.merge = source_tokens // target_tokens
         self.mlp = nn.Sequential(
-            nn.LayerNorm(in_dim * self.merge),
-            nn.Linear(in_dim * self.merge, out_dim),
             nn.GELU(),
             nn.Linear(out_dim, out_dim),
         )
     def forward(self, x):
-        b, n, d = x.shape
-        side = int(n ** 0.5)
-        s = int(self.merge ** 0.5)
-        x = x.view(b, side // s, s, side // s, s, d).permute(0, 1, 3, 2, 4, 5).reshape(b, self.target_tokens, d * self.merge)
         return self.mlp(x)
 # 继承自语言模型
 class MiniMindVLM(MiniMindForCausalLM):
     config_class = VLMConfig
-    def __init__(self, config: VLMConfig = None, vision_model_path="./model/siglip2-base-p16-256-ve"):
         self.config = config or VLMConfig()
         super().__init__(self.config)
         self.vision_encoder, self.processor = self.__class__.get_vision_model(vision_model_path)

         super().__init__(**kwargs)
 class MMVisionProjector(nn.Module):
+    def __init__(self, in_dim, out_dim, source_tokens=64, target_tokens=64):
         super().__init__()
         self.mlp = nn.Sequential(
+            nn.LayerNorm(in_dim),
+            nn.Linear(in_dim, out_dim),
             nn.GELU(),
             nn.Linear(out_dim, out_dim),
         )
     def forward(self, x):
         return self.mlp(x)
 # 继承自语言模型
 class MiniMindVLM(MiniMindForCausalLM):
     config_class = VLMConfig
+    def __init__(self, config: VLMConfig = None, vision_model_path="./model/siglip2-base-p32-256-ve"):
         self.config = config or VLMConfig()
         super().__init__(self.config)
         self.vision_encoder, self.processor = self.__class__.get_vision_model(vision_model_path)

minimind-3v/pytorch_model.bin ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:baf6028706a526a5430de8ebf0c243a95ab8b318d668d5f4d91c1f5a1e55a760
+size 130220750