ZephyrCode
/

moondream_ashu

Model card Files Files and versions

ZephyrCode committed on Jan 31, 2025

Commit

76d74d2

·

verified ·

1 Parent(s): 5a8c97f

Update vision_encoder.py

Files changed (1) hide show

vision_encoder.py +4 -4

vision_encoder.py CHANGED Viewed

@@ -96,7 +96,7 @@ class VisionTransformer(nn.Module):
         super().__init__()
         embed_len = 729
-        embed_dim = 1152
         self.patch_embed = LinearPatchEmbedding()
         self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * 0.02)
@@ -288,7 +288,7 @@ class VisionEncoder(nn.Module):
         full_img_features = combined_features[: len(im_list)]
         patch_features = (
-            combined_features[len(im_list) :].transpose(1, 2).view(-1, 1152, 27, 27)
         )
         # Reshape patch features back to their original structure
@@ -297,7 +297,7 @@ class VisionEncoder(nn.Module):
         for i, patch_set in enumerate(patches):
             if len(patch_set) == 0:
                 reshaped_patch_features.append(
-                    full_img_features[i].transpose(0, 1).view(1152, 27, 27)
                 )
             else:
                 sample_features = []
@@ -317,7 +317,7 @@ class VisionEncoder(nn.Module):
                 ).squeeze(0)
                 reshaped_patch_features.append(sample_features)
         reshaped_patch_features = (
-            torch.stack(reshaped_patch_features).view(-1, 1152, 729).transpose(1, 2)
         )
         final_features = torch.cat([full_img_features, reshaped_patch_features], dim=2)

         super().__init__()
         embed_len = 729
+        embed_dim = 4608
         self.patch_embed = LinearPatchEmbedding()
         self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * 0.02)
         full_img_features = combined_features[: len(im_list)]
         patch_features = (
+            combined_features[len(im_list) :].transpose(1, 2).view(-1, 4608, 27, 27)
         )
         # Reshape patch features back to their original structure
         for i, patch_set in enumerate(patches):
             if len(patch_set) == 0:
                 reshaped_patch_features.append(
+                    full_img_features[i].transpose(0, 1).view(4608, 27, 27)
                 )
             else:
                 sample_features = []
                 ).squeeze(0)
                 reshaped_patch_features.append(sample_features)
         reshaped_patch_features = (
+            torch.stack(reshaped_patch_features).view(-1, 4608, 729).transpose(1, 2)
         )
         final_features = torch.cat([full_img_features, reshaped_patch_features], dim=2)