ZTYikes committed on
Commit
c2ce384
·
verified ·
1 Parent(s): 4efb46c

Upload nanoVLM using push_to_hub

Browse files
Files changed (2) hide show
  1. config.json +45 -9
  2. model.safetensors +2 -2
config.json CHANGED
@@ -2,34 +2,34 @@
2
  "vit_hidden_dim": 768,
3
  "vit_inter_dim": 3072,
4
  "vit_patch_size": 16,
5
- "vit_img_size": 512,
6
  "vit_n_heads": 12,
7
  "vit_dropout": 0.0,
8
  "vit_n_blocks": 12,
9
  "vit_ln_eps": 1e-06,
10
  "vit_cls_flag": false,
11
- "vit_model_type": "google/siglip2-base-patch16-512",
12
  "lm_hidden_dim": 960,
13
  "lm_inter_dim": 2560,
14
  "lm_rms_eps": 1e-05,
15
  "lm_re_base": 100000,
16
  "lm_max_position_embeddings": 8192,
17
  "lm_base_vocab_size": 49152,
18
- "extra_token_amount": 66,
19
- "lm_vocab_size": 49218,
20
  "lm_n_heads": 15,
21
  "lm_n_kv_heads": 5,
22
  "lm_dropout": 0.0,
23
  "lm_n_blocks": 32,
24
  "lm_attn_scaling": 1.0,
25
- "lm_max_length": 4096,
26
  "lm_use_tokens": false,
27
  "lm_tie_weights": true,
28
  "lm_model_type": "HuggingFaceTB/SmolLM2-360M-Instruct",
29
  "lm_tokenizer": "HuggingFaceTB/SmolLM2-360M-Instruct",
30
  "lm_chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
31
- "mp_pixel_shuffle_factor": 4,
32
- "mp_image_token_length": 64,
33
  "max_img_size": 2048,
34
  "resize_to_max_side_len": true,
35
  "vlm_extra_tokens": {
@@ -43,6 +43,8 @@
43
  "r1c6": "<row_1_col_6>",
44
  "r1c7": "<row_1_col_7>",
45
  "r1c8": "<row_1_col_8>",
 
 
46
  "r2c1": "<row_2_col_1>",
47
  "r2c2": "<row_2_col_2>",
48
  "r2c3": "<row_2_col_3>",
@@ -51,6 +53,8 @@
51
  "r2c6": "<row_2_col_6>",
52
  "r2c7": "<row_2_col_7>",
53
  "r2c8": "<row_2_col_8>",
 
 
54
  "r3c1": "<row_3_col_1>",
55
  "r3c2": "<row_3_col_2>",
56
  "r3c3": "<row_3_col_3>",
@@ -59,6 +63,8 @@
59
  "r3c6": "<row_3_col_6>",
60
  "r3c7": "<row_3_col_7>",
61
  "r3c8": "<row_3_col_8>",
 
 
62
  "r4c1": "<row_4_col_1>",
63
  "r4c2": "<row_4_col_2>",
64
  "r4c3": "<row_4_col_3>",
@@ -67,6 +73,8 @@
67
  "r4c6": "<row_4_col_6>",
68
  "r4c7": "<row_4_col_7>",
69
  "r4c8": "<row_4_col_8>",
 
 
70
  "r5c1": "<row_5_col_1>",
71
  "r5c2": "<row_5_col_2>",
72
  "r5c3": "<row_5_col_3>",
@@ -75,6 +83,8 @@
75
  "r5c6": "<row_5_col_6>",
76
  "r5c7": "<row_5_col_7>",
77
  "r5c8": "<row_5_col_8>",
 
 
78
  "r6c1": "<row_6_col_1>",
79
  "r6c2": "<row_6_col_2>",
80
  "r6c3": "<row_6_col_3>",
@@ -83,6 +93,8 @@
83
  "r6c6": "<row_6_col_6>",
84
  "r6c7": "<row_6_col_7>",
85
  "r6c8": "<row_6_col_8>",
 
 
86
  "r7c1": "<row_7_col_1>",
87
  "r7c2": "<row_7_col_2>",
88
  "r7c3": "<row_7_col_3>",
@@ -91,6 +103,8 @@
91
  "r7c6": "<row_7_col_6>",
92
  "r7c7": "<row_7_col_7>",
93
  "r7c8": "<row_7_col_8>",
 
 
94
  "r8c1": "<row_8_col_1>",
95
  "r8c2": "<row_8_col_2>",
96
  "r8c3": "<row_8_col_3>",
@@ -98,9 +112,31 @@
98
  "r8c5": "<row_8_col_5>",
99
  "r8c6": "<row_8_col_6>",
100
  "r8c7": "<row_8_col_7>",
101
- "r8c8": "<row_8_col_8>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  },
103
  "vlm_load_backbone_weights": true,
104
- "vlm_checkpoint_path": "checkpoints",
105
  "hf_repo_name": "nanoVLM"
106
  }
 
2
  "vit_hidden_dim": 768,
3
  "vit_inter_dim": 3072,
4
  "vit_patch_size": 16,
5
+ "vit_img_size": 224,
6
  "vit_n_heads": 12,
7
  "vit_dropout": 0.0,
8
  "vit_n_blocks": 12,
9
  "vit_ln_eps": 1e-06,
10
  "vit_cls_flag": false,
11
+ "vit_model_type": "google/siglip2-base-patch16-224",
12
  "lm_hidden_dim": 960,
13
  "lm_inter_dim": 2560,
14
  "lm_rms_eps": 1e-05,
15
  "lm_re_base": 100000,
16
  "lm_max_position_embeddings": 8192,
17
  "lm_base_vocab_size": 49152,
18
+ "extra_token_amount": 102,
19
+ "lm_vocab_size": 49254,
20
  "lm_n_heads": 15,
21
  "lm_n_kv_heads": 5,
22
  "lm_dropout": 0.0,
23
  "lm_n_blocks": 32,
24
  "lm_attn_scaling": 1.0,
25
+ "lm_max_length": 2048,
26
  "lm_use_tokens": false,
27
  "lm_tie_weights": true,
28
  "lm_model_type": "HuggingFaceTB/SmolLM2-360M-Instruct",
29
  "lm_tokenizer": "HuggingFaceTB/SmolLM2-360M-Instruct",
30
  "lm_chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
31
+ "mp_pixel_shuffle_factor": 2,
32
+ "mp_image_token_length": 49,
33
  "max_img_size": 2048,
34
  "resize_to_max_side_len": true,
35
  "vlm_extra_tokens": {
 
43
  "r1c6": "<row_1_col_6>",
44
  "r1c7": "<row_1_col_7>",
45
  "r1c8": "<row_1_col_8>",
46
+ "r1c9": "<row_1_col_9>",
47
+ "r1c10": "<row_1_col_10>",
48
  "r2c1": "<row_2_col_1>",
49
  "r2c2": "<row_2_col_2>",
50
  "r2c3": "<row_2_col_3>",
 
53
  "r2c6": "<row_2_col_6>",
54
  "r2c7": "<row_2_col_7>",
55
  "r2c8": "<row_2_col_8>",
56
+ "r2c9": "<row_2_col_9>",
57
+ "r2c10": "<row_2_col_10>",
58
  "r3c1": "<row_3_col_1>",
59
  "r3c2": "<row_3_col_2>",
60
  "r3c3": "<row_3_col_3>",
 
63
  "r3c6": "<row_3_col_6>",
64
  "r3c7": "<row_3_col_7>",
65
  "r3c8": "<row_3_col_8>",
66
+ "r3c9": "<row_3_col_9>",
67
+ "r3c10": "<row_3_col_10>",
68
  "r4c1": "<row_4_col_1>",
69
  "r4c2": "<row_4_col_2>",
70
  "r4c3": "<row_4_col_3>",
 
73
  "r4c6": "<row_4_col_6>",
74
  "r4c7": "<row_4_col_7>",
75
  "r4c8": "<row_4_col_8>",
76
+ "r4c9": "<row_4_col_9>",
77
+ "r4c10": "<row_4_col_10>",
78
  "r5c1": "<row_5_col_1>",
79
  "r5c2": "<row_5_col_2>",
80
  "r5c3": "<row_5_col_3>",
 
83
  "r5c6": "<row_5_col_6>",
84
  "r5c7": "<row_5_col_7>",
85
  "r5c8": "<row_5_col_8>",
86
+ "r5c9": "<row_5_col_9>",
87
+ "r5c10": "<row_5_col_10>",
88
  "r6c1": "<row_6_col_1>",
89
  "r6c2": "<row_6_col_2>",
90
  "r6c3": "<row_6_col_3>",
 
93
  "r6c6": "<row_6_col_6>",
94
  "r6c7": "<row_6_col_7>",
95
  "r6c8": "<row_6_col_8>",
96
+ "r6c9": "<row_6_col_9>",
97
+ "r6c10": "<row_6_col_10>",
98
  "r7c1": "<row_7_col_1>",
99
  "r7c2": "<row_7_col_2>",
100
  "r7c3": "<row_7_col_3>",
 
103
  "r7c6": "<row_7_col_6>",
104
  "r7c7": "<row_7_col_7>",
105
  "r7c8": "<row_7_col_8>",
106
+ "r7c9": "<row_7_col_9>",
107
+ "r7c10": "<row_7_col_10>",
108
  "r8c1": "<row_8_col_1>",
109
  "r8c2": "<row_8_col_2>",
110
  "r8c3": "<row_8_col_3>",
 
112
  "r8c5": "<row_8_col_5>",
113
  "r8c6": "<row_8_col_6>",
114
  "r8c7": "<row_8_col_7>",
115
+ "r8c8": "<row_8_col_8>",
116
+ "r8c9": "<row_8_col_9>",
117
+ "r8c10": "<row_8_col_10>",
118
+ "r9c1": "<row_9_col_1>",
119
+ "r9c2": "<row_9_col_2>",
120
+ "r9c3": "<row_9_col_3>",
121
+ "r9c4": "<row_9_col_4>",
122
+ "r9c5": "<row_9_col_5>",
123
+ "r9c6": "<row_9_col_6>",
124
+ "r9c7": "<row_9_col_7>",
125
+ "r9c8": "<row_9_col_8>",
126
+ "r9c9": "<row_9_col_9>",
127
+ "r9c10": "<row_9_col_10>",
128
+ "r10c1": "<row_10_col_1>",
129
+ "r10c2": "<row_10_col_2>",
130
+ "r10c3": "<row_10_col_3>",
131
+ "r10c4": "<row_10_col_4>",
132
+ "r10c5": "<row_10_col_5>",
133
+ "r10c6": "<row_10_col_6>",
134
+ "r10c7": "<row_10_col_7>",
135
+ "r10c8": "<row_10_col_8>",
136
+ "r10c9": "<row_10_col_9>",
137
+ "r10c10": "<row_10_col_10>"
138
  },
139
  "vlm_load_backbone_weights": true,
140
+ "vlm_checkpoint_path": "./checkpoints",
141
  "hf_repo_name": "nanoVLM"
142
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c7908b2ebefa2e7d5911b2241c4354b82b1fda9978e35a7b2bfe96bec1592f6f
3
- size 1840504504
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41da8139fed05f1eccf1f7538a6bbef21aa0045f3285c3047489f272ac9b6a92
3
+ size 1802709648