config.json CHANGED
@@ -1,4 +1,12 @@
1
  {
 
 
 
 
 
 
 
 
2
  "architectures": [
3
  "IsaacForConditionalGeneration"
4
  ],
@@ -53,20 +61,92 @@
53
  "num_attention_heads": 16,
54
  "num_hidden_layers": 28,
55
  "num_key_value_heads": 8,
 
56
  "pixel_shuffle_scale": 2,
57
  "rms_norm_eps": 1e-06,
58
- "rope_scaling": {
59
- "mrope_interleaved": true,
60
- "mrope_section": null,
61
- "rope_type": "default"
62
- },
63
- "rope_theta": 1000000.0,
64
  "sliding_window": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  "tie_word_embeddings": false,
66
- "transformers_version": "4.56.1",
67
  "use_cache": true,
68
  "use_sliding_window": false,
69
- "video_patch_size": 16,
70
  "vision_config": {
71
  "attention_dropout": 0.0,
72
  "hidden_act": "gelu_pytorch_tanh",
@@ -74,7 +154,7 @@
74
  "image_size": 256,
75
  "intermediate_size": 4304,
76
  "layer_norm_eps": 1e-06,
77
- "model_type": "pixel_shuffle_siglip2",
78
  "num_attention_heads": 16,
79
  "num_channels": 3,
80
  "num_hidden_layers": 27,
@@ -83,7 +163,19 @@
83
  "pixel_shuffle_scale_factor": 2
84
  },
85
  "vision_max_num_patches": 6144,
 
 
 
 
 
86
  "vision_min_num_patches": 256,
 
 
 
 
 
 
 
87
  "vision_token": "<image>",
88
  "vocab_size": 151936
89
  }
 
1
  {
2
+ "_rope_parameters": {
3
+ "rope_theta": 1000000,
4
+ "rope_type": "default"
5
+ },
6
+ "_rope_scaling": {
7
+ "rope_theta": 1000000,
8
+ "rope_type": "default"
9
+ },
10
  "architectures": [
11
  "IsaacForConditionalGeneration"
12
  ],
 
61
  "num_attention_heads": 16,
62
  "num_hidden_layers": 28,
63
  "num_key_value_heads": 8,
64
+ "pad_token_id": null,
65
  "pixel_shuffle_scale": 2,
66
  "rms_norm_eps": 1e-06,
67
+ "rope_theta": 1000000,
 
 
 
 
 
68
  "sliding_window": null,
69
+ "text_config": {
70
+ "_name_or_path": "hf-checkpoint",
71
+ "architectures": [
72
+ "IsaacForConditionalGeneration"
73
+ ],
74
+ "attention_bias": false,
75
+ "attention_dropout": 0.0,
76
+ "bos_token_id": 151643,
77
+ "dtype": "float32",
78
+ "eos_token_id": 151645,
79
+ "head_dim": 128,
80
+ "hidden_act": "silu",
81
+ "hidden_size": 2048,
82
+ "initializer_range": 0.02,
83
+ "intermediate_size": 6144,
84
+ "layer_types": [
85
+ "full_attention",
86
+ "full_attention",
87
+ "full_attention",
88
+ "full_attention",
89
+ "full_attention",
90
+ "full_attention",
91
+ "full_attention",
92
+ "full_attention",
93
+ "full_attention",
94
+ "full_attention",
95
+ "full_attention",
96
+ "full_attention",
97
+ "full_attention",
98
+ "full_attention",
99
+ "full_attention",
100
+ "full_attention",
101
+ "full_attention",
102
+ "full_attention",
103
+ "full_attention",
104
+ "full_attention",
105
+ "full_attention",
106
+ "full_attention",
107
+ "full_attention",
108
+ "full_attention",
109
+ "full_attention",
110
+ "full_attention",
111
+ "full_attention",
112
+ "full_attention"
113
+ ],
114
+ "max_position_embeddings": 40960,
115
+ "max_window_layers": 28,
116
+ "model_type": "qwen3",
117
+ "num_attention_heads": 16,
118
+ "num_hidden_layers": 28,
119
+ "num_key_value_heads": 8,
120
+ "pad_token_id": null,
121
+ "pixel_shuffle_scale": 2,
122
+ "rms_norm_eps": 1e-06,
123
+ "rope_parameters": {
124
+ "rope_theta": 1000000,
125
+ "rope_type": "default"
126
+ },
127
+ "sliding_window": null,
128
+ "tie_word_embeddings": false,
129
+ "use_cache": true,
130
+ "use_sliding_window": false,
131
+ "vision_max_num_patches": 6144,
132
+ "vision_mean": [
133
+ 0.5,
134
+ 0.5,
135
+ 0.5
136
+ ],
137
+ "vision_min_num_patches": 256,
138
+ "vision_patch_size": 16,
139
+ "vision_std": [
140
+ 0.5,
141
+ 0.5,
142
+ 0.5
143
+ ],
144
+ "vocab_size": 151936
145
+ },
146
  "tie_word_embeddings": false,
147
+ "transformers_version": "5.0.0",
148
  "use_cache": true,
149
  "use_sliding_window": false,
 
150
  "vision_config": {
151
  "attention_dropout": 0.0,
152
  "hidden_act": "gelu_pytorch_tanh",
 
154
  "image_size": 256,
155
  "intermediate_size": 4304,
156
  "layer_norm_eps": 1e-06,
157
+ "model_type": "isaac_vision",
158
  "num_attention_heads": 16,
159
  "num_channels": 3,
160
  "num_hidden_layers": 27,
 
163
  "pixel_shuffle_scale_factor": 2
164
  },
165
  "vision_max_num_patches": 6144,
166
+ "vision_mean": [
167
+ 0.5,
168
+ 0.5,
169
+ 0.5
170
+ ],
171
  "vision_min_num_patches": 256,
172
+ "vision_patch_size": 16,
173
+ "vision_rescale_factor": 0.00392156862745098,
174
+ "vision_std": [
175
+ 0.5,
176
+ 0.5,
177
+ 0.5
178
+ ],
179
  "vision_token": "<image>",
180
  "vocab_size": 151936
181
  }
model-00002-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e133442cabfd18ed5ba13cd21527d0220c78e2989a2778b8849e5835e0995c75
3
- size 4054187824
 
 
 
 
model-00003-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d48fec96ee25327332beee7dbd72e4d82a20d8e2c3e7135fcd6ce3bb9229862
3
- size 1244659840
 
 
 
 
model-00001-of-00003.safetensors → model.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d31217bf5365162ae38b4e6a5b27acff8481ef892e9803874cbb49476d0f501
3
- size 4969539560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:995183b426ff163896c4c11b7dc1683233916909d6ca809b39520a2d4c03ead7
3
+ size 10268388224
modular_isaac.py CHANGED
The diff for this file is too large to render. See raw diff
 
processor_config.json CHANGED
@@ -2,208 +2,39 @@
2
  "auto_map": {
3
  "AutoProcessor": "modular_isaac.IsaacProcessor"
4
  },
5
- "config": {
6
- "_name_or_path": "",
7
- "add_cross_attention": false,
8
- "architectures": [
9
- "IsaacForConditionalGeneration"
10
- ],
11
- "attention_bias": false,
12
- "attention_dropout": 0.0,
13
  "auto_map": {
14
- "AutoModelForCausalLM": "modular_isaac.IsaacForConditionalGeneration"
15
- },
16
- "bad_words_ids": null,
17
- "begin_suppress_tokens": null,
18
- "bos_token_id": 151643,
19
- "chunk_size_feed_forward": 0,
20
- "cross_attention_hidden_size": null,
21
- "decoder_start_token_id": null,
22
- "diversity_penalty": 0.0,
23
- "do_sample": false,
24
- "dtype": "float32",
25
- "early_stopping": false,
26
- "encoder_no_repeat_ngram_size": 0,
27
- "eos_token_id": 151645,
28
- "exponential_decay_length_penalty": null,
29
- "finetuning_task": null,
30
- "forced_bos_token_id": null,
31
- "forced_eos_token_id": null,
32
- "head_dim": 128,
33
- "hidden_act": "silu",
34
- "hidden_size": 2048,
35
- "id2label": {
36
- "0": "LABEL_0",
37
- "1": "LABEL_1"
38
- },
39
- "initializer_range": 0.02,
40
- "intermediate_size": 6144,
41
- "is_decoder": false,
42
- "is_encoder_decoder": false,
43
- "label2id": {
44
- "LABEL_0": 0,
45
- "LABEL_1": 1
46
  },
47
- "layer_types": [
48
- "full_attention",
49
- "full_attention",
50
- "full_attention",
51
- "full_attention",
52
- "full_attention",
53
- "full_attention",
54
- "full_attention",
55
- "full_attention",
56
- "full_attention",
57
- "full_attention",
58
- "full_attention",
59
- "full_attention",
60
- "full_attention",
61
- "full_attention",
62
- "full_attention",
63
- "full_attention",
64
- "full_attention",
65
- "full_attention",
66
- "full_attention",
67
- "full_attention",
68
- "full_attention",
69
- "full_attention",
70
- "full_attention",
71
- "full_attention",
72
- "full_attention",
73
- "full_attention",
74
- "full_attention",
75
- "full_attention"
76
  ],
77
- "length_penalty": 1.0,
78
- "max_length": 20,
79
- "max_position_embeddings": 40960,
80
- "max_sequence_length": 16384,
81
- "max_window_layers": 28,
82
- "min_length": 0,
83
- "model_type": "isaac",
84
- "no_repeat_ngram_size": 0,
85
- "num_attention_heads": 16,
86
- "num_beam_groups": 1,
87
- "num_beams": 1,
88
- "num_hidden_layers": 28,
89
- "num_key_value_heads": 8,
90
- "num_return_sequences": 1,
91
- "output_attentions": false,
92
- "output_hidden_states": false,
93
- "output_scores": false,
94
- "pad_token_id": null,
95
  "pixel_shuffle_scale": 2,
96
- "prefix": null,
97
- "problem_type": null,
98
- "pruned_heads": {},
99
- "remove_invalid_values": false,
100
- "repetition_penalty": 1.0,
101
- "return_dict": true,
102
- "return_dict_in_generate": false,
103
- "rms_norm_eps": 1e-06,
104
- "rope_scaling": {
105
- "mrope_interleaved": true,
106
- "mrope_section": null,
107
- "rope_type": "default"
108
- },
109
- "rope_theta": 1000000.0,
110
- "sep_token_id": null,
111
- "sliding_window": null,
112
- "suppress_tokens": null,
113
- "task_specific_params": null,
114
- "temperature": 1.0,
115
- "tf_legacy_loss": false,
116
- "tie_encoder_decoder": false,
117
- "tie_word_embeddings": false,
118
- "tokenizer_class": null,
119
- "top_k": 50,
120
- "top_p": 1.0,
121
- "torchscript": false,
122
- "transformers_version": "4.56.1",
123
- "typical_p": 1.0,
124
- "use_bfloat16": false,
125
- "use_cache": true,
126
- "use_sliding_window": false,
127
- "video_patch_size": 16,
128
- "vision_config": {
129
- "_name_or_path": "",
130
- "add_cross_attention": false,
131
- "architectures": null,
132
- "attention_dropout": 0.0,
133
- "bad_words_ids": null,
134
- "begin_suppress_tokens": null,
135
- "bos_token_id": null,
136
- "chunk_size_feed_forward": 0,
137
- "cross_attention_hidden_size": null,
138
- "decoder_start_token_id": null,
139
- "diversity_penalty": 0.0,
140
- "do_sample": false,
141
- "dtype": null,
142
- "early_stopping": false,
143
- "encoder_no_repeat_ngram_size": 0,
144
- "eos_token_id": null,
145
- "exponential_decay_length_penalty": null,
146
- "finetuning_task": null,
147
- "forced_bos_token_id": null,
148
- "forced_eos_token_id": null,
149
- "hidden_act": "gelu_pytorch_tanh",
150
- "hidden_size": 1152,
151
- "id2label": {
152
- "0": "LABEL_0",
153
- "1": "LABEL_1"
154
- },
155
- "image_size": 256,
156
- "intermediate_size": 4304,
157
- "is_decoder": false,
158
- "is_encoder_decoder": false,
159
- "label2id": {
160
- "LABEL_0": 0,
161
- "LABEL_1": 1
162
- },
163
- "layer_norm_eps": 1e-06,
164
- "length_penalty": 1.0,
165
- "max_length": 20,
166
- "min_length": 0,
167
- "model_type": "pixel_shuffle_siglip2",
168
- "no_repeat_ngram_size": 0,
169
- "num_attention_heads": 16,
170
- "num_beam_groups": 1,
171
- "num_beams": 1,
172
- "num_channels": 3,
173
- "num_hidden_layers": 27,
174
- "num_patches": 256,
175
- "num_return_sequences": 1,
176
- "output_attentions": false,
177
- "output_hidden_states": false,
178
- "output_scores": false,
179
- "pad_token_id": null,
180
- "patch_size": 16,
181
- "pixel_shuffle_scale_factor": 2,
182
- "prefix": null,
183
- "problem_type": null,
184
- "pruned_heads": {},
185
- "remove_invalid_values": false,
186
- "repetition_penalty": 1.0,
187
- "return_dict": true,
188
- "return_dict_in_generate": false,
189
- "sep_token_id": null,
190
- "suppress_tokens": null,
191
- "task_specific_params": null,
192
- "temperature": 1.0,
193
- "tf_legacy_loss": false,
194
- "tie_encoder_decoder": false,
195
- "tie_word_embeddings": true,
196
- "tokenizer_class": null,
197
- "top_k": 50,
198
- "top_p": 1.0,
199
- "torchscript": false,
200
- "typical_p": 1.0,
201
- "use_bfloat16": false
202
- },
203
- "vision_max_num_patches": 6144,
204
- "vision_min_num_patches": 256,
205
- "vision_token": "<image>",
206
- "vocab_size": 151936
207
  },
208
- "processor_class": "IsaacProcessor"
209
- }
 
 
 
2
  "auto_map": {
3
  "AutoProcessor": "modular_isaac.IsaacProcessor"
4
  },
5
+ "config": null,
6
+ "image_processor": {
 
 
 
 
 
 
7
  "auto_map": {
8
+ "AutoProcessor": "modular_isaac.IsaacProcessor",
9
+ "AutoImageProcessor": "modular_isaac.IsaacImageProcessorFast"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  },
11
+ "data_format": "channels_first",
12
+ "disable_grouping": false,
13
+ "do_center_crop": false,
14
+ "do_convert_rgb": true,
15
+ "do_normalize": true,
16
+ "do_pad": false,
17
+ "do_rescale": true,
18
+ "do_resize": true,
19
+ "image_mean": [
20
+ 0.5,
21
+ 0.5,
22
+ 0.5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  ],
24
+ "image_processor_type": "IsaacImageProcessorFast",
25
+ "image_std": [
26
+ 0.5,
27
+ 0.5,
28
+ 0.5
29
+ ],
30
+ "max_num_patches": 6144,
31
+ "min_num_patches": 256,
32
+ "patch_size": 16,
 
 
 
 
 
 
 
 
 
33
  "pixel_shuffle_scale": 2,
34
+ "resample": 2,
35
+ "rescale_factor": 0.00392156862745098
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  },
37
+ "max_sequence_length": 16384,
38
+ "processor_class": "IsaacProcessor",
39
+ "vision_token": "<image>"
40
+ }