Switch to T5Gemma2Encoder for text encoding

#5
README.md CHANGED
@@ -111,7 +111,7 @@ For the full derivation of why Shared Cross-Attention shares K/V but not Q, and
111
  - CUDA-capable GPU with **30GB+ VRAM** (e.g., A100, H100) — for 24GB GPUs see [Memory-efficient Inference](#-memory-efficient-inference)
112
 
113
  ```bash
114
- pip install "diffusers>=0.35.2" "transformers>=5.0.0" torch accelerate ftfy einops sentencepiece regex Pillow
115
  ```
116
 
117
  ### Text-to-Video (T2V)
@@ -131,7 +131,6 @@ guider = AdaptiveProjectedGuidance(
131
  pipe = DiffusionPipeline.from_pretrained(
132
  "Motif-Technologies/Motif-Video-2B",
133
  custom_pipeline="pipeline_motif_video",
134
- trust_remote_code=True,
135
  torch_dtype=torch.bfloat16,
136
  guider=guider,
137
  )
@@ -165,7 +164,6 @@ guider = AdaptiveProjectedGuidance(
165
  pipe = DiffusionPipeline.from_pretrained(
166
  "Motif-Technologies/Motif-Video-2B",
167
  custom_pipeline="pipeline_motif_video",
168
- trust_remote_code=True,
169
  torch_dtype=torch.bfloat16,
170
  guider=guider,
171
  )
 
111
  - CUDA-capable GPU with **30GB+ VRAM** (e.g., A100, H100) — for 24GB GPUs see [Memory-efficient Inference](#-memory-efficient-inference)
112
 
113
  ```bash
114
+ pip install "diffusers>=0.35.2" "transformers>=5.5.4" torch accelerate ftfy einops sentencepiece regex Pillow imageio imageio-ffmpeg
115
  ```
116
 
117
  ### Text-to-Video (T2V)
 
131
  pipe = DiffusionPipeline.from_pretrained(
132
  "Motif-Technologies/Motif-Video-2B",
133
  custom_pipeline="pipeline_motif_video",
 
134
  torch_dtype=torch.bfloat16,
135
  guider=guider,
136
  )
 
164
  pipe = DiffusionPipeline.from_pretrained(
165
  "Motif-Technologies/Motif-Video-2B",
166
  custom_pipeline="pipeline_motif_video",
 
167
  torch_dtype=torch.bfloat16,
168
  guider=guider,
169
  )
model_index.json CHANGED
@@ -7,7 +7,7 @@
7
  ],
8
  "text_encoder": [
9
  "transformers",
10
- "T5Gemma2Model"
11
  ],
12
  "tokenizer": [
13
  "transformers",
@@ -25,4 +25,4 @@
25
  "transformers",
26
  "SiglipImageProcessor"
27
  ]
28
- }
 
7
  ],
8
  "text_encoder": [
9
  "transformers",
10
+ "T5Gemma2Encoder"
11
  ],
12
  "tokenizer": [
13
  "transformers",
 
25
  "transformers",
26
  "SiglipImageProcessor"
27
  ]
28
+ }
pipeline_motif_video.py CHANGED
@@ -32,17 +32,28 @@ from diffusers import (
32
  UniPCMultistepScheduler,
33
  )
34
  from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
35
- from diffusers.utils import BaseOutput, is_torch_xla_available, logging, replace_example_docstring
 
 
 
 
 
 
 
36
  from diffusers.utils.torch_utils import randn_tensor
37
  from diffusers.video_processor import VideoProcessor
38
  from einops import rearrange
39
  from PIL import Image
40
  from torch import Tensor
41
 
42
- from diffusers.guiders.adaptive_projected_guidance import MomentumBuffer
43
- from diffusers.guiders.guider_utils import GuiderOutput
 
 
 
 
 
44
  from ._fm_solvers_unipc import FlowUniPCMultistepScheduler
45
- from transformers import BatchEncoding, PreTrainedTokenizerBase, SiglipImageProcessor, T5Gemma2Model
46
 
47
 
48
  if is_torch_xla_available():
@@ -143,7 +154,10 @@ def video_normalized_guidance(
143
  v1 = torch.nn.functional.normalize(v1, dim=dim)
144
  v0_parallel = (v0 * v1).sum(dim=dim, keepdim=True) * v1
145
  v0_orthogonal = v0 - v0_parallel
146
- diff_parallel, diff_orthogonal = v0_parallel.type_as(diff), v0_orthogonal.type_as(diff)
 
 
 
147
  normalized_update = diff_orthogonal + eta * diff_parallel
148
 
149
  pred = pred_cond if use_original_formulation else pred_uncond
@@ -358,7 +372,7 @@ class MotifVideoPipeline(DiffusionPipeline):
358
  A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
359
  vae ([`AutoencoderKLWan`]):
360
  Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
361
- text_encoder ([`T5Gemma2Model`]):
362
  Primary text encoder for encoding text prompts into embeddings.
363
  tokenizer ([`PreTrainedTokenizerBase`]):
364
  Tokenizer corresponding to the primary text encoder.
@@ -379,11 +393,16 @@ class MotifVideoPipeline(DiffusionPipeline):
379
  FlowUniPCMultistepScheduler,
380
  ],
381
  vae: AutoencoderKLWan,
382
- text_encoder: T5Gemma2Model,
383
  tokenizer: PreTrainedTokenizerBase,
384
  transformer,
385
  guider: Optional[
386
- Union[ClassifierFreeGuidance, SkipLayerGuidance, AdaptiveProjectedGuidance, VideoAdaptiveProjectedGuidance]
 
 
 
 
 
387
  ] = None,
388
  feature_extractor: Optional[SiglipImageProcessor] = None,
389
  ):
@@ -451,7 +470,7 @@ class MotifVideoPipeline(DiffusionPipeline):
451
 
452
  def _get_prompt_embeds(
453
  self,
454
- text_encoder: T5Gemma2Model,
455
  tokenizer: PreTrainedTokenizerBase,
456
  prompt: Union[str, List[str]] | None = None,
457
  num_videos_per_prompt: int = 1,
@@ -471,17 +490,11 @@ class MotifVideoPipeline(DiffusionPipeline):
471
  "device": device,
472
  "dtype": dtype,
473
  }
474
- # T5Gemma2Model bundles encoder and decoder/LM head, while _get_default_embeds expects an encoder-only model
475
- # (similar to T5EncoderModel/T5GemmaEncoderModel), so we pass the encoder submodule explicitly here.
476
- if isinstance(text_encoder, T5Gemma2Model):
477
- encoder = text_encoder.encoder
478
- # When enable_model_cpu_offload() is active, the accelerate forward hook is on text_encoder (parent),
479
- # not on .encoder (child). Moving the encoder to the execution device explicitly ensures inputs and
480
- # weights are on the same device. The parent's offload hook will move text_encoder back to CPU after
481
- # the next component claims the GPU.
482
- if next(encoder.parameters()).device != torch.device(device):
483
- encoder.to(device)
484
- prompt_embeds_kwargs["text_encoder"] = encoder
485
  prompt_embeds, prompt_attention_mask = self._get_default_embeds(**prompt_embeds_kwargs)
486
 
487
  pooled_prompt_embeds = self._average_pool(prompt_embeds, prompt_attention_mask)
@@ -552,7 +565,7 @@ class MotifVideoPipeline(DiffusionPipeline):
552
  T5Gemma2 has vision_tower.vision_model structure.
553
  Will raise AttributeError if not available.
554
  """
555
- return self.text_encoder.encoder.vision_tower.vision_model
556
 
557
  def encode_image(
558
  self,
@@ -662,10 +675,22 @@ class MotifVideoPipeline(DiffusionPipeline):
662
 
663
  # Initialize conditioning tensors
664
  latent_condition = torch.zeros(
665
- batch_size, lantent_channels, latent_num_frames, latent_height, latent_width, device=device, dtype=dtype
 
 
 
 
 
 
666
  )
667
  latent_mask = torch.zeros(
668
- batch_size, 1, latent_num_frames, latent_height, latent_width, device=device, dtype=dtype
 
 
 
 
 
 
669
  )
670
  image_embeds = None
671
 
@@ -910,7 +935,9 @@ class MotifVideoPipeline(DiffusionPipeline):
910
  self,
911
  prompt: Union[str, List[str]] | None = None,
912
  image=None,
913
- negative_prompt: Optional[Union[str, List[str]]] = "text overlay, graphic overlay, watermark, logo, subtitles, timestamp, broadcast graphics, UI elements, random letters, frozen pose, rigid, static expression, jerky motion, mechanical motion, discontinuous motion, flat framing, depthless, dull lighting, monotone, crushed shadows, blown-out highlights, shifting background, fading background, poor continuity, identity drift, deformation, flickering, ghosting, smearing, duplication, mutated proportions, inconsistent clothing, flat colors, desaturated, tonally compressed, poor background separation, exposure shift, uneven brightness, color balance shift",
 
 
914
  height: int = 736,
915
  width: int = 1280,
916
  num_frames: int = 121,
@@ -1066,7 +1093,11 @@ class MotifVideoPipeline(DiffusionPipeline):
1066
 
1067
  if self.guider._enabled:
1068
  negative_prompt = self._prepare_negative_prompt(negative_prompt, batch_size)
1069
- negative_prompt_embeds, negative_pooled_prompt_embeds, negative_prompt_attention_mask = self.encode_prompt(
 
 
 
 
1070
  prompt=negative_prompt,
1071
  num_videos_per_prompt=num_videos_per_prompt,
1072
  prompt_embeds=negative_prompt_embeds,
@@ -1123,7 +1154,11 @@ class MotifVideoPipeline(DiffusionPipeline):
1123
  # Compute sigmas: use linear-quadratic schedule if enabled, otherwise default linear
1124
  _is_flow_multistep = isinstance(
1125
  self.scheduler,
1126
- (DPMSolverMultistepScheduler, UniPCMultistepScheduler, FlowUniPCMultistepScheduler),
 
 
 
 
1127
  )
1128
 
1129
  # Compute mu once, shared by both branches (required by FlowUniPCMultistepScheduler)
@@ -1195,9 +1230,15 @@ class MotifVideoPipeline(DiffusionPipeline):
1195
  "encoder_hidden_states": (prompt_embeds, negative_prompt_embeds),
1196
  }
1197
  if use_attention_mask:
1198
- guider_inputs["encoder_attention_mask"] = (prompt_attention_mask, negative_prompt_attention_mask)
 
 
 
1199
  if self.transformer.config.pooled_projection_dim is not None:
1200
- guider_inputs["pooled_projections"] = (pooled_prompt_embeds, negative_pooled_prompt_embeds)
 
 
 
1201
  if image_embeds is not None:
1202
  guider_inputs["image_embeds"] = (image_embeds, image_embeds)
1203
 
 
32
  UniPCMultistepScheduler,
33
  )
34
  from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
35
+ from diffusers.guiders.adaptive_projected_guidance import MomentumBuffer
36
+ from diffusers.guiders.guider_utils import GuiderOutput
37
+ from diffusers.utils import (
38
+ BaseOutput,
39
+ is_torch_xla_available,
40
+ logging,
41
+ replace_example_docstring,
42
+ )
43
  from diffusers.utils.torch_utils import randn_tensor
44
  from diffusers.video_processor import VideoProcessor
45
  from einops import rearrange
46
  from PIL import Image
47
  from torch import Tensor
48
 
49
+ from transformers import (
50
+ BatchEncoding,
51
+ PreTrainedTokenizerBase,
52
+ SiglipImageProcessor,
53
+ T5Gemma2Encoder,
54
+ )
55
+
56
  from ._fm_solvers_unipc import FlowUniPCMultistepScheduler
 
57
 
58
 
59
  if is_torch_xla_available():
 
154
  v1 = torch.nn.functional.normalize(v1, dim=dim)
155
  v0_parallel = (v0 * v1).sum(dim=dim, keepdim=True) * v1
156
  v0_orthogonal = v0 - v0_parallel
157
+ diff_parallel, diff_orthogonal = (
158
+ v0_parallel.type_as(diff),
159
+ v0_orthogonal.type_as(diff),
160
+ )
161
  normalized_update = diff_orthogonal + eta * diff_parallel
162
 
163
  pred = pred_cond if use_original_formulation else pred_uncond
 
372
  A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
373
  vae ([`AutoencoderKLWan`]):
374
  Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
375
+ text_encoder ([`T5Gemma2Encoder`]):
376
  Primary text encoder for encoding text prompts into embeddings.
377
  tokenizer ([`PreTrainedTokenizerBase`]):
378
  Tokenizer corresponding to the primary text encoder.
 
393
  FlowUniPCMultistepScheduler,
394
  ],
395
  vae: AutoencoderKLWan,
396
+ text_encoder: T5Gemma2Encoder,
397
  tokenizer: PreTrainedTokenizerBase,
398
  transformer,
399
  guider: Optional[
400
+ Union[
401
+ ClassifierFreeGuidance,
402
+ SkipLayerGuidance,
403
+ AdaptiveProjectedGuidance,
404
+ VideoAdaptiveProjectedGuidance,
405
+ ]
406
  ] = None,
407
  feature_extractor: Optional[SiglipImageProcessor] = None,
408
  ):
 
470
 
471
  def _get_prompt_embeds(
472
  self,
473
+ text_encoder: T5Gemma2Encoder,
474
  tokenizer: PreTrainedTokenizerBase,
475
  prompt: Union[str, List[str]] | None = None,
476
  num_videos_per_prompt: int = 1,
 
490
  "device": device,
491
  "dtype": dtype,
492
  }
493
+ # When enable_model_cpu_offload() is active, the accelerate offload hook is attached to text_encoder.
494
+ # Moving text_encoder to the execution device explicitly ensures inputs and weights are on the same device.
495
+ # That offload hook will move text_encoder back to CPU after the next component claims the GPU.
496
+ if next(text_encoder.parameters()).device != torch.device(device):
497
+ text_encoder.to(device)
 
 
 
 
 
 
498
  prompt_embeds, prompt_attention_mask = self._get_default_embeds(**prompt_embeds_kwargs)
499
 
500
  pooled_prompt_embeds = self._average_pool(prompt_embeds, prompt_attention_mask)
 
565
  T5Gemma2 has vision_tower.vision_model structure.
566
  Will raise AttributeError if not available.
567
  """
568
+ return self.text_encoder.vision_tower.vision_model
569
 
570
  def encode_image(
571
  self,
 
675
 
676
  # Initialize conditioning tensors
677
  latent_condition = torch.zeros(
678
+ batch_size,
679
+ lantent_channels,
680
+ latent_num_frames,
681
+ latent_height,
682
+ latent_width,
683
+ device=device,
684
+ dtype=dtype,
685
  )
686
  latent_mask = torch.zeros(
687
+ batch_size,
688
+ 1,
689
+ latent_num_frames,
690
+ latent_height,
691
+ latent_width,
692
+ device=device,
693
+ dtype=dtype,
694
  )
695
  image_embeds = None
696
 
 
935
  self,
936
  prompt: Union[str, List[str]] | None = None,
937
  image=None,
938
+ negative_prompt: Optional[
939
+ Union[str, List[str]]
940
+ ] = "text overlay, graphic overlay, watermark, logo, subtitles, timestamp, broadcast graphics, UI elements, random letters, frozen pose, rigid, static expression, jerky motion, mechanical motion, discontinuous motion, flat framing, depthless, dull lighting, monotone, crushed shadows, blown-out highlights, shifting background, fading background, poor continuity, identity drift, deformation, flickering, ghosting, smearing, duplication, mutated proportions, inconsistent clothing, flat colors, desaturated, tonally compressed, poor background separation, exposure shift, uneven brightness, color balance shift",
941
  height: int = 736,
942
  width: int = 1280,
943
  num_frames: int = 121,
 
1093
 
1094
  if self.guider._enabled:
1095
  negative_prompt = self._prepare_negative_prompt(negative_prompt, batch_size)
1096
+ (
1097
+ negative_prompt_embeds,
1098
+ negative_pooled_prompt_embeds,
1099
+ negative_prompt_attention_mask,
1100
+ ) = self.encode_prompt(
1101
  prompt=negative_prompt,
1102
  num_videos_per_prompt=num_videos_per_prompt,
1103
  prompt_embeds=negative_prompt_embeds,
 
1154
  # Compute sigmas: use linear-quadratic schedule if enabled, otherwise default linear
1155
  _is_flow_multistep = isinstance(
1156
  self.scheduler,
1157
+ (
1158
+ DPMSolverMultistepScheduler,
1159
+ UniPCMultistepScheduler,
1160
+ FlowUniPCMultistepScheduler,
1161
+ ),
1162
  )
1163
 
1164
  # Compute mu once, shared by both branches (required by FlowUniPCMultistepScheduler)
 
1230
  "encoder_hidden_states": (prompt_embeds, negative_prompt_embeds),
1231
  }
1232
  if use_attention_mask:
1233
+ guider_inputs["encoder_attention_mask"] = (
1234
+ prompt_attention_mask,
1235
+ negative_prompt_attention_mask,
1236
+ )
1237
  if self.transformer.config.pooled_projection_dim is not None:
1238
+ guider_inputs["pooled_projections"] = (
1239
+ pooled_prompt_embeds,
1240
+ negative_pooled_prompt_embeds,
1241
+ )
1242
  if image_embeds is not None:
1243
  guider_inputs["image_embeds"] = (image_embeds, image_embeds)
1244
 
text_encoder/config.json CHANGED
@@ -1,23 +1,36 @@
1
  {
2
  "architectures": [
3
- "T5Gemma2Model"
4
  ],
5
  "attention_dropout": 0.0,
6
- "bos_token_id": 2,
7
- "classifier_dropout_rate": 0.0,
8
- "decoder": {
 
 
 
 
 
 
9
  "_sliding_window_pattern": 6,
 
10
  "attention_bias": false,
11
  "attention_dropout": 0.0,
12
  "attn_logit_softcapping": null,
 
 
 
13
  "dropout_rate": 0.0,
14
  "dtype": "bfloat16",
 
15
  "final_logit_softcapping": null,
 
16
  "head_dim": 256,
17
  "hidden_activation": "gelu_pytorch_tanh",
18
  "hidden_size": 2560,
19
  "initializer_range": 0.02,
20
  "intermediate_size": 10240,
 
21
  "layer_types": [
22
  "sliding_attention",
23
  "sliding_attention",
@@ -55,10 +68,12 @@
55
  "sliding_attention"
56
  ],
57
  "max_position_embeddings": 131072,
58
- "model_type": "t5gemma2_decoder",
59
  "num_attention_heads": 8,
60
  "num_hidden_layers": 34,
61
  "num_key_value_heads": 4,
 
 
62
  "query_pre_attn_scalar": 256,
63
  "rms_norm_eps": 1e-06,
64
  "rope_parameters": {
@@ -72,181 +87,48 @@
72
  "rope_type": "default"
73
  }
74
  },
 
75
  "sliding_window": 1024,
 
 
 
 
76
  "use_bidirectional_attention": false,
77
  "use_cache": true,
78
  "vocab_size": 262144
79
  },
80
- "dropout_rate": 0.0,
81
- "dtype": "bfloat16",
82
- "encoder": {
 
83
  "attention_dropout": 0.0,
84
- "boi_token_index": 255999,
 
 
85
  "dropout_rate": 0.0,
86
  "dtype": "bfloat16",
87
- "eoi_token_index": 256000,
88
- "image_token_index": 256001,
89
- "initializer_range": 0.02,
90
- "mm_tokens_per_image": 256,
91
- "model_type": "t5gemma2_encoder",
92
- "text_config": {
93
- "_name_or_path": "",
94
- "_sliding_window_pattern": 6,
95
- "add_cross_attention": false,
96
- "architectures": null,
97
- "attention_bias": false,
98
- "attention_dropout": 0.0,
99
- "attn_logit_softcapping": null,
100
- "bos_token_id": 2,
101
- "chunk_size_feed_forward": 0,
102
- "cross_attention_hidden_size": null,
103
- "decoder_start_token_id": null,
104
- "dropout_rate": 0.0,
105
- "dtype": "bfloat16",
106
- "eos_token_id": 1,
107
- "final_logit_softcapping": null,
108
- "finetuning_task": null,
109
- "head_dim": 256,
110
- "hidden_activation": "gelu_pytorch_tanh",
111
- "hidden_size": 2560,
112
- "id2label": {
113
- "0": "LABEL_0",
114
- "1": "LABEL_1"
115
- },
116
- "initializer_range": 0.02,
117
- "intermediate_size": 10240,
118
- "is_decoder": false,
119
- "is_encoder_decoder": false,
120
- "label2id": {
121
- "LABEL_0": 0,
122
- "LABEL_1": 1
123
- },
124
- "layer_types": [
125
- "sliding_attention",
126
- "sliding_attention",
127
- "sliding_attention",
128
- "sliding_attention",
129
- "sliding_attention",
130
- "full_attention",
131
- "sliding_attention",
132
- "sliding_attention",
133
- "sliding_attention",
134
- "sliding_attention",
135
- "sliding_attention",
136
- "full_attention",
137
- "sliding_attention",
138
- "sliding_attention",
139
- "sliding_attention",
140
- "sliding_attention",
141
- "sliding_attention",
142
- "full_attention",
143
- "sliding_attention",
144
- "sliding_attention",
145
- "sliding_attention",
146
- "sliding_attention",
147
- "sliding_attention",
148
- "full_attention",
149
- "sliding_attention",
150
- "sliding_attention",
151
- "sliding_attention",
152
- "sliding_attention",
153
- "sliding_attention",
154
- "full_attention",
155
- "sliding_attention",
156
- "sliding_attention",
157
- "sliding_attention",
158
- "sliding_attention"
159
- ],
160
- "max_position_embeddings": 131072,
161
- "model_type": "t5gemma2_text",
162
- "num_attention_heads": 8,
163
- "num_hidden_layers": 34,
164
- "num_key_value_heads": 4,
165
- "output_attentions": false,
166
- "output_hidden_states": false,
167
- "pad_token_id": 0,
168
- "prefix": null,
169
- "problem_type": null,
170
- "query_pre_attn_scalar": 256,
171
- "return_dict": true,
172
- "rms_norm_eps": 1e-06,
173
- "rope_parameters": {
174
- "full_attention": {
175
- "factor": 8.0,
176
- "rope_theta": 1000000,
177
- "rope_type": "linear"
178
- },
179
- "sliding_attention": {
180
- "rope_theta": 10000,
181
- "rope_type": "default"
182
- }
183
- },
184
- "sep_token_id": null,
185
- "sliding_window": 1024,
186
- "task_specific_params": null,
187
- "tie_encoder_decoder": false,
188
- "tie_word_embeddings": true,
189
- "tokenizer_class": null,
190
- "use_bidirectional_attention": false,
191
- "use_cache": true,
192
- "vocab_size": 262144
193
- },
194
- "vision_config": {
195
- "_name_or_path": "",
196
- "add_cross_attention": false,
197
- "architectures": null,
198
- "attention_dropout": 0.0,
199
- "bos_token_id": null,
200
- "chunk_size_feed_forward": 0,
201
- "cross_attention_hidden_size": null,
202
- "decoder_start_token_id": null,
203
- "dropout_rate": 0.0,
204
- "dtype": "bfloat16",
205
- "eos_token_id": null,
206
- "finetuning_task": null,
207
- "hidden_act": "gelu_pytorch_tanh",
208
- "hidden_size": 1152,
209
- "id2label": {
210
- "0": "LABEL_0",
211
- "1": "LABEL_1"
212
- },
213
- "image_size": 896,
214
- "intermediate_size": 4304,
215
- "is_decoder": false,
216
- "is_encoder_decoder": false,
217
- "label2id": {
218
- "LABEL_0": 0,
219
- "LABEL_1": 1
220
- },
221
- "layer_norm_eps": 1e-06,
222
- "model_type": "siglip_vision_model",
223
- "num_attention_heads": 16,
224
- "num_channels": 3,
225
- "num_hidden_layers": 27,
226
- "output_attentions": false,
227
- "output_hidden_states": false,
228
- "pad_token_id": null,
229
- "patch_size": 14,
230
- "prefix": null,
231
- "problem_type": null,
232
- "return_dict": true,
233
- "sep_token_id": null,
234
- "task_specific_params": null,
235
- "tie_encoder_decoder": false,
236
- "tie_word_embeddings": true,
237
- "tokenizer_class": null,
238
- "vision_use_head": false,
239
- "vocab_size": 262144
240
- },
241
  "vocab_size": 262144
242
  },
243
- "eoi_token_index": 256000,
244
- "eos_token_id": 1,
245
- "image_token_index": 256001,
246
- "initializer_range": 0.02,
247
- "is_encoder_decoder": true,
248
- "model_type": "t5gemma2",
249
- "pad_token_id": 0,
250
- "transformers_version": "5.0.0rc1",
251
  "vocab_size": 262144
252
- }
 
1
  {
2
  "architectures": [
3
+ "T5Gemma2Encoder"
4
  ],
5
  "attention_dropout": 0.0,
6
+ "boi_token_index": 255999,
7
+ "dropout_rate": 0.0,
8
+ "dtype": "bfloat16",
9
+ "eoi_token_index": 256000,
10
+ "image_token_index": 256001,
11
+ "initializer_range": 0.02,
12
+ "mm_tokens_per_image": 256,
13
+ "model_type": "t5gemma2_encoder",
14
+ "text_config": {
15
  "_sliding_window_pattern": 6,
16
+ "add_cross_attention": false,
17
  "attention_bias": false,
18
  "attention_dropout": 0.0,
19
  "attn_logit_softcapping": null,
20
+ "bos_token_id": 2,
21
+ "cross_attention_hidden_size": null,
22
+ "decoder_start_token_id": null,
23
  "dropout_rate": 0.0,
24
  "dtype": "bfloat16",
25
+ "eos_token_id": 1,
26
  "final_logit_softcapping": null,
27
+ "finetuning_task": null,
28
  "head_dim": 256,
29
  "hidden_activation": "gelu_pytorch_tanh",
30
  "hidden_size": 2560,
31
  "initializer_range": 0.02,
32
  "intermediate_size": 10240,
33
+ "is_decoder": false,
34
  "layer_types": [
35
  "sliding_attention",
36
  "sliding_attention",
 
68
  "sliding_attention"
69
  ],
70
  "max_position_embeddings": 131072,
71
+ "model_type": "t5gemma2_text",
72
  "num_attention_heads": 8,
73
  "num_hidden_layers": 34,
74
  "num_key_value_heads": 4,
75
+ "pad_token_id": 0,
76
+ "prefix": null,
77
  "query_pre_attn_scalar": 256,
78
  "rms_norm_eps": 1e-06,
79
  "rope_parameters": {
 
87
  "rope_type": "default"
88
  }
89
  },
90
+ "sep_token_id": null,
91
  "sliding_window": 1024,
92
+ "task_specific_params": null,
93
+ "tie_encoder_decoder": false,
94
+ "tie_word_embeddings": true,
95
+ "tokenizer_class": null,
96
  "use_bidirectional_attention": false,
97
  "use_cache": true,
98
  "vocab_size": 262144
99
  },
100
+ "tie_word_embeddings": true,
101
+ "transformers_version": "5.5.4",
102
+ "vision_config": {
103
+ "add_cross_attention": false,
104
  "attention_dropout": 0.0,
105
+ "bos_token_id": null,
106
+ "cross_attention_hidden_size": null,
107
+ "decoder_start_token_id": null,
108
  "dropout_rate": 0.0,
109
  "dtype": "bfloat16",
110
+ "eos_token_id": null,
111
+ "finetuning_task": null,
112
+ "hidden_act": "gelu_pytorch_tanh",
113
+ "hidden_size": 1152,
114
+ "image_size": 896,
115
+ "intermediate_size": 4304,
116
+ "is_decoder": false,
117
+ "layer_norm_eps": 1e-06,
118
+ "model_type": "siglip_vision_model",
119
+ "num_attention_heads": 16,
120
+ "num_channels": 3,
121
+ "num_hidden_layers": 27,
122
+ "pad_token_id": null,
123
+ "patch_size": 14,
124
+ "prefix": null,
125
+ "sep_token_id": null,
126
+ "task_specific_params": null,
127
+ "tie_encoder_decoder": false,
128
+ "tie_word_embeddings": true,
129
+ "tokenizer_class": null,
130
+ "vision_use_head": false,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  "vocab_size": 262144
132
  },
 
 
 
 
 
 
 
 
133
  "vocab_size": 262144
134
+ }
text_encoder/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c7dd568c34c56a521475124f226983dc191e57aa9b1cac9a22a87dcc753cb57
3
- size 16360212008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2957deadcb660bb6e411a88c4f8860c5972f7f4eb856ac520d2628d1e225359f
3
+ size 8599946488
transformer/config.json CHANGED
@@ -4,7 +4,6 @@
4
  "_library": "diffusers",
5
  "attention_head_dim": 128,
6
  "base_latent_size": null,
7
- "image_condition_type": null,
8
  "image_embed_dim": 1152,
9
  "in_channels": 33,
10
  "mlp_ratio": 4.0,
 
4
  "_library": "diffusers",
5
  "attention_head_dim": 128,
6
  "base_latent_size": null,
 
7
  "image_embed_dim": 1152,
8
  "in_channels": 33,
9
  "mlp_ratio": 4.0,