Switch to T5Gemma2Encoder for text encoding

by kencwt - opened Apr 20

base: refs/heads/main

←

from: refs/pr/5

Discussion Files changed

+128

-208

Files changed (6) hide show

README.md +1 -3
model_index.json +2 -2
pipeline_motif_video.py +69 -28
text_encoder/config.json +54 -172
text_encoder/model.safetensors +2 -2
transformer/config.json +0 -1

README.md CHANGED Viewed

@@ -111,7 +111,7 @@ For the full derivation of why Shared Cross-Attention shares K/V but not Q, and
 - CUDA-capable GPU with **30GB+ VRAM** (e.g., A100, H100) — for 24GB GPUs see [Memory-efficient Inference](#-memory-efficient-inference)
 ```bash
-pip install "diffusers>=0.35.2" "transformers>=5.0.0" torch accelerate ftfy einops sentencepiece regex Pillow
 ```
 ### Text-to-Video (T2V)
@@ -131,7 +131,6 @@ guider = AdaptiveProjectedGuidance(
 pipe = DiffusionPipeline.from_pretrained(
     "Motif-Technologies/Motif-Video-2B",
     custom_pipeline="pipeline_motif_video",
-    trust_remote_code=True,
     torch_dtype=torch.bfloat16,
     guider=guider,
 )
@@ -165,7 +164,6 @@ guider = AdaptiveProjectedGuidance(
 pipe = DiffusionPipeline.from_pretrained(
     "Motif-Technologies/Motif-Video-2B",
     custom_pipeline="pipeline_motif_video",
-    trust_remote_code=True,
     torch_dtype=torch.bfloat16,
     guider=guider,
 )

 - CUDA-capable GPU with **30GB+ VRAM** (e.g., A100, H100) — for 24GB GPUs see [Memory-efficient Inference](#-memory-efficient-inference)
 ```bash
+pip install "diffusers>=0.35.2" "transformers>=5.5.4" torch accelerate ftfy einops sentencepiece regex Pillow imageio imageio-ffmpeg
 ```
 ### Text-to-Video (T2V)
 pipe = DiffusionPipeline.from_pretrained(
     "Motif-Technologies/Motif-Video-2B",
     custom_pipeline="pipeline_motif_video",
     torch_dtype=torch.bfloat16,
     guider=guider,
 )
 pipe = DiffusionPipeline.from_pretrained(
     "Motif-Technologies/Motif-Video-2B",
     custom_pipeline="pipeline_motif_video",
     torch_dtype=torch.bfloat16,
     guider=guider,
 )

model_index.json CHANGED Viewed

@@ -7,7 +7,7 @@
   ],
   "text_encoder": [
     "transformers",
-    "T5Gemma2Model"
   ],
   "tokenizer": [
     "transformers",
@@ -25,4 +25,4 @@
     "transformers",
     "SiglipImageProcessor"
   ]
-}

   ],
   "text_encoder": [
     "transformers",
+    "T5Gemma2Encoder"
   ],
   "tokenizer": [
     "transformers",
     "transformers",
     "SiglipImageProcessor"
   ]
+}

pipeline_motif_video.py CHANGED Viewed

@@ -32,17 +32,28 @@ from diffusers import (
     UniPCMultistepScheduler,
 )
 from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
-from diffusers.utils import BaseOutput, is_torch_xla_available, logging, replace_example_docstring
 from diffusers.utils.torch_utils import randn_tensor
 from diffusers.video_processor import VideoProcessor
 from einops import rearrange
 from PIL import Image
 from torch import Tensor
-from diffusers.guiders.adaptive_projected_guidance import MomentumBuffer
-from diffusers.guiders.guider_utils import GuiderOutput
 from ._fm_solvers_unipc import FlowUniPCMultistepScheduler
-from transformers import BatchEncoding, PreTrainedTokenizerBase, SiglipImageProcessor, T5Gemma2Model
 if is_torch_xla_available():
@@ -143,7 +154,10 @@ def video_normalized_guidance(
     v1 = torch.nn.functional.normalize(v1, dim=dim)
     v0_parallel = (v0 * v1).sum(dim=dim, keepdim=True) * v1
     v0_orthogonal = v0 - v0_parallel
-    diff_parallel, diff_orthogonal = v0_parallel.type_as(diff), v0_orthogonal.type_as(diff)
     normalized_update = diff_orthogonal + eta * diff_parallel
     pred = pred_cond if use_original_formulation else pred_uncond
@@ -358,7 +372,7 @@ class MotifVideoPipeline(DiffusionPipeline):
             A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
         vae ([`AutoencoderKLWan`]):
             Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
-        text_encoder ([`T5Gemma2Model`]):
             Primary text encoder for encoding text prompts into embeddings.
         tokenizer ([`PreTrainedTokenizerBase`]):
             Tokenizer corresponding to the primary text encoder.
@@ -379,11 +393,16 @@ class MotifVideoPipeline(DiffusionPipeline):
             FlowUniPCMultistepScheduler,
         ],
         vae: AutoencoderKLWan,
-        text_encoder: T5Gemma2Model,
         tokenizer: PreTrainedTokenizerBase,
         transformer,
         guider: Optional[
-            Union[ClassifierFreeGuidance, SkipLayerGuidance, AdaptiveProjectedGuidance, VideoAdaptiveProjectedGuidance]
         ] = None,
         feature_extractor: Optional[SiglipImageProcessor] = None,
     ):
@@ -451,7 +470,7 @@ class MotifVideoPipeline(DiffusionPipeline):
     def _get_prompt_embeds(
         self,
-        text_encoder: T5Gemma2Model,
         tokenizer: PreTrainedTokenizerBase,
         prompt: Union[str, List[str]] | None = None,
         num_videos_per_prompt: int = 1,
@@ -471,17 +490,11 @@ class MotifVideoPipeline(DiffusionPipeline):
             "device": device,
             "dtype": dtype,
         }
-        # T5Gemma2Model bundles encoder and decoder/LM head, while _get_default_embeds expects an encoder-only model
-        # (similar to T5EncoderModel/T5GemmaEncoderModel), so we pass the encoder submodule explicitly here.
-        if isinstance(text_encoder, T5Gemma2Model):
-            encoder = text_encoder.encoder
-            # When enable_model_cpu_offload() is active, the accelerate forward hook is on text_encoder (parent),
-            # not on .encoder (child). Moving the encoder to the execution device explicitly ensures inputs and
-            # weights are on the same device. The parent's offload hook will move text_encoder back to CPU after
-            # the next component claims the GPU.
-            if next(encoder.parameters()).device != torch.device(device):
-                encoder.to(device)
-            prompt_embeds_kwargs["text_encoder"] = encoder
         prompt_embeds, prompt_attention_mask = self._get_default_embeds(**prompt_embeds_kwargs)
         pooled_prompt_embeds = self._average_pool(prompt_embeds, prompt_attention_mask)
@@ -552,7 +565,7 @@ class MotifVideoPipeline(DiffusionPipeline):
         T5Gemma2 has vision_tower.vision_model structure.
         Will raise AttributeError if not available.
         """
-        return self.text_encoder.encoder.vision_tower.vision_model
     def encode_image(
         self,
@@ -662,10 +675,22 @@ class MotifVideoPipeline(DiffusionPipeline):
         # Initialize conditioning tensors
         latent_condition = torch.zeros(
-            batch_size, lantent_channels, latent_num_frames, latent_height, latent_width, device=device, dtype=dtype
         )
         latent_mask = torch.zeros(
-            batch_size, 1, latent_num_frames, latent_height, latent_width, device=device, dtype=dtype
         )
         image_embeds = None
@@ -910,7 +935,9 @@ class MotifVideoPipeline(DiffusionPipeline):
         self,
         prompt: Union[str, List[str]] | None = None,
         image=None,
-        negative_prompt: Optional[Union[str, List[str]]] = "text overlay, graphic overlay, watermark, logo, subtitles, timestamp, broadcast graphics, UI elements, random letters, frozen pose, rigid, static expression, jerky motion, mechanical motion, discontinuous motion, flat framing, depthless, dull lighting, monotone, crushed shadows, blown-out highlights, shifting background, fading background, poor continuity, identity drift, deformation, flickering, ghosting, smearing, duplication, mutated proportions, inconsistent clothing, flat colors, desaturated, tonally compressed, poor background separation, exposure shift, uneven brightness, color balance shift",
         height: int = 736,
         width: int = 1280,
         num_frames: int = 121,
@@ -1066,7 +1093,11 @@ class MotifVideoPipeline(DiffusionPipeline):
         if self.guider._enabled:
             negative_prompt = self._prepare_negative_prompt(negative_prompt, batch_size)
-            negative_prompt_embeds, negative_pooled_prompt_embeds, negative_prompt_attention_mask = self.encode_prompt(
                 prompt=negative_prompt,
                 num_videos_per_prompt=num_videos_per_prompt,
                 prompt_embeds=negative_prompt_embeds,
@@ -1123,7 +1154,11 @@ class MotifVideoPipeline(DiffusionPipeline):
         # Compute sigmas: use linear-quadratic schedule if enabled, otherwise default linear
         _is_flow_multistep = isinstance(
             self.scheduler,
-            (DPMSolverMultistepScheduler, UniPCMultistepScheduler, FlowUniPCMultistepScheduler),
         )
         # Compute mu once, shared by both branches (required by FlowUniPCMultistepScheduler)
@@ -1195,9 +1230,15 @@ class MotifVideoPipeline(DiffusionPipeline):
                     "encoder_hidden_states": (prompt_embeds, negative_prompt_embeds),
                 }
                 if use_attention_mask:
-                    guider_inputs["encoder_attention_mask"] = (prompt_attention_mask, negative_prompt_attention_mask)
                 if self.transformer.config.pooled_projection_dim is not None:
-                    guider_inputs["pooled_projections"] = (pooled_prompt_embeds, negative_pooled_prompt_embeds)
                 if image_embeds is not None:
                     guider_inputs["image_embeds"] = (image_embeds, image_embeds)

     UniPCMultistepScheduler,
 )
 from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
+from diffusers.guiders.adaptive_projected_guidance import MomentumBuffer
+from diffusers.guiders.guider_utils import GuiderOutput
+from diffusers.utils import (
+    BaseOutput,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+)
 from diffusers.utils.torch_utils import randn_tensor
 from diffusers.video_processor import VideoProcessor
 from einops import rearrange
 from PIL import Image
 from torch import Tensor
+from transformers import (
+    BatchEncoding,
+    PreTrainedTokenizerBase,
+    SiglipImageProcessor,
+    T5Gemma2Encoder,
+)
 from ._fm_solvers_unipc import FlowUniPCMultistepScheduler
 if is_torch_xla_available():
     v1 = torch.nn.functional.normalize(v1, dim=dim)
     v0_parallel = (v0 * v1).sum(dim=dim, keepdim=True) * v1
     v0_orthogonal = v0 - v0_parallel
+    diff_parallel, diff_orthogonal = (
+        v0_parallel.type_as(diff),
+        v0_orthogonal.type_as(diff),
+    )
     normalized_update = diff_orthogonal + eta * diff_parallel
     pred = pred_cond if use_original_formulation else pred_uncond
             A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
         vae ([`AutoencoderKLWan`]):
             Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
+        text_encoder ([`T5Gemma2Encoder`]):
             Primary text encoder for encoding text prompts into embeddings.
         tokenizer ([`PreTrainedTokenizerBase`]):
             Tokenizer corresponding to the primary text encoder.
             FlowUniPCMultistepScheduler,
         ],
         vae: AutoencoderKLWan,
+        text_encoder: T5Gemma2Encoder,
         tokenizer: PreTrainedTokenizerBase,
         transformer,
         guider: Optional[
+            Union[
+                ClassifierFreeGuidance,
+                SkipLayerGuidance,
+                AdaptiveProjectedGuidance,
+                VideoAdaptiveProjectedGuidance,
+            ]
         ] = None,
         feature_extractor: Optional[SiglipImageProcessor] = None,
     ):
     def _get_prompt_embeds(
         self,
+        text_encoder: T5Gemma2Encoder,
         tokenizer: PreTrainedTokenizerBase,
         prompt: Union[str, List[str]] | None = None,
         num_videos_per_prompt: int = 1,
             "device": device,
             "dtype": dtype,
         }
+        # When enable_model_cpu_offload() is active, the accelerate forward hook is attached to text_encoder.
+        # Moving text_encoder to the execution device explicitly ensures inputs and weights are on the same
+        # device. The offload hook will move text_encoder back to CPU after the next component claims the GPU.
+        if next(text_encoder.parameters()).device != torch.device(device):
+            text_encoder.to(device)
         prompt_embeds, prompt_attention_mask = self._get_default_embeds(**prompt_embeds_kwargs)
         pooled_prompt_embeds = self._average_pool(prompt_embeds, prompt_attention_mask)
         T5Gemma2 has vision_tower.vision_model structure.
         Will raise AttributeError if not available.
         """
+        return self.text_encoder.vision_tower.vision_model
     def encode_image(
         self,
         # Initialize conditioning tensors
         latent_condition = torch.zeros(
+            batch_size,
+            lantent_channels,
+            latent_num_frames,
+            latent_height,
+            latent_width,
+            device=device,
+            dtype=dtype,
         )
         latent_mask = torch.zeros(
+            batch_size,
+            1,
+            latent_num_frames,
+            latent_height,
+            latent_width,
+            device=device,
+            dtype=dtype,
         )
         image_embeds = None
         self,
         prompt: Union[str, List[str]] | None = None,
         image=None,
+        negative_prompt: Optional[
+            Union[str, List[str]]
+        ] = "text overlay, graphic overlay, watermark, logo, subtitles, timestamp, broadcast graphics, UI elements, random letters, frozen pose, rigid, static expression, jerky motion, mechanical motion, discontinuous motion, flat framing, depthless, dull lighting, monotone, crushed shadows, blown-out highlights, shifting background, fading background, poor continuity, identity drift, deformation, flickering, ghosting, smearing, duplication, mutated proportions, inconsistent clothing, flat colors, desaturated, tonally compressed, poor background separation, exposure shift, uneven brightness, color balance shift",
         height: int = 736,
         width: int = 1280,
         num_frames: int = 121,
         if self.guider._enabled:
             negative_prompt = self._prepare_negative_prompt(negative_prompt, batch_size)
+            (
+                negative_prompt_embeds,
+                negative_pooled_prompt_embeds,
+                negative_prompt_attention_mask,
+            ) = self.encode_prompt(
                 prompt=negative_prompt,
                 num_videos_per_prompt=num_videos_per_prompt,
                 prompt_embeds=negative_prompt_embeds,
         # Compute sigmas: use linear-quadratic schedule if enabled, otherwise default linear
         _is_flow_multistep = isinstance(
             self.scheduler,
+            (
+                DPMSolverMultistepScheduler,
+                UniPCMultistepScheduler,
+                FlowUniPCMultistepScheduler,
+            ),
         )
         # Compute mu once, shared by both branches (required by FlowUniPCMultistepScheduler)
                     "encoder_hidden_states": (prompt_embeds, negative_prompt_embeds),
                 }
                 if use_attention_mask:
+                    guider_inputs["encoder_attention_mask"] = (
+                        prompt_attention_mask,
+                        negative_prompt_attention_mask,
+                    )
                 if self.transformer.config.pooled_projection_dim is not None:
+                    guider_inputs["pooled_projections"] = (
+                        pooled_prompt_embeds,
+                        negative_pooled_prompt_embeds,
+                    )
                 if image_embeds is not None:
                     guider_inputs["image_embeds"] = (image_embeds, image_embeds)

text_encoder/config.json CHANGED Viewed

@@ -1,23 +1,36 @@
 {
   "architectures": [
-    "T5Gemma2Model"
   ],
   "attention_dropout": 0.0,
-  "bos_token_id": 2,
-  "classifier_dropout_rate": 0.0,
-  "decoder": {
     "_sliding_window_pattern": 6,
     "attention_bias": false,
     "attention_dropout": 0.0,
     "attn_logit_softcapping": null,
     "dropout_rate": 0.0,
     "dtype": "bfloat16",
     "final_logit_softcapping": null,
     "head_dim": 256,
     "hidden_activation": "gelu_pytorch_tanh",
     "hidden_size": 2560,
     "initializer_range": 0.02,
     "intermediate_size": 10240,
     "layer_types": [
       "sliding_attention",
       "sliding_attention",
@@ -55,10 +68,12 @@
       "sliding_attention"
     ],
     "max_position_embeddings": 131072,
-    "model_type": "t5gemma2_decoder",
     "num_attention_heads": 8,
     "num_hidden_layers": 34,
     "num_key_value_heads": 4,
     "query_pre_attn_scalar": 256,
     "rms_norm_eps": 1e-06,
     "rope_parameters": {
@@ -72,181 +87,48 @@
         "rope_type": "default"
       }
     },
     "sliding_window": 1024,
     "use_bidirectional_attention": false,
     "use_cache": true,
     "vocab_size": 262144
   },
-  "dropout_rate": 0.0,
-  "dtype": "bfloat16",
-  "encoder": {
     "attention_dropout": 0.0,
-    "boi_token_index": 255999,
     "dropout_rate": 0.0,
     "dtype": "bfloat16",
-    "eoi_token_index": 256000,
-    "image_token_index": 256001,
-    "initializer_range": 0.02,
-    "mm_tokens_per_image": 256,
-    "model_type": "t5gemma2_encoder",
-    "text_config": {
-      "_name_or_path": "",
-      "_sliding_window_pattern": 6,
-      "add_cross_attention": false,
-      "architectures": null,
-      "attention_bias": false,
-      "attention_dropout": 0.0,
-      "attn_logit_softcapping": null,
-      "bos_token_id": 2,
-      "chunk_size_feed_forward": 0,
-      "cross_attention_hidden_size": null,
-      "decoder_start_token_id": null,
-      "dropout_rate": 0.0,
-      "dtype": "bfloat16",
-      "eos_token_id": 1,
-      "final_logit_softcapping": null,
-      "finetuning_task": null,
-      "head_dim": 256,
-      "hidden_activation": "gelu_pytorch_tanh",
-      "hidden_size": 2560,
-      "id2label": {
-        "0": "LABEL_0",
-        "1": "LABEL_1"
-      },
-      "initializer_range": 0.02,
-      "intermediate_size": 10240,
-      "is_decoder": false,
-      "is_encoder_decoder": false,
-      "label2id": {
-        "LABEL_0": 0,
-        "LABEL_1": 1
-      },
-      "layer_types": [
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "full_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "full_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "full_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "full_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "full_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention"
-      ],
-      "max_position_embeddings": 131072,
-      "model_type": "t5gemma2_text",
-      "num_attention_heads": 8,
-      "num_hidden_layers": 34,
-      "num_key_value_heads": 4,
-      "output_attentions": false,
-      "output_hidden_states": false,
-      "pad_token_id": 0,
-      "prefix": null,
-      "problem_type": null,
-      "query_pre_attn_scalar": 256,
-      "return_dict": true,
-      "rms_norm_eps": 1e-06,
-      "rope_parameters": {
-        "full_attention": {
-          "factor": 8.0,
-          "rope_theta": 1000000,
-          "rope_type": "linear"
-        },
-        "sliding_attention": {
-          "rope_theta": 10000,
-          "rope_type": "default"
-        }
-      },
-      "sep_token_id": null,
-      "sliding_window": 1024,
-      "task_specific_params": null,
-      "tie_encoder_decoder": false,
-      "tie_word_embeddings": true,
-      "tokenizer_class": null,
-      "use_bidirectional_attention": false,
-      "use_cache": true,
-      "vocab_size": 262144
-    },
-    "vision_config": {
-      "_name_or_path": "",
-      "add_cross_attention": false,
-      "architectures": null,
-      "attention_dropout": 0.0,
-      "bos_token_id": null,
-      "chunk_size_feed_forward": 0,
-      "cross_attention_hidden_size": null,
-      "decoder_start_token_id": null,
-      "dropout_rate": 0.0,
-      "dtype": "bfloat16",
-      "eos_token_id": null,
-      "finetuning_task": null,
-      "hidden_act": "gelu_pytorch_tanh",
-      "hidden_size": 1152,
-      "id2label": {
-        "0": "LABEL_0",
-        "1": "LABEL_1"
-      },
-      "image_size": 896,
-      "intermediate_size": 4304,
-      "is_decoder": false,
-      "is_encoder_decoder": false,
-      "label2id": {
-        "LABEL_0": 0,
-        "LABEL_1": 1
-      },
-      "layer_norm_eps": 1e-06,
-      "model_type": "siglip_vision_model",
-      "num_attention_heads": 16,
-      "num_channels": 3,
-      "num_hidden_layers": 27,
-      "output_attentions": false,
-      "output_hidden_states": false,
-      "pad_token_id": null,
-      "patch_size": 14,
-      "prefix": null,
-      "problem_type": null,
-      "return_dict": true,
-      "sep_token_id": null,
-      "task_specific_params": null,
-      "tie_encoder_decoder": false,
-      "tie_word_embeddings": true,
-      "tokenizer_class": null,
-      "vision_use_head": false,
-      "vocab_size": 262144
-    },
     "vocab_size": 262144
   },
-  "eoi_token_index": 256000,
-  "eos_token_id": 1,
-  "image_token_index": 256001,
-  "initializer_range": 0.02,
-  "is_encoder_decoder": true,
-  "model_type": "t5gemma2",
-  "pad_token_id": 0,
-  "transformers_version": "5.0.0rc1",
   "vocab_size": 262144
-}

 {
   "architectures": [
+    "T5Gemma2Encoder"
   ],
   "attention_dropout": 0.0,
+  "boi_token_index": 255999,
+  "dropout_rate": 0.0,
+  "dtype": "bfloat16",
+  "eoi_token_index": 256000,
+  "image_token_index": 256001,
+  "initializer_range": 0.02,
+  "mm_tokens_per_image": 256,
+  "model_type": "t5gemma2_encoder",
+  "text_config": {
     "_sliding_window_pattern": 6,
+    "add_cross_attention": false,
     "attention_bias": false,
     "attention_dropout": 0.0,
     "attn_logit_softcapping": null,
+    "bos_token_id": 2,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
     "dropout_rate": 0.0,
     "dtype": "bfloat16",
+    "eos_token_id": 1,
     "final_logit_softcapping": null,
+    "finetuning_task": null,
     "head_dim": 256,
     "hidden_activation": "gelu_pytorch_tanh",
     "hidden_size": 2560,
     "initializer_range": 0.02,
     "intermediate_size": 10240,
+    "is_decoder": false,
     "layer_types": [
       "sliding_attention",
       "sliding_attention",
       "sliding_attention"
     ],
     "max_position_embeddings": 131072,
+    "model_type": "t5gemma2_text",
     "num_attention_heads": 8,
     "num_hidden_layers": 34,
     "num_key_value_heads": 4,
+    "pad_token_id": 0,
+    "prefix": null,
     "query_pre_attn_scalar": 256,
     "rms_norm_eps": 1e-06,
     "rope_parameters": {
         "rope_type": "default"
       }
     },
+    "sep_token_id": null,
     "sliding_window": 1024,
+    "task_specific_params": null,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
     "use_bidirectional_attention": false,
     "use_cache": true,
     "vocab_size": 262144
   },
+  "tie_word_embeddings": true,
+  "transformers_version": "5.5.4",
+  "vision_config": {
+    "add_cross_attention": false,
     "attention_dropout": 0.0,
+    "bos_token_id": null,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
     "dropout_rate": 0.0,
     "dtype": "bfloat16",
+    "eos_token_id": null,
+    "finetuning_task": null,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "image_size": 896,
+    "intermediate_size": 4304,
+    "is_decoder": false,
+    "layer_norm_eps": 1e-06,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 27,
+    "pad_token_id": null,
+    "patch_size": 14,
+    "prefix": null,
+    "sep_token_id": null,
+    "task_specific_params": null,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "vision_use_head": false,
     "vocab_size": 262144
   },
   "vocab_size": 262144
+}

text_encoder/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8c7dd568c34c56a521475124f226983dc191e57aa9b1cac9a22a87dcc753cb57
-size 16360212008

 version https://git-lfs.github.com/spec/v1
+oid sha256:2957deadcb660bb6e411a88c4f8860c5972f7f4eb856ac520d2628d1e225359f
+size 8599946488

transformer/config.json CHANGED Viewed

@@ -4,7 +4,6 @@
   "_library": "diffusers",
   "attention_head_dim": 128,
   "base_latent_size": null,
-  "image_condition_type": null,
   "image_embed_dim": 1152,
   "in_channels": 33,
   "mlp_ratio": 4.0,

   "_library": "diffusers",
   "attention_head_dim": 128,
   "base_latent_size": null,
   "image_embed_dim": 1152,
   "in_channels": 33,
   "mlp_ratio": 4.0,