Instructions to use Motif-Technologies/Motif-Video-2B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Diffusers
How to use Motif-Technologies/Motif-Video-2B with Diffusers:
pip install -U diffusers transformers accelerate
```python
import torch
from diffusers import DiffusionPipeline

# switch to "mps" for apple devices
pipe = DiffusionPipeline.from_pretrained("Motif-Technologies/Motif-Video-2B", dtype=torch.bfloat16, device_map="cuda")

prompt = "A vibrant blue jay perches gracefully on a slender branch, its feathers shimmering in the soft morning light. The bird's keen eyes scan the surroundings, capturing the essence of the tranquil forest. It flutters its wings briefly, showcasing the intricate patterns of blue, white, and black on its plumage. The background reveals a lush canopy of green leaves, with rays of sunlight filtering through, creating a dappled effect on the forest floor. The blue jay then tilts its head, emitting a melodious call that echoes through the serene woodland, adding a touch of magic to the peaceful scene."
image = pipe(prompt).images[0]
```
- Notebooks
- Google Colab
- Kaggle
Switch to T5Gemma2Encoder for text encoding
#5
by kencwt - opened
- README.md +1 -3
- model_index.json +2 -2
- pipeline_motif_video.py +69 -28
- text_encoder/config.json +54 -172
- text_encoder/model.safetensors +2 -2
- transformer/config.json +0 -1
README.md
CHANGED
|
@@ -111,7 +111,7 @@ For the full derivation of why Shared Cross-Attention shares K/V but not Q, and
|
|
| 111 |
- CUDA-capable GPU with **30GB+ VRAM** (e.g., A100, H100) — for 24GB GPUs see [Memory-efficient Inference](#-memory-efficient-inference)
|
| 112 |
|
| 113 |
```bash
|
| 114 |
-
pip install "diffusers>=0.35.2" "transformers>=5.
|
| 115 |
```
|
| 116 |
|
| 117 |
### Text-to-Video (T2V)
|
|
@@ -131,7 +131,6 @@ guider = AdaptiveProjectedGuidance(
|
|
| 131 |
pipe = DiffusionPipeline.from_pretrained(
|
| 132 |
"Motif-Technologies/Motif-Video-2B",
|
| 133 |
custom_pipeline="pipeline_motif_video",
|
| 134 |
-
trust_remote_code=True,
|
| 135 |
torch_dtype=torch.bfloat16,
|
| 136 |
guider=guider,
|
| 137 |
)
|
|
@@ -165,7 +164,6 @@ guider = AdaptiveProjectedGuidance(
|
|
| 165 |
pipe = DiffusionPipeline.from_pretrained(
|
| 166 |
"Motif-Technologies/Motif-Video-2B",
|
| 167 |
custom_pipeline="pipeline_motif_video",
|
| 168 |
-
trust_remote_code=True,
|
| 169 |
torch_dtype=torch.bfloat16,
|
| 170 |
guider=guider,
|
| 171 |
)
|
|
|
|
| 111 |
- CUDA-capable GPU with **30GB+ VRAM** (e.g., A100, H100) — for 24GB GPUs see [Memory-efficient Inference](#-memory-efficient-inference)
|
| 112 |
|
| 113 |
```bash
|
| 114 |
+
pip install "diffusers>=0.35.2" "transformers>=5.5.4" torch accelerate ftfy einops sentencepiece regex Pillow imageio imageio-ffmpeg
|
| 115 |
```
|
| 116 |
|
| 117 |
### Text-to-Video (T2V)
|
|
|
|
| 131 |
pipe = DiffusionPipeline.from_pretrained(
|
| 132 |
"Motif-Technologies/Motif-Video-2B",
|
| 133 |
custom_pipeline="pipeline_motif_video",
|
|
|
|
| 134 |
torch_dtype=torch.bfloat16,
|
| 135 |
guider=guider,
|
| 136 |
)
|
|
|
|
| 164 |
pipe = DiffusionPipeline.from_pretrained(
|
| 165 |
"Motif-Technologies/Motif-Video-2B",
|
| 166 |
custom_pipeline="pipeline_motif_video",
|
|
|
|
| 167 |
torch_dtype=torch.bfloat16,
|
| 168 |
guider=guider,
|
| 169 |
)
|
model_index.json
CHANGED
|
@@ -7,7 +7,7 @@
|
|
| 7 |
],
|
| 8 |
"text_encoder": [
|
| 9 |
"transformers",
|
| 10 |
-
"
|
| 11 |
],
|
| 12 |
"tokenizer": [
|
| 13 |
"transformers",
|
|
@@ -25,4 +25,4 @@
|
|
| 25 |
"transformers",
|
| 26 |
"SiglipImageProcessor"
|
| 27 |
]
|
| 28 |
-
}
|
|
|
|
| 7 |
],
|
| 8 |
"text_encoder": [
|
| 9 |
"transformers",
|
| 10 |
+
"T5Gemma2Encoder"
|
| 11 |
],
|
| 12 |
"tokenizer": [
|
| 13 |
"transformers",
|
|
|
|
| 25 |
"transformers",
|
| 26 |
"SiglipImageProcessor"
|
| 27 |
]
|
| 28 |
+
}
|
pipeline_motif_video.py
CHANGED
|
@@ -32,17 +32,28 @@ from diffusers import (
|
|
| 32 |
UniPCMultistepScheduler,
|
| 33 |
)
|
| 34 |
from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
|
| 35 |
-
from diffusers.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
from diffusers.utils.torch_utils import randn_tensor
|
| 37 |
from diffusers.video_processor import VideoProcessor
|
| 38 |
from einops import rearrange
|
| 39 |
from PIL import Image
|
| 40 |
from torch import Tensor
|
| 41 |
|
| 42 |
-
from
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
from ._fm_solvers_unipc import FlowUniPCMultistepScheduler
|
| 45 |
-
from transformers import BatchEncoding, PreTrainedTokenizerBase, SiglipImageProcessor, T5Gemma2Model
|
| 46 |
|
| 47 |
|
| 48 |
if is_torch_xla_available():
|
|
@@ -143,7 +154,10 @@ def video_normalized_guidance(
|
|
| 143 |
v1 = torch.nn.functional.normalize(v1, dim=dim)
|
| 144 |
v0_parallel = (v0 * v1).sum(dim=dim, keepdim=True) * v1
|
| 145 |
v0_orthogonal = v0 - v0_parallel
|
| 146 |
-
diff_parallel, diff_orthogonal =
|
|
|
|
|
|
|
|
|
|
| 147 |
normalized_update = diff_orthogonal + eta * diff_parallel
|
| 148 |
|
| 149 |
pred = pred_cond if use_original_formulation else pred_uncond
|
|
@@ -358,7 +372,7 @@ class MotifVideoPipeline(DiffusionPipeline):
|
|
| 358 |
A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
|
| 359 |
vae ([`AutoencoderKLWan`]):
|
| 360 |
Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
|
| 361 |
-
text_encoder ([`
|
| 362 |
Primary text encoder for encoding text prompts into embeddings.
|
| 363 |
tokenizer ([`PreTrainedTokenizerBase`]):
|
| 364 |
Tokenizer corresponding to the primary text encoder.
|
|
@@ -379,11 +393,16 @@ class MotifVideoPipeline(DiffusionPipeline):
|
|
| 379 |
FlowUniPCMultistepScheduler,
|
| 380 |
],
|
| 381 |
vae: AutoencoderKLWan,
|
| 382 |
-
text_encoder:
|
| 383 |
tokenizer: PreTrainedTokenizerBase,
|
| 384 |
transformer,
|
| 385 |
guider: Optional[
|
| 386 |
-
Union[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
] = None,
|
| 388 |
feature_extractor: Optional[SiglipImageProcessor] = None,
|
| 389 |
):
|
|
@@ -451,7 +470,7 @@ class MotifVideoPipeline(DiffusionPipeline):
|
|
| 451 |
|
| 452 |
def _get_prompt_embeds(
|
| 453 |
self,
|
| 454 |
-
text_encoder:
|
| 455 |
tokenizer: PreTrainedTokenizerBase,
|
| 456 |
prompt: Union[str, List[str]] | None = None,
|
| 457 |
num_videos_per_prompt: int = 1,
|
|
@@ -471,17 +490,11 @@ class MotifVideoPipeline(DiffusionPipeline):
|
|
| 471 |
"device": device,
|
| 472 |
"dtype": dtype,
|
| 473 |
}
|
| 474 |
-
#
|
| 475 |
-
#
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
# not on .encoder (child). Moving the encoder to the execution device explicitly ensures inputs and
|
| 480 |
-
# weights are on the same device. The parent's offload hook will move text_encoder back to CPU after
|
| 481 |
-
# the next component claims the GPU.
|
| 482 |
-
if next(encoder.parameters()).device != torch.device(device):
|
| 483 |
-
encoder.to(device)
|
| 484 |
-
prompt_embeds_kwargs["text_encoder"] = encoder
|
| 485 |
prompt_embeds, prompt_attention_mask = self._get_default_embeds(**prompt_embeds_kwargs)
|
| 486 |
|
| 487 |
pooled_prompt_embeds = self._average_pool(prompt_embeds, prompt_attention_mask)
|
|
@@ -552,7 +565,7 @@ class MotifVideoPipeline(DiffusionPipeline):
|
|
| 552 |
T5Gemma2 has vision_tower.vision_model structure.
|
| 553 |
Will raise AttributeError if not available.
|
| 554 |
"""
|
| 555 |
-
return self.text_encoder.
|
| 556 |
|
| 557 |
def encode_image(
|
| 558 |
self,
|
|
@@ -662,10 +675,22 @@ class MotifVideoPipeline(DiffusionPipeline):
|
|
| 662 |
|
| 663 |
# Initialize conditioning tensors
|
| 664 |
latent_condition = torch.zeros(
|
| 665 |
-
batch_size,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 666 |
)
|
| 667 |
latent_mask = torch.zeros(
|
| 668 |
-
batch_size,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 669 |
)
|
| 670 |
image_embeds = None
|
| 671 |
|
|
@@ -910,7 +935,9 @@ class MotifVideoPipeline(DiffusionPipeline):
|
|
| 910 |
self,
|
| 911 |
prompt: Union[str, List[str]] | None = None,
|
| 912 |
image=None,
|
| 913 |
-
negative_prompt: Optional[
|
|
|
|
|
|
|
| 914 |
height: int = 736,
|
| 915 |
width: int = 1280,
|
| 916 |
num_frames: int = 121,
|
|
@@ -1066,7 +1093,11 @@ class MotifVideoPipeline(DiffusionPipeline):
|
|
| 1066 |
|
| 1067 |
if self.guider._enabled:
|
| 1068 |
negative_prompt = self._prepare_negative_prompt(negative_prompt, batch_size)
|
| 1069 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1070 |
prompt=negative_prompt,
|
| 1071 |
num_videos_per_prompt=num_videos_per_prompt,
|
| 1072 |
prompt_embeds=negative_prompt_embeds,
|
|
@@ -1123,7 +1154,11 @@ class MotifVideoPipeline(DiffusionPipeline):
|
|
| 1123 |
# Compute sigmas: use linear-quadratic schedule if enabled, otherwise default linear
|
| 1124 |
_is_flow_multistep = isinstance(
|
| 1125 |
self.scheduler,
|
| 1126 |
-
(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1127 |
)
|
| 1128 |
|
| 1129 |
# Compute mu once, shared by both branches (required by FlowUniPCMultistepScheduler)
|
|
@@ -1195,9 +1230,15 @@ class MotifVideoPipeline(DiffusionPipeline):
|
|
| 1195 |
"encoder_hidden_states": (prompt_embeds, negative_prompt_embeds),
|
| 1196 |
}
|
| 1197 |
if use_attention_mask:
|
| 1198 |
-
guider_inputs["encoder_attention_mask"] = (
|
|
|
|
|
|
|
|
|
|
| 1199 |
if self.transformer.config.pooled_projection_dim is not None:
|
| 1200 |
-
guider_inputs["pooled_projections"] = (
|
|
|
|
|
|
|
|
|
|
| 1201 |
if image_embeds is not None:
|
| 1202 |
guider_inputs["image_embeds"] = (image_embeds, image_embeds)
|
| 1203 |
|
|
|
|
| 32 |
UniPCMultistepScheduler,
|
| 33 |
)
|
| 34 |
from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
|
| 35 |
+
from diffusers.guiders.adaptive_projected_guidance import MomentumBuffer
|
| 36 |
+
from diffusers.guiders.guider_utils import GuiderOutput
|
| 37 |
+
from diffusers.utils import (
|
| 38 |
+
BaseOutput,
|
| 39 |
+
is_torch_xla_available,
|
| 40 |
+
logging,
|
| 41 |
+
replace_example_docstring,
|
| 42 |
+
)
|
| 43 |
from diffusers.utils.torch_utils import randn_tensor
|
| 44 |
from diffusers.video_processor import VideoProcessor
|
| 45 |
from einops import rearrange
|
| 46 |
from PIL import Image
|
| 47 |
from torch import Tensor
|
| 48 |
|
| 49 |
+
from transformers import (
|
| 50 |
+
BatchEncoding,
|
| 51 |
+
PreTrainedTokenizerBase,
|
| 52 |
+
SiglipImageProcessor,
|
| 53 |
+
T5Gemma2Encoder,
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
from ._fm_solvers_unipc import FlowUniPCMultistepScheduler
|
|
|
|
| 57 |
|
| 58 |
|
| 59 |
if is_torch_xla_available():
|
|
|
|
| 154 |
v1 = torch.nn.functional.normalize(v1, dim=dim)
|
| 155 |
v0_parallel = (v0 * v1).sum(dim=dim, keepdim=True) * v1
|
| 156 |
v0_orthogonal = v0 - v0_parallel
|
| 157 |
+
diff_parallel, diff_orthogonal = (
|
| 158 |
+
v0_parallel.type_as(diff),
|
| 159 |
+
v0_orthogonal.type_as(diff),
|
| 160 |
+
)
|
| 161 |
normalized_update = diff_orthogonal + eta * diff_parallel
|
| 162 |
|
| 163 |
pred = pred_cond if use_original_formulation else pred_uncond
|
|
|
|
| 372 |
A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
|
| 373 |
vae ([`AutoencoderKLWan`]):
|
| 374 |
Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
|
| 375 |
+
text_encoder ([`T5Gemma2Encoder`]):
|
| 376 |
Primary text encoder for encoding text prompts into embeddings.
|
| 377 |
tokenizer ([`PreTrainedTokenizerBase`]):
|
| 378 |
Tokenizer corresponding to the primary text encoder.
|
|
|
|
| 393 |
FlowUniPCMultistepScheduler,
|
| 394 |
],
|
| 395 |
vae: AutoencoderKLWan,
|
| 396 |
+
text_encoder: T5Gemma2Encoder,
|
| 397 |
tokenizer: PreTrainedTokenizerBase,
|
| 398 |
transformer,
|
| 399 |
guider: Optional[
|
| 400 |
+
Union[
|
| 401 |
+
ClassifierFreeGuidance,
|
| 402 |
+
SkipLayerGuidance,
|
| 403 |
+
AdaptiveProjectedGuidance,
|
| 404 |
+
VideoAdaptiveProjectedGuidance,
|
| 405 |
+
]
|
| 406 |
] = None,
|
| 407 |
feature_extractor: Optional[SiglipImageProcessor] = None,
|
| 408 |
):
|
|
|
|
| 470 |
|
| 471 |
def _get_prompt_embeds(
|
| 472 |
self,
|
| 473 |
+
text_encoder: T5Gemma2Encoder,
|
| 474 |
tokenizer: PreTrainedTokenizerBase,
|
| 475 |
prompt: Union[str, List[str]] | None = None,
|
| 476 |
num_videos_per_prompt: int = 1,
|
|
|
|
| 490 |
"device": device,
|
| 491 |
"dtype": dtype,
|
| 492 |
}
|
| 493 |
+
# When enable_model_cpu_offload() is active, the accelerate forward hook is on text_encoder (parent). Moving the encoder to the execution device explicitly ensures inputs and
|
| 494 |
+
# weights are on the same device. The parent's offload hook will move text_encoder back to CPU after
|
| 495 |
+
# the next component claims the GPU.
|
| 496 |
+
if next(text_encoder.parameters()).device != torch.device(device):
|
| 497 |
+
text_encoder.to(device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 498 |
prompt_embeds, prompt_attention_mask = self._get_default_embeds(**prompt_embeds_kwargs)
|
| 499 |
|
| 500 |
pooled_prompt_embeds = self._average_pool(prompt_embeds, prompt_attention_mask)
|
|
|
|
| 565 |
T5Gemma2 has vision_tower.vision_model structure.
|
| 566 |
Will raise AttributeError if not available.
|
| 567 |
"""
|
| 568 |
+
return self.text_encoder.vision_tower.vision_model
|
| 569 |
|
| 570 |
def encode_image(
|
| 571 |
self,
|
|
|
|
| 675 |
|
| 676 |
# Initialize conditioning tensors
|
| 677 |
latent_condition = torch.zeros(
|
| 678 |
+
batch_size,
|
| 679 |
+
lantent_channels,
|
| 680 |
+
latent_num_frames,
|
| 681 |
+
latent_height,
|
| 682 |
+
latent_width,
|
| 683 |
+
device=device,
|
| 684 |
+
dtype=dtype,
|
| 685 |
)
|
| 686 |
latent_mask = torch.zeros(
|
| 687 |
+
batch_size,
|
| 688 |
+
1,
|
| 689 |
+
latent_num_frames,
|
| 690 |
+
latent_height,
|
| 691 |
+
latent_width,
|
| 692 |
+
device=device,
|
| 693 |
+
dtype=dtype,
|
| 694 |
)
|
| 695 |
image_embeds = None
|
| 696 |
|
|
|
|
| 935 |
self,
|
| 936 |
prompt: Union[str, List[str]] | None = None,
|
| 937 |
image=None,
|
| 938 |
+
negative_prompt: Optional[
|
| 939 |
+
Union[str, List[str]]
|
| 940 |
+
] = "text overlay, graphic overlay, watermark, logo, subtitles, timestamp, broadcast graphics, UI elements, random letters, frozen pose, rigid, static expression, jerky motion, mechanical motion, discontinuous motion, flat framing, depthless, dull lighting, monotone, crushed shadows, blown-out highlights, shifting background, fading background, poor continuity, identity drift, deformation, flickering, ghosting, smearing, duplication, mutated proportions, inconsistent clothing, flat colors, desaturated, tonally compressed, poor background separation, exposure shift, uneven brightness, color balance shift",
|
| 941 |
height: int = 736,
|
| 942 |
width: int = 1280,
|
| 943 |
num_frames: int = 121,
|
|
|
|
| 1093 |
|
| 1094 |
if self.guider._enabled:
|
| 1095 |
negative_prompt = self._prepare_negative_prompt(negative_prompt, batch_size)
|
| 1096 |
+
(
|
| 1097 |
+
negative_prompt_embeds,
|
| 1098 |
+
negative_pooled_prompt_embeds,
|
| 1099 |
+
negative_prompt_attention_mask,
|
| 1100 |
+
) = self.encode_prompt(
|
| 1101 |
prompt=negative_prompt,
|
| 1102 |
num_videos_per_prompt=num_videos_per_prompt,
|
| 1103 |
prompt_embeds=negative_prompt_embeds,
|
|
|
|
| 1154 |
# Compute sigmas: use linear-quadratic schedule if enabled, otherwise default linear
|
| 1155 |
_is_flow_multistep = isinstance(
|
| 1156 |
self.scheduler,
|
| 1157 |
+
(
|
| 1158 |
+
DPMSolverMultistepScheduler,
|
| 1159 |
+
UniPCMultistepScheduler,
|
| 1160 |
+
FlowUniPCMultistepScheduler,
|
| 1161 |
+
),
|
| 1162 |
)
|
| 1163 |
|
| 1164 |
# Compute mu once, shared by both branches (required by FlowUniPCMultistepScheduler)
|
|
|
|
| 1230 |
"encoder_hidden_states": (prompt_embeds, negative_prompt_embeds),
|
| 1231 |
}
|
| 1232 |
if use_attention_mask:
|
| 1233 |
+
guider_inputs["encoder_attention_mask"] = (
|
| 1234 |
+
prompt_attention_mask,
|
| 1235 |
+
negative_prompt_attention_mask,
|
| 1236 |
+
)
|
| 1237 |
if self.transformer.config.pooled_projection_dim is not None:
|
| 1238 |
+
guider_inputs["pooled_projections"] = (
|
| 1239 |
+
pooled_prompt_embeds,
|
| 1240 |
+
negative_pooled_prompt_embeds,
|
| 1241 |
+
)
|
| 1242 |
if image_embeds is not None:
|
| 1243 |
guider_inputs["image_embeds"] = (image_embeds, image_embeds)
|
| 1244 |
|
text_encoder/config.json
CHANGED
|
@@ -1,23 +1,36 @@
|
|
| 1 |
{
|
| 2 |
"architectures": [
|
| 3 |
-
"
|
| 4 |
],
|
| 5 |
"attention_dropout": 0.0,
|
| 6 |
-
"
|
| 7 |
-
"
|
| 8 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
"_sliding_window_pattern": 6,
|
|
|
|
| 10 |
"attention_bias": false,
|
| 11 |
"attention_dropout": 0.0,
|
| 12 |
"attn_logit_softcapping": null,
|
|
|
|
|
|
|
|
|
|
| 13 |
"dropout_rate": 0.0,
|
| 14 |
"dtype": "bfloat16",
|
|
|
|
| 15 |
"final_logit_softcapping": null,
|
|
|
|
| 16 |
"head_dim": 256,
|
| 17 |
"hidden_activation": "gelu_pytorch_tanh",
|
| 18 |
"hidden_size": 2560,
|
| 19 |
"initializer_range": 0.02,
|
| 20 |
"intermediate_size": 10240,
|
|
|
|
| 21 |
"layer_types": [
|
| 22 |
"sliding_attention",
|
| 23 |
"sliding_attention",
|
|
@@ -55,10 +68,12 @@
|
|
| 55 |
"sliding_attention"
|
| 56 |
],
|
| 57 |
"max_position_embeddings": 131072,
|
| 58 |
-
"model_type": "
|
| 59 |
"num_attention_heads": 8,
|
| 60 |
"num_hidden_layers": 34,
|
| 61 |
"num_key_value_heads": 4,
|
|
|
|
|
|
|
| 62 |
"query_pre_attn_scalar": 256,
|
| 63 |
"rms_norm_eps": 1e-06,
|
| 64 |
"rope_parameters": {
|
|
@@ -72,181 +87,48 @@
|
|
| 72 |
"rope_type": "default"
|
| 73 |
}
|
| 74 |
},
|
|
|
|
| 75 |
"sliding_window": 1024,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
"use_bidirectional_attention": false,
|
| 77 |
"use_cache": true,
|
| 78 |
"vocab_size": 262144
|
| 79 |
},
|
| 80 |
-
"
|
| 81 |
-
"
|
| 82 |
-
"
|
|
|
|
| 83 |
"attention_dropout": 0.0,
|
| 84 |
-
"
|
|
|
|
|
|
|
| 85 |
"dropout_rate": 0.0,
|
| 86 |
"dtype": "bfloat16",
|
| 87 |
-
"
|
| 88 |
-
"
|
| 89 |
-
"
|
| 90 |
-
"
|
| 91 |
-
"
|
| 92 |
-
"
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
"finetuning_task": null,
|
| 109 |
-
"head_dim": 256,
|
| 110 |
-
"hidden_activation": "gelu_pytorch_tanh",
|
| 111 |
-
"hidden_size": 2560,
|
| 112 |
-
"id2label": {
|
| 113 |
-
"0": "LABEL_0",
|
| 114 |
-
"1": "LABEL_1"
|
| 115 |
-
},
|
| 116 |
-
"initializer_range": 0.02,
|
| 117 |
-
"intermediate_size": 10240,
|
| 118 |
-
"is_decoder": false,
|
| 119 |
-
"is_encoder_decoder": false,
|
| 120 |
-
"label2id": {
|
| 121 |
-
"LABEL_0": 0,
|
| 122 |
-
"LABEL_1": 1
|
| 123 |
-
},
|
| 124 |
-
"layer_types": [
|
| 125 |
-
"sliding_attention",
|
| 126 |
-
"sliding_attention",
|
| 127 |
-
"sliding_attention",
|
| 128 |
-
"sliding_attention",
|
| 129 |
-
"sliding_attention",
|
| 130 |
-
"full_attention",
|
| 131 |
-
"sliding_attention",
|
| 132 |
-
"sliding_attention",
|
| 133 |
-
"sliding_attention",
|
| 134 |
-
"sliding_attention",
|
| 135 |
-
"sliding_attention",
|
| 136 |
-
"full_attention",
|
| 137 |
-
"sliding_attention",
|
| 138 |
-
"sliding_attention",
|
| 139 |
-
"sliding_attention",
|
| 140 |
-
"sliding_attention",
|
| 141 |
-
"sliding_attention",
|
| 142 |
-
"full_attention",
|
| 143 |
-
"sliding_attention",
|
| 144 |
-
"sliding_attention",
|
| 145 |
-
"sliding_attention",
|
| 146 |
-
"sliding_attention",
|
| 147 |
-
"sliding_attention",
|
| 148 |
-
"full_attention",
|
| 149 |
-
"sliding_attention",
|
| 150 |
-
"sliding_attention",
|
| 151 |
-
"sliding_attention",
|
| 152 |
-
"sliding_attention",
|
| 153 |
-
"sliding_attention",
|
| 154 |
-
"full_attention",
|
| 155 |
-
"sliding_attention",
|
| 156 |
-
"sliding_attention",
|
| 157 |
-
"sliding_attention",
|
| 158 |
-
"sliding_attention"
|
| 159 |
-
],
|
| 160 |
-
"max_position_embeddings": 131072,
|
| 161 |
-
"model_type": "t5gemma2_text",
|
| 162 |
-
"num_attention_heads": 8,
|
| 163 |
-
"num_hidden_layers": 34,
|
| 164 |
-
"num_key_value_heads": 4,
|
| 165 |
-
"output_attentions": false,
|
| 166 |
-
"output_hidden_states": false,
|
| 167 |
-
"pad_token_id": 0,
|
| 168 |
-
"prefix": null,
|
| 169 |
-
"problem_type": null,
|
| 170 |
-
"query_pre_attn_scalar": 256,
|
| 171 |
-
"return_dict": true,
|
| 172 |
-
"rms_norm_eps": 1e-06,
|
| 173 |
-
"rope_parameters": {
|
| 174 |
-
"full_attention": {
|
| 175 |
-
"factor": 8.0,
|
| 176 |
-
"rope_theta": 1000000,
|
| 177 |
-
"rope_type": "linear"
|
| 178 |
-
},
|
| 179 |
-
"sliding_attention": {
|
| 180 |
-
"rope_theta": 10000,
|
| 181 |
-
"rope_type": "default"
|
| 182 |
-
}
|
| 183 |
-
},
|
| 184 |
-
"sep_token_id": null,
|
| 185 |
-
"sliding_window": 1024,
|
| 186 |
-
"task_specific_params": null,
|
| 187 |
-
"tie_encoder_decoder": false,
|
| 188 |
-
"tie_word_embeddings": true,
|
| 189 |
-
"tokenizer_class": null,
|
| 190 |
-
"use_bidirectional_attention": false,
|
| 191 |
-
"use_cache": true,
|
| 192 |
-
"vocab_size": 262144
|
| 193 |
-
},
|
| 194 |
-
"vision_config": {
|
| 195 |
-
"_name_or_path": "",
|
| 196 |
-
"add_cross_attention": false,
|
| 197 |
-
"architectures": null,
|
| 198 |
-
"attention_dropout": 0.0,
|
| 199 |
-
"bos_token_id": null,
|
| 200 |
-
"chunk_size_feed_forward": 0,
|
| 201 |
-
"cross_attention_hidden_size": null,
|
| 202 |
-
"decoder_start_token_id": null,
|
| 203 |
-
"dropout_rate": 0.0,
|
| 204 |
-
"dtype": "bfloat16",
|
| 205 |
-
"eos_token_id": null,
|
| 206 |
-
"finetuning_task": null,
|
| 207 |
-
"hidden_act": "gelu_pytorch_tanh",
|
| 208 |
-
"hidden_size": 1152,
|
| 209 |
-
"id2label": {
|
| 210 |
-
"0": "LABEL_0",
|
| 211 |
-
"1": "LABEL_1"
|
| 212 |
-
},
|
| 213 |
-
"image_size": 896,
|
| 214 |
-
"intermediate_size": 4304,
|
| 215 |
-
"is_decoder": false,
|
| 216 |
-
"is_encoder_decoder": false,
|
| 217 |
-
"label2id": {
|
| 218 |
-
"LABEL_0": 0,
|
| 219 |
-
"LABEL_1": 1
|
| 220 |
-
},
|
| 221 |
-
"layer_norm_eps": 1e-06,
|
| 222 |
-
"model_type": "siglip_vision_model",
|
| 223 |
-
"num_attention_heads": 16,
|
| 224 |
-
"num_channels": 3,
|
| 225 |
-
"num_hidden_layers": 27,
|
| 226 |
-
"output_attentions": false,
|
| 227 |
-
"output_hidden_states": false,
|
| 228 |
-
"pad_token_id": null,
|
| 229 |
-
"patch_size": 14,
|
| 230 |
-
"prefix": null,
|
| 231 |
-
"problem_type": null,
|
| 232 |
-
"return_dict": true,
|
| 233 |
-
"sep_token_id": null,
|
| 234 |
-
"task_specific_params": null,
|
| 235 |
-
"tie_encoder_decoder": false,
|
| 236 |
-
"tie_word_embeddings": true,
|
| 237 |
-
"tokenizer_class": null,
|
| 238 |
-
"vision_use_head": false,
|
| 239 |
-
"vocab_size": 262144
|
| 240 |
-
},
|
| 241 |
"vocab_size": 262144
|
| 242 |
},
|
| 243 |
-
"eoi_token_index": 256000,
|
| 244 |
-
"eos_token_id": 1,
|
| 245 |
-
"image_token_index": 256001,
|
| 246 |
-
"initializer_range": 0.02,
|
| 247 |
-
"is_encoder_decoder": true,
|
| 248 |
-
"model_type": "t5gemma2",
|
| 249 |
-
"pad_token_id": 0,
|
| 250 |
-
"transformers_version": "5.0.0rc1",
|
| 251 |
"vocab_size": 262144
|
| 252 |
-
}
|
|
|
|
| 1 |
{
|
| 2 |
"architectures": [
|
| 3 |
+
"T5Gemma2Encoder"
|
| 4 |
],
|
| 5 |
"attention_dropout": 0.0,
|
| 6 |
+
"boi_token_index": 255999,
|
| 7 |
+
"dropout_rate": 0.0,
|
| 8 |
+
"dtype": "bfloat16",
|
| 9 |
+
"eoi_token_index": 256000,
|
| 10 |
+
"image_token_index": 256001,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"mm_tokens_per_image": 256,
|
| 13 |
+
"model_type": "t5gemma2_encoder",
|
| 14 |
+
"text_config": {
|
| 15 |
"_sliding_window_pattern": 6,
|
| 16 |
+
"add_cross_attention": false,
|
| 17 |
"attention_bias": false,
|
| 18 |
"attention_dropout": 0.0,
|
| 19 |
"attn_logit_softcapping": null,
|
| 20 |
+
"bos_token_id": 2,
|
| 21 |
+
"cross_attention_hidden_size": null,
|
| 22 |
+
"decoder_start_token_id": null,
|
| 23 |
"dropout_rate": 0.0,
|
| 24 |
"dtype": "bfloat16",
|
| 25 |
+
"eos_token_id": 1,
|
| 26 |
"final_logit_softcapping": null,
|
| 27 |
+
"finetuning_task": null,
|
| 28 |
"head_dim": 256,
|
| 29 |
"hidden_activation": "gelu_pytorch_tanh",
|
| 30 |
"hidden_size": 2560,
|
| 31 |
"initializer_range": 0.02,
|
| 32 |
"intermediate_size": 10240,
|
| 33 |
+
"is_decoder": false,
|
| 34 |
"layer_types": [
|
| 35 |
"sliding_attention",
|
| 36 |
"sliding_attention",
|
|
|
|
| 68 |
"sliding_attention"
|
| 69 |
],
|
| 70 |
"max_position_embeddings": 131072,
|
| 71 |
+
"model_type": "t5gemma2_text",
|
| 72 |
"num_attention_heads": 8,
|
| 73 |
"num_hidden_layers": 34,
|
| 74 |
"num_key_value_heads": 4,
|
| 75 |
+
"pad_token_id": 0,
|
| 76 |
+
"prefix": null,
|
| 77 |
"query_pre_attn_scalar": 256,
|
| 78 |
"rms_norm_eps": 1e-06,
|
| 79 |
"rope_parameters": {
|
|
|
|
| 87 |
"rope_type": "default"
|
| 88 |
}
|
| 89 |
},
|
| 90 |
+
"sep_token_id": null,
|
| 91 |
"sliding_window": 1024,
|
| 92 |
+
"task_specific_params": null,
|
| 93 |
+
"tie_encoder_decoder": false,
|
| 94 |
+
"tie_word_embeddings": true,
|
| 95 |
+
"tokenizer_class": null,
|
| 96 |
"use_bidirectional_attention": false,
|
| 97 |
"use_cache": true,
|
| 98 |
"vocab_size": 262144
|
| 99 |
},
|
| 100 |
+
"tie_word_embeddings": true,
|
| 101 |
+
"transformers_version": "5.5.4",
|
| 102 |
+
"vision_config": {
|
| 103 |
+
"add_cross_attention": false,
|
| 104 |
"attention_dropout": 0.0,
|
| 105 |
+
"bos_token_id": null,
|
| 106 |
+
"cross_attention_hidden_size": null,
|
| 107 |
+
"decoder_start_token_id": null,
|
| 108 |
"dropout_rate": 0.0,
|
| 109 |
"dtype": "bfloat16",
|
| 110 |
+
"eos_token_id": null,
|
| 111 |
+
"finetuning_task": null,
|
| 112 |
+
"hidden_act": "gelu_pytorch_tanh",
|
| 113 |
+
"hidden_size": 1152,
|
| 114 |
+
"image_size": 896,
|
| 115 |
+
"intermediate_size": 4304,
|
| 116 |
+
"is_decoder": false,
|
| 117 |
+
"layer_norm_eps": 1e-06,
|
| 118 |
+
"model_type": "siglip_vision_model",
|
| 119 |
+
"num_attention_heads": 16,
|
| 120 |
+
"num_channels": 3,
|
| 121 |
+
"num_hidden_layers": 27,
|
| 122 |
+
"pad_token_id": null,
|
| 123 |
+
"patch_size": 14,
|
| 124 |
+
"prefix": null,
|
| 125 |
+
"sep_token_id": null,
|
| 126 |
+
"task_specific_params": null,
|
| 127 |
+
"tie_encoder_decoder": false,
|
| 128 |
+
"tie_word_embeddings": true,
|
| 129 |
+
"tokenizer_class": null,
|
| 130 |
+
"vision_use_head": false,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
"vocab_size": 262144
|
| 132 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
"vocab_size": 262144
|
| 134 |
+
}
|
text_encoder/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2957deadcb660bb6e411a88c4f8860c5972f7f4eb856ac520d2628d1e225359f
|
| 3 |
+
size 8599946488
|
transformer/config.json
CHANGED
|
@@ -4,7 +4,6 @@
|
|
| 4 |
"_library": "diffusers",
|
| 5 |
"attention_head_dim": 128,
|
| 6 |
"base_latent_size": null,
|
| 7 |
-
"image_condition_type": null,
|
| 8 |
"image_embed_dim": 1152,
|
| 9 |
"in_channels": 33,
|
| 10 |
"mlp_ratio": 4.0,
|
|
|
|
| 4 |
"_library": "diffusers",
|
| 5 |
"attention_head_dim": 128,
|
| 6 |
"base_latent_size": null,
|
|
|
|
| 7 |
"image_embed_dim": 1152,
|
| 8 |
"in_channels": 33,
|
| 9 |
"mlp_ratio": 4.0,
|