Reza2kn committed on Jun 2

Commit

4a29951

verified ·

1 Parent(s): e139166

Upload folder using huggingface_hub

Browse files

Files changed (35) hide show

.gitattributes +9 -0
README.md +66 -0
mlx_cosmos3.py +193 -0
mlx_pipeline.py +113 -0
model_index.json +28 -0
samples/anime.png +3 -0
samples/barista.png +3 -0
samples/city.png +3 -0
samples/food.png +3 -0
samples/panda.png +3 -0
samples/portrait.png +3 -0
samples/t2v_f0.png +0 -0
samples/t2v_f16.png +3 -0
samples/t2v_f8.png +3 -0
samples/t2v_waves.mp4 +0 -0
scheduler/scheduler_config.json +33 -0
sound_tokenizer/config.json +64 -0
sound_tokenizer/diffusion_pytorch_model.safetensors +3 -0
text_tokenizer/added_tokens.json +28 -0
text_tokenizer/chat_template.jinja +120 -0
text_tokenizer/merges.txt +0 -0
text_tokenizer/special_tokens_map.json +31 -0
text_tokenizer/tokenizer.json +3 -0
text_tokenizer/tokenizer_config.json +239 -0
text_tokenizer/vocab.json +0 -0
transformer/mlx_quant_config.json +474 -0
transformer/model-00001-of-00007.safetensors +3 -0
transformer/model-00002-of-00007.safetensors +3 -0
transformer/model-00003-of-00007.safetensors +3 -0
transformer/model-00004-of-00007.safetensors +3 -0
transformer/model-00005-of-00007.safetensors +3 -0
transformer/model-00006-of-00007.safetensors +3 -0
transformer/model-00007-of-00007.safetensors +3 -0
vae/config.json +129 -0
vae/diffusion_pytorch_model.safetensors +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+samples/anime.png filter=lfs diff=lfs merge=lfs -text
+samples/barista.png filter=lfs diff=lfs merge=lfs -text
+samples/city.png filter=lfs diff=lfs merge=lfs -text
+samples/food.png filter=lfs diff=lfs merge=lfs -text
+samples/panda.png filter=lfs diff=lfs merge=lfs -text
+samples/portrait.png filter=lfs diff=lfs merge=lfs -text
+samples/t2v_f16.png filter=lfs diff=lfs merge=lfs -text
+samples/t2v_f8.png filter=lfs diff=lfs merge=lfs -text
+text_tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,66 @@

+---
+license: other
+license_name: openmdw-1.1
+license_link: https://openmdw.ai/license/1-1/
+base_model: nvidia/Cosmos3-Nano
+base_model_relation: quantized
+library_name: mlx
+pipeline_tag: text-to-image
+tags: [cosmos, cosmos3, mlx, apple-silicon, 4-bit, quantization, text-to-image]
+---
+# Cosmos3-Nano — MLX 4-bit (Apple Silicon)
+A **4-bit MLX** build of [`nvidia/Cosmos3-Nano`](https://huggingface.co/nvidia/Cosmos3-Nano) that
+**runs on Apple Silicon** — not just quantized weights, a working text2image model. The custom
+Cosmos3 omni-MoT diffusion transformer was ported to MLX from scratch (no `mlx-vlm` support exists
+for this architecture) and every block was validated against the reference torch implementation.
+> Derivative of `nvidia/Cosmos3-Nano`. © NVIDIA. Distributed under **OpenMDW-1.1** (license + NVIDIA
+> copyright/origin notices retained). Not affiliated with, nor endorsed by, NVIDIA.
+## Highlights
+- **Transformer: 30.3 GB bf16 → 12.1 GB MLX-4bit** (468 attn+MLP linears quantized, group-64; embeddings/norms/lm_head kept bf16).
+- **Runs ~11 GB peak** — fits a 16 GB Mac. ~12 s for a 256² image (M2 Ultra), longer at higher res.
+- **Validated:** every module matches torch — primitives ~1e-6, full decoder layer ~1e-3 (bf16), patchify bit-exact.
+## Usage
+```python
+import torch
+from huggingface_hub import snapshot_download
+from mlx_pipeline import MLXCosmos3Transformer        # included in this repo
+from diffusers import Cosmos3OmniPipeline, AutoencoderKLWan, UniPCMultistepScheduler
+from diffusers.models.autoencoders.autoencoder_cosmos3_audio import Cosmos3AVAEAudioTokenizer
+from transformers import AutoTokenizer
+repo = snapshot_download("Reza2kn/Cosmos3-Nano-MLX-4bit")
+vae = AutoencoderKLWan.from_pretrained(repo, subfolder="vae", torch_dtype=torch.float32).eval()
+sched = UniPCMultistepScheduler.from_pretrained(repo, subfolder="scheduler")
+tok = AutoTokenizer.from_pretrained(repo, subfolder="text_tokenizer")
+st = Cosmos3AVAEAudioTokenizer.from_pretrained(repo, subfolder="sound_tokenizer", torch_dtype=torch.float32).eval()
+pipe = Cosmos3OmniPipeline(transformer=MLXCosmos3Transformer(repo + "/transformer"),
+        text_tokenizer=tok, vae=vae, scheduler=sched, sound_tokenizer=st, enable_safety_checker=False)
+img = pipe("A red panda astronaut floating in a nebula", num_frames=1,
+           height=384, width=384, num_inference_steps=24).video[0][0]
+img.save("out.png")
+```
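+**Requires:** `mlx`, `diffusers` (git main / ≥0.39 for Cosmos3), `transformers`, `torch` (VAE/scheduler only). The
+heavy 16B transformer runs in MLX on the GPU; the small VAE/scheduler/tokenizer run in torch.
+`MLXCosmos3Transformer` additionally reads the base model's `transformer/config.json`, which is not among the
+files in this upload; `mlx_pipeline.py` looks for it under its hard-coded `NANO` path, so point `NANO` at a
+local copy of `nvidia/Cosmos3-Nano` before running.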
+## Quality (honest)
+Same profile as any 4-bit build: **clean on typical content** (portraits, scenes, objects, food —
+see `samples/`), but **4-bit defects appear on hard anatomy** — e.g. fused/mangled **hands**
+(`samples/barista.png`) and broken limbs in complex poses (`samples/anime.png`). PickScore (mean
+**21.42**, vs the CUDA builds' ~21.8) does **not** reliably catch these — eyeball the hard cases.
+Use FP8/BF16 if you need hands/complex anatomy to hold up.
+## Status / honesty
+- **text2image: working** (`samples/*.png`), with the 4-bit anatomy caveats above.
+- **text2video: working** (`samples/t2v_waves.mp4`, `num_frames>1`).
+- **image2video / audio:** not implemented yet (image-conditioning + sound paths).
+- Quantization is 4-bit weight-only — near-original on typical content, with the usual 4-bit wobble on the
+  hardest cases (dense hands, on-image text), same as any 4-bit build.
+## How it was built
+`mlx_cosmos3.py` (validated MLX modules), `mlx_pipeline.py` (torch wrapper routing the transformer forward to MLX
+while reusing torch tokenizer/UniPC/VAE/CFG). Quantized with `mx.quantize` (group-64, 4-bit), streamed shard-by-shard.

mlx_cosmos3.py ADDED Viewed

	@@ -0,0 +1,193 @@

+"""MLX port of the Cosmos3-Nano omni transformer. Built module-by-module, each validated
+against the torch reference (validate_primitives.py). Runs the MLX 4-bit weights produced by
+mlx_quant.py. WIP — primitives + attention first, then full transformer + pipeline glue."""
+import mlx.core as mx
+import mlx.nn as nn
+QGROUP = 64
+QBITS = 4  # set by loader from mlx_quant_config.json
+def rms_norm(x, weight, eps):
+    # matches diffusers RMSNorm: variance in float32, scale, then * weight
+    xf = x.astype(mx.float32)
+    var = mx.mean(xf * xf, axis=-1, keepdims=True)
+    xf = xf * mx.rsqrt(var + eps)
+    return (weight * xf.astype(x.dtype)) if weight is not None else xf.astype(x.dtype)
+def silu(x):
+    return x * mx.sigmoid(x)
+def swiglu_mlp(x, gate_w, up_w, down_w):
+    # down(silu(gate(x)) * up(x)); weights are [out,in] (torch Linear) -> x @ w.T
+    g = silu(x @ gate_w.T)
+    u = x @ up_w.T
+    return (g * u) @ down_w.T
+def rotate_half(x):
+    half = x.shape[-1] // 2
+    return mx.concatenate([-x[..., half:], x[..., :half]], axis=-1)
+def apply_rope(x, cos, sin):
+    # x: [N, heads, head_dim]; cos/sin: [N, head_dim] -> unsqueeze head axis
+    cos = mx.expand_dims(cos, 1)
+    sin = mx.expand_dims(sin, 1)
+    return x * cos + rotate_half(x) * sin
+class RoPE3D:  # builds cos/sin rotary tables from 3-axis position ids
+    """Cosmos3VLTextRotaryEmbedding: interleaved 3D mRoPE."""
+    def __init__(self, head_dim, rope_theta, rope_axes_dim):  # rope_axes_dim: per-axis frequency counts
+        self.inv_freq = 1.0 / (rope_theta ** (mx.arange(0, head_dim, 2).astype(mx.float32) / head_dim))  # head_dim//2 inverse frequencies
+        self.rope_axes_dim = rope_axes_dim  # e.g. [24,20,20]
+    def _interleave(self, freqs):  # merges the three per-axis tables into one
+        # freqs: [3, N, head_dim//2] -> [N, head_dim//2] interleaving H,W into T grid
+        freqs_t = freqs[0]  # start from the axis-0 table; selected columns are overwritten below
+        for dim, offset in ((1, 1), (2, 2)):  # (axis idx, start offset)
+            length = self.rope_axes_dim[dim] * 3  # exclusive upper bound of the stride-3 column range for this axis
+            idx = mx.arange(offset, length, 3)  # columns offset, offset+3, ... < length
+            # assign freqs_t[..., idx] = freqs[dim][..., idx]
+            sel = freqs[dim][..., idx]
+            freqs_t[..., idx] = sel  # NOTE(review): relies on freqs[0] not aliasing freqs under in-place update — confirm for MLX
+        return freqs_t
+    def __call__(self, position_ids):  # returns (cos, sin), each [N, d] per the shape comments below
+        # position_ids: [3, N]
+        pid = position_ids.astype(mx.float32)                     # [3, N]
+        inv = self.inv_freq[None, :, None]                        # [1, d/2, 1]
+        inv = mx.broadcast_to(inv, (3, inv.shape[1], 1))          # [3, d/2, 1]
+        pe = pid[:, None, :]                                      # [3, 1, N]
+        freqs = mx.transpose(inv @ pe, (0, 2, 1))                 # [3, N, d/2]; batched outer product position * inv_freq
+        freqs = self._interleave(freqs)                          # [N, d/2]
+        emb = mx.concatenate([freqs, freqs], axis=-1)            # [N, d]; same angles duplicated for both halves
+        return mx.cos(emb), mx.sin(emb)
+def gqa_attention(q, k, v, n_heads, n_kv_heads, causal):  # grouped-query attention over unbatched sequences
+    # q:[N,H,D] k,v:[M,Hkv,D]. expand kv groups, scaled-dot-product.
+    N, H, D = q.shape
+    M = k.shape[0]
+    rep = n_heads // n_kv_heads  # number of query heads served by each kv head (floor division)
+    k = mx.repeat(k, rep, axis=1)   # [M, H, D]
+    v = mx.repeat(v, rep, axis=1)
+    q = mx.transpose(q, (1, 0, 2))  # [H, N, D]
+    k = mx.transpose(k, (1, 0, 2))  # [H, M, D]
+    v = mx.transpose(v, (1, 0, 2))
+    scale = 1.0 / (D ** 0.5)
+    scores = (q @ mx.transpose(k, (0, 2, 1))) * scale  # [H, N, M]
+    if causal:
+        mask = mx.triu(mx.full((N, M), -1e9, dtype=scores.dtype), k=1)  # -1e9 strictly above the main diagonal, 0 elsewhere
+        scores = scores + mask  # mask is added in scores.dtype, before the float32 cast below
+    w = mx.softmax(scores.astype(mx.float32), axis=-1).astype(v.dtype)  # softmax in float32, then cast to v's dtype
+    out = w @ v                      # [H, N, D]
+    return mx.transpose(out, (1, 0, 2))  # [N, H, D]
+def dual_attention(und_seq, gen_seq, rope, w, n_heads, n_kv_heads, head_dim, eps):  # w: dict of plain [out,in] weight arrays (uses `@ .T`, not `linear`)
+    """Cosmos3AttnProcessor in MLX: und=causal self-attn, gen=full attn over [und+gen] kv."""
+    cos_u, sin_u, cos_g, sin_g = rope  # separate rotary tables for the und and gen streams
+    q_u = (und_seq @ w['to_q'].T).reshape(-1, n_heads, head_dim)  # und stream uses the to_* projections
+    k_u = (und_seq @ w['to_k'].T).reshape(-1, n_kv_heads, head_dim)
+    v_u = (und_seq @ w['to_v'].T).reshape(-1, n_kv_heads, head_dim)
+    q_g = (gen_seq @ w['add_q_proj'].T).reshape(-1, n_heads, head_dim)  # gen stream uses the add_*_proj projections
+    k_g = (gen_seq @ w['add_k_proj'].T).reshape(-1, n_kv_heads, head_dim)
+    v_g = (gen_seq @ w['add_v_proj'].T).reshape(-1, n_kv_heads, head_dim)
+    q_u = rms_norm(q_u, w['norm_q'], eps); k_u = rms_norm(k_u, w['norm_k'], eps)  # per-head q/k RMS norm before RoPE
+    q_g = rms_norm(q_g, w['norm_added_q'], eps); k_g = rms_norm(k_g, w['norm_added_k'], eps)
+    q_u = apply_rope(q_u, cos_u, sin_u); k_u = apply_rope(k_u, cos_u, sin_u)
+    q_g = apply_rope(q_g, cos_g, sin_g); k_g = apply_rope(k_g, cos_g, sin_g)
+    causal_out = gqa_attention(q_u, k_u, v_u, n_heads, n_kv_heads, causal=True).reshape(-1, n_heads * head_dim)  # und attends only to und, causally
+    all_k = mx.concatenate([k_u, k_g], axis=0); all_v = mx.concatenate([v_u, v_g], axis=0)  # kv order: und first, then gen
+    full_out = gqa_attention(q_g, all_k, all_v, n_heads, n_kv_heads, causal=False).reshape(-1, n_heads * head_dim)  # gen attends to all und+gen keys, unmasked
+    return causal_out @ w['to_out'].T, full_out @ w['to_add_out'].T  # (und output, gen output), each through its own out projection
+# ---- timestep embedding (diffusers Timesteps + TimestepEmbedding) ----
+def get_timestep_embedding(timesteps, dim=256, max_period=10000, downscale_freq_shift=0.0):
+    half = dim // 2
+    exponent = -mx.log(mx.array(float(max_period))) * mx.arange(half).astype(mx.float32)
+    exponent = exponent / (half - downscale_freq_shift)
+    emb = mx.exp(exponent)
+    emb = timesteps.astype(mx.float32)[:, None] * emb[None, :]
+    # flip_sin_to_cos=True -> [cos, sin]
+    return mx.concatenate([mx.cos(emb), mx.sin(emb)], axis=-1)
+def timestep_embedder(t_emb, l1_w, l1_b, l2_w, l2_b):
+    h = silu(t_emb @ l1_w.T + l1_b)
+    return h @ l2_w.T + l2_b
+# ---- linear that accepts bf16 weight (mx array) or quantized tuple (wq, scales, biases) ----
+def linear(x, w, bias=None, group_size=None, bits=None):
+    if isinstance(w, tuple):
+        wq, scales, biases = w
+        out = mx.quantized_matmul(x, wq, scales, biases, transpose=True,
+                                  group_size=group_size or QGROUP, bits=bits or QBITS)
+    else:
+        out = x @ w.T
+    return out + bias if bias is not None else out
+def decoder_layer(und, gen, rope, P, cfg):  # returns the updated (und, gen) hidden states
+    """One Cosmos3VLTextMoTDecoderLayer in MLX. P = dict of this layer's params (mx arrays or
+    quantized tuples). cfg = (n_heads, n_kv, head_dim, eps)."""
+    NH, NKV, HD, EPS = cfg
+    und_n = rms_norm(und, P["input_layernorm.weight"], EPS)  # pre-attention norm, und stream
+    gen_n = rms_norm(gen, P["input_layernorm_moe_gen.weight"], EPS)  # pre-attention norm, gen stream (separate weights)
+    cos_u, sin_u, cos_g, sin_g = rope
+    def proj(seq, name, nh):  # project then split into [tokens, nh, HD]
+        return linear(seq, P[name]).reshape(-1, nh, HD)
+    q_u = proj(und_n, "self_attn.to_q.weight", NH); k_u = proj(und_n, "self_attn.to_k.weight", NKV); v_u = proj(und_n, "self_attn.to_v.weight", NKV)  # und q/k/v
+    q_g = proj(gen_n, "self_attn.add_q_proj.weight", NH); k_g = proj(gen_n, "self_attn.add_k_proj.weight", NKV); v_g = proj(gen_n, "self_attn.add_v_proj.weight", NKV)  # gen q/k/v
+    q_u = rms_norm(q_u, P["self_attn.norm_q.weight"], EPS); k_u = rms_norm(k_u, P["self_attn.norm_k.weight"], EPS)  # q/k norm before RoPE
+    q_g = rms_norm(q_g, P["self_attn.norm_added_q.weight"], EPS); k_g = rms_norm(k_g, P["self_attn.norm_added_k.weight"], EPS)
+    q_u = apply_rope(q_u, cos_u, sin_u); k_u = apply_rope(k_u, cos_u, sin_u)
+    q_g = apply_rope(q_g, cos_g, sin_g); k_g = apply_rope(k_g, cos_g, sin_g)
+    co = gqa_attention(q_u, k_u, v_u, NH, NKV, True).reshape(-1, NH * HD)  # und: causal self-attention
+    ak = mx.concatenate([k_u, k_g], axis=0); av = mx.concatenate([v_u, v_g], axis=0)  # keys/values: und followed by gen
+    fo = gqa_attention(q_g, ak, av, NH, NKV, False).reshape(-1, NH * HD)  # gen: unmasked attention over und+gen
+    und = und + linear(co, P["self_attn.to_out.weight"])  # residual add, und
+    gen = gen + linear(fo, P["self_attn.to_add_out.weight"])  # residual add, gen
+    und_m = rms_norm(und, P["post_attention_layernorm.weight"], EPS)  # pre-MLP norm, und
+    gen_m = rms_norm(gen, P["post_attention_layernorm_moe_gen.weight"], EPS)  # pre-MLP norm, gen
+    und = und + linear(silu(linear(und_m, P["mlp.gate_proj.weight"])) * linear(und_m, P["mlp.up_proj.weight"]), P["mlp.down_proj.weight"])  # SwiGLU MLP + residual, und
+    gen = gen + linear(silu(linear(gen_m, P["mlp_moe_gen.gate_proj.weight"])) * linear(gen_m, P["mlp_moe_gen.up_proj.weight"]), P["mlp_moe_gen.down_proj.weight"])  # SwiGLU MLP + residual, gen (separate weights)
+    return und, gen
+# ---- patchify / pack / unpatchify (pure-tensor glue; matches torch methods) ----
+def patchify_pack(latent, p, C):  # p: spatial patch size; C: channel count (expected to equal latent.shape[0])
+    """latent [C,T,H,W] -> packed [num_patches, p*p*C], (T, hpat, wpat)."""
+    _, T, H, W = latent.shape
+    Hp = ((H + p - 1) // p) * p; Wp = ((W + p - 1) // p) * p  # round H and W up to multiples of p
+    if Hp != H or Wp != W:
+        pad = mx.zeros((C, T, Hp, Wp), dtype=latent.dtype)  # zero-pad on the bottom/right edges
+        pad[:, :, :H, :W] = latent; latent = pad
+    hpat, wpat = Hp // p, Wp // p  # patch-grid height and width
+    latent = latent.reshape(C, T, hpat, p, wpat, p)
+    latent = mx.einsum("cthpwq->thwpqc", latent).reshape(-1, p * p * C)  # one row per (t, h, w) patch; features ordered (p_row, p_col, channel)
+    return latent, (T, hpat, wpat)
+def unpatchify(packed, token_shape, orig_hw, p, C):
+    """packed [num_patches, p*p*C] -> latent [C, T, H, W]."""
+    T, hpat, wpat = token_shape
+    H, W = orig_hw
+    x = packed.reshape(T, hpat, wpat, p, p, C)
+    x = mx.einsum("thwpqc->cthpwq", x).reshape(C, T, hpat * p, wpat * p)
+    return x[:, :, :H, :W]
+def scatter_timestep_single(tokens, t_embed, n_noisy_tokens):  # NOTE: updates `tokens` in place and also returns it
+    """t2i / all-noisy single-item case: add the (broadcast) timestep embed to the first
+    n_noisy_tokens rows. General multi-frame scatter handled in the pipeline layer."""
+    if t_embed.ndim == 1:
+        t_embed = mx.broadcast_to(t_embed[None, :], (n_noisy_tokens, tokens.shape[1]))  # one embedding vector repeated for every noisy row
+    tokens[:n_noisy_tokens] = tokens[:n_noisy_tokens] + t_embed  # rows past n_noisy_tokens are left unchanged
+    return tokens

mlx_pipeline.py ADDED Viewed

	@@ -0,0 +1,113 @@

+"""End-to-end text2image with the MLX 4-bit transformer + torch pipeline orchestration.
+A torch nn.Module wrapper routes the transformer forward to MLX; everything else (tokenizer,
+UniPC scheduler, CFG, VAE decode) stays in torch (small, fits RAM). The 33GB torch transformer
+is never loaded."""
+import glob, json, sys, time
+import numpy as np, torch
+import mlx.core as mx
+from types import SimpleNamespace
+sys.path.insert(0, "/Users/studio/cosmos_mlx/work")
+import mlx_cosmos3 as M
+NANO = "/Users/studio/cosmos_mlx/models/Cosmos3-Nano"  # NOTE(review): machine-specific path to the base checkpoint; used for transformer/config.json and the __main__ demo
+EXPORT = "/Users/studio/cosmos_mlx/export/Cosmos3-Nano-MLX-4bit/transformer"  # NOTE(review): machine-specific path to the MLX weight shards; used only by the __main__ demo
+HID, HD, NH, NKV, NL, EPS = 4096, 128, 32, 8, 36, 1e-6  # hidden width, head dim, query heads, kv heads, layer count, RMSNorm eps
+P_PATCH, C_LAT, AXES, THETA, TS_SCALE = 2, 48, [24, 20, 20], 5e6, 0.001  # patch size, latent channels, RoPE per-axis dims, RoPE theta, timestep multiplier
+def _t2m(t):  # torch -> mlx
+    return mx.array(t.detach().to(torch.float32).cpu().numpy())
+def _m2t(a, dtype=torch.bfloat16):
+    return torch.from_numpy(np.array(a.astype(mx.float32))).to(dtype)
+class MLXCosmos3Transformer(torch.nn.Module):
+    def __init__(self, export_dir):
+        super().__init__()
+        self.W = {}
+        for f in sorted(glob.glob(export_dir + "/*.safetensors")):
+            self.W.update(mx.load(f))
+        cfgd = json.load(open(NANO + "/transformer/config.json"))
+        cfgd = {k: v for k, v in cfgd.items() if not k.startswith("_")}
+        self.config = SimpleNamespace(**cfgd)  # real config -> all fields the pipeline reads
+        qc = json.load(open(export_dir + "/mlx_quant_config.json"))
+        M.QGROUP, M.QBITS = qc.get("group_size", 64), qc.get("bits", 4)  # 4 or 8 bit
+        self._dbg = True
+        self._dtype = torch.bfloat16
+    @property
+    def dtype(self): return self._dtype
+    @property
+    def device(self): return torch.device("cpu")
+    def to(self, *a, **k): return self
+    def eval(self): return self
+    def _lp(self, i):
+        pre = f"layers.{i}."; P = {}
+        for k in self.W:
+            if not k.startswith(pre) or k.endswith(".scales") or k.endswith(".biases"): continue
+            n = k[len(pre):]
+            P[n] = (self.W[k], self.W[k + ".scales"], self.W[k + ".biases"]) if k + ".scales" in self.W else self.W[k]
+        return P
+    def _gv(self, n):
+        return (self.W[n], self.W[n + ".scales"], self.W[n + ".biases"]) if n + ".scales" in self.W else self.W[n]
+    @torch.no_grad()
+    def forward(self, input_ids, text_indexes, position_ids, und_len, sequence_length,
+                vision_tokens, vision_token_shapes, vision_sequence_indexes, vision_mse_loss_indexes,
+                vision_timesteps, vision_noisy_frame_indexes, **sound_kw):
+        W = self.W
+        ii = mx.array(input_ids.cpu().numpy().astype(np.int32))
+        ti = mx.array(text_indexes.cpu().numpy().astype(np.int32))
+        vsi = mx.array(vision_sequence_indexes.cpu().numpy().astype(np.int32))
+        vmi = mx.array(vision_mse_loss_indexes.cpu().numpy().astype(np.int32))
+        pid = mx.array(position_ids.cpu().numpy().astype(np.int32))
+        latent = _t2m(vision_tokens[0]).reshape(C_LAT, *vision_tokens[0].shape[-3:])  # [C,T,H,W]
+        H, Wd = int(latent.shape[-2]), int(latent.shape[-1])
+        if getattr(self, "_dbg", False):
+            print(f"[wrapper] vision_tokens[0].shape={tuple(vision_tokens[0].shape)} latent T={latent.shape[1]} "
+                  f"seq_len={sequence_length} und_len={und_len} mse_idx={vision_mse_loss_indexes.shape} "
+                  f"token_shapes={vision_token_shapes} noisy={[ (x.tolist() if hasattr(x,'tolist') else x) for x in vision_noisy_frame_indexes]}", flush=True)
+            self._dbg = False
+        tstep = float(vision_timesteps[0].item())
+        emb = W["embed_tokens.weight"][ii]
+        hidden = mx.zeros((sequence_length, HID), dtype=emb.dtype)
+        hidden[ti] = emb
+        packed, shape = M.patchify_pack(latent, P_PATCH, C_LAT)
+        packed = M.linear(packed.astype(emb.dtype), self._gv("proj_in.weight"), W["proj_in.bias"])
+        te = M.get_timestep_embedding(mx.array([tstep * TS_SCALE]))
+        te = M.timestep_embedder(te, W["time_embedder.linear_1.weight"], W["time_embedder.linear_1.bias"],
+                                 W["time_embedder.linear_2.weight"], W["time_embedder.linear_2.bias"])[0].astype(emb.dtype)
+        packed = M.scatter_timestep_single(packed, te, packed.shape[0])   # t2i: all vision tokens noisy
+        hidden[vsi] = packed
+        cos, sin = M.RoPE3D(HD, THETA, AXES)(pid)
+        cos = cos.astype(emb.dtype); sin = sin.astype(emb.dtype)
+        und, gen = hidden[:und_len], hidden[und_len:]
+        rope = (cos[:und_len], sin[:und_len], cos[und_len:], sin[und_len:])
+        for i in range(NL):
+            und, gen = M.decoder_layer(und, gen, rope, self._lp(i), (NH, NKV, HD, EPS)); mx.eval(und, gen)
+        und = M.rms_norm(und, W["norm.weight"], EPS); gen = M.rms_norm(gen, W["norm_moe_gen.weight"], EPS)
+        last = mx.concatenate([und, gen], axis=0)
+        preds = M.linear(last[vmi], self._gv("proj_out.weight"), W["proj_out.bias"])
+        out = M.unpatchify(preds, shape, (H, Wd), P_PATCH, C_LAT); mx.eval(out)
+        return [_m2t(out, vision_tokens[0].dtype).unsqueeze(0)], None
+if __name__ == "__main__":  # demo: generate one 256x256 image with the MLX transformer and save it
+    from diffusers import Cosmos3OmniPipeline, AutoencoderKLWan, UniPCMultistepScheduler
+    from diffusers.models.autoencoders.autoencoder_cosmos3_audio import Cosmos3AVAEAudioTokenizer
+    from transformers import AutoTokenizer
+    dev = "cpu"  # torch-side components stay on CPU
+    print("loading components (no torch transformer)...")
+    vae = AutoencoderKLWan.from_pretrained(NANO, subfolder="vae", torch_dtype=torch.float32).to(dev).eval()
+    sched = UniPCMultistepScheduler.from_pretrained(NANO, subfolder="scheduler")
+    tok = AutoTokenizer.from_pretrained(NANO, subfolder="text_tokenizer")
+    st = Cosmos3AVAEAudioTokenizer.from_pretrained(NANO, subfolder="sound_tokenizer", torch_dtype=torch.float32).to(dev).eval()
+    tf = MLXCosmos3Transformer(EXPORT)  # MLX wrapper passed to the pipeline as its transformer
+    pipe = Cosmos3OmniPipeline(transformer=tf, text_tokenizer=tok, vae=vae, scheduler=sched,
+                               sound_tokenizer=st, enable_safety_checker=False)
+    print("generating (MLX 4-bit transformer)...")
+    t0 = time.time()
+    out = pipe(prompt="A red panda astronaut floating in a nebula, highly detailed", num_frames=1,
+               height=256, width=256, num_inference_steps=20, generator=torch.Generator().manual_seed(1))  # fixed seed
+    img = out.video[0][0] if isinstance(out.video[0], list) else out.video[0]  # accept either a list of frames or a single image
+    img.save("/Users/studio/cosmos_mlx/work/mlx_t2i.png")  # NOTE(review): machine-specific output path
+    print(f"GENERATED in {time.time()-t0:.0f}s -> mlx_t2i.png ({img.size})")

model_index.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "_class_name": "Cosmos3OmniDiffusersPipeline",
+  "_diffusers_version": "0.37.1",
+  "scheduler": [
+    "diffusers",
+    "UniPCMultistepScheduler"
+  ],
+  "text_tokenizer": [
+    "transformers",
+    "Qwen2TokenizerFast"
+  ],
+  "transformer": [
+    "diffusers",
+    "Cosmos3OmniTransformer"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKLWan"
+  ],
+  "vision_encoder": [
+    "transformers",
+    "Qwen3VLVisionModel"
+  ],
+  "sound_tokenizer": [
+    "diffusers",
+    "Cosmos3AVAEAudioTokenizer"
+  ]
+}

samples/anime.png ADDED Viewed

Git LFS Details

SHA256: 7a051daf2c86da6cf3d0f4770746a45270d29640905157c52c2e7a6902504d47
Pointer size: 131 Bytes
Size of remote file: 259 kB

samples/barista.png ADDED Viewed

Git LFS Details

SHA256: 82a672bb4c89e6c69c4f4d26e952a13e4d3030b878b91c63fd2a2917e2bd9a59
Pointer size: 131 Bytes
Size of remote file: 212 kB

samples/city.png ADDED Viewed

Git LFS Details

SHA256: 0db0015f32883f0bb27766515a25d108883ef4d3e112ffa0513ef220f1294403
Pointer size: 131 Bytes
Size of remote file: 284 kB

samples/food.png ADDED Viewed

Git LFS Details

SHA256: d9c2957b59170cb39bf3517bea7e524c1b10390acbee3a8245edf6aa0491b917
Pointer size: 131 Bytes
Size of remote file: 205 kB

samples/panda.png ADDED Viewed

Git LFS Details

SHA256: 8a81d1b9113d2aab697b686c2cc8539fa05c4e00c092c00ec82622c99d91ef21
Pointer size: 131 Bytes
Size of remote file: 206 kB

samples/portrait.png ADDED Viewed

Git LFS Details

SHA256: d83e3381364fc4d207bee74d4295cd5a530e4c6392e4a53b98d299bb055bda10
Pointer size: 131 Bytes
Size of remote file: 261 kB

samples/t2v_f0.png ADDED Viewed

samples/t2v_f16.png ADDED Viewed

Git LFS Details

SHA256: da05696339f74176ca26e3e8e2951134c1ecf82430d3b7e15beac3620fd64895
Pointer size: 131 Bytes
Size of remote file: 109 kB

samples/t2v_f8.png ADDED Viewed

Git LFS Details

SHA256: d2a00bbf8292e18e87e622e3f82f711743e3a4ba963451e722e949b46ba36574
Pointer size: 131 Bytes
Size of remote file: 121 kB

samples/t2v_waves.mp4 ADDED Viewed

Binary file (98.1 kB). View file

scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_class_name": "UniPCMultistepScheduler",
+  "_diffusers_version": "0.37.1",
+  "beta_end": 0.02,
+  "beta_schedule": "linear",
+  "beta_start": 0.0001,
+  "disable_corrector": [],
+  "dynamic_thresholding_ratio": 0.995,
+  "final_sigmas_type": "zero",
+  "flow_shift": 1.0,
+  "lower_order_final": true,
+  "num_train_timesteps": 1000,
+  "predict_x0": true,
+  "prediction_type": "flow_prediction",
+  "rescale_betas_zero_snr": false,
+  "sample_max_value": 1.0,
+  "shift_terminal": null,
+  "sigma_max": 200.0,
+  "sigma_min": 0.147,
+  "solver_order": 2,
+  "solver_p": null,
+  "solver_type": "bh2",
+  "steps_offset": 0,
+  "thresholding": false,
+  "time_shift_type": "exponential",
+  "timestep_spacing": "linspace",
+  "trained_betas": null,
+  "use_beta_sigmas": false,
+  "use_dynamic_shifting": false,
+  "use_exponential_sigmas": false,
+  "use_flow_sigmas": true,
+  "use_karras_sigmas": true
+}

sound_tokenizer/config.json ADDED Viewed

	@@ -0,0 +1,64 @@

+{
+    "model_type": "autoencoder_v2",
+    "sampling_rate": 48000,
+    "stereo": true,
+    "use_wav_as_input": true,
+    "normalize_volume": true,
+    "hop_size": 1920,
+    "input_channels": 1,
+    "enc_type": "spec_convnext",
+    "enc_dim": 192,
+    "enc_intermediate_dim": 768,
+    "enc_num_layers": 12,
+    "enc_num_blocks": 2,
+    "enc_n_fft": 64,
+    "enc_hop_length": 16,
+    "enc_latent_dim": 128,
+    "enc_c_mults": [
+        1,
+        2,
+        4
+    ],
+    "enc_strides": [
+        4,
+        5,
+        6
+    ],
+    "enc_identity_init": false,
+    "enc_use_snake": true,
+    "dec_type": "oobleck",
+    "dec_dim": 320,
+    "dec_c_mults": [
+        1,
+        2,
+        4,
+        8,
+        16
+    ],
+    "dec_strides": [
+        2,
+        4,
+        5,
+        6,
+        8
+    ],
+    "dec_use_snake": true,
+    "dec_final_tanh": false,
+    "dec_out_channels": 2,
+    "dec_anti_aliasing": false,
+    "dec_use_nearest_upsample": false,
+    "dec_use_tanh_at_final": false,
+    "bottleneck_type": "vae",
+    "bottleneck": {
+        "type": "vae"
+    },
+    "activation": "snakebeta",
+    "snake_logscale": true,
+    "anti_aliasing": false,
+    "use_cuda_kernel": false,
+    "causal": false,
+    "padding_mode": "zeros",
+    "vocoder_input_dim": 64,
+    "latent_mean": null,
+    "latent_std": null
+}

sound_tokenizer/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d4c61cde38acfb0cad9048a140c3533750277a8462b19dc08450d9fe1ad9879
+size 1892409600

text_tokenizer/added_tokens.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

text_tokenizer/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,120 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {%- if messages[0].content is string %}
+            {{- messages[0].content }}
+        {%- else %}
+            {%- for content in messages[0].content %}
+                {%- if 'text' in content %}
+                    {{- content.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' }}
+        {%- if messages[0].content is string %}
+            {{- messages[0].content }}
+        {%- else %}
+            {%- for content in messages[0].content %}
+                {%- if 'text' in content %}
+                    {{- content.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set image_count = namespace(value=0) %}
+{%- set video_count = namespace(value=0) %}
+{%- for message in messages %}
+    {%- if message.role == "user" %}
+        {{- '<|im_start|>' + message.role + '\n' }}
+        {%- if message.content is string %}
+            {{- message.content }}
+        {%- else %}
+            {%- for content in message.content %}
+                {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
+                    {%- set image_count.value = image_count.value + 1 %}
+                    {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
+                    <|vision_start|><|image_pad|><|vision_end|>
+                {%- elif content.type == 'video' or 'video' in content %}
+                    {%- set video_count.value = video_count.value + 1 %}
+                    {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
+                    <|vision_start|><|video_pad|><|vision_end|>
+                {%- elif 'text' in content %}
+                    {{- content.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role + '\n' }}
+        {%- if message.content is string %}
+            {{- message.content }}
+        {%- else %}
+            {%- for content_item in message.content %}
+                {%- if 'text' in content_item %}
+                    {{- content_item.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and message.content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {%- if message.content is string %}
+            {{- message.content }}
+        {%- else %}
+            {%- for content in message.content %}
+                {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
+                    {%- set image_count.value = image_count.value + 1 %}
+                    {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
+                    <|vision_start|><|image_pad|><|vision_end|>
+                {%- elif content.type == 'video' or 'video' in content %}
+                    {%- set video_count.value = video_count.value + 1 %}
+                    {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
+                    <|vision_start|><|video_pad|><|vision_end|>
+                {%- elif 'text' in content %}
+                    {{- content.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

text_tokenizer/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

text_tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

text_tokenizer/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
+size 11422654

text_tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,239 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 262144,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

text_tokenizer/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

transformer/mlx_quant_config.json ADDED Viewed

	@@ -0,0 +1,474 @@

+{
+  "group_size": 64,
+  "bits": 4,
+  "quantized": [
+    "layers.0.mlp.down_proj.weight",
+    "layers.0.mlp.gate_proj.weight",
+    "layers.0.mlp.up_proj.weight",
+    "layers.0.mlp_moe_gen.down_proj.weight",
+    "layers.0.mlp_moe_gen.gate_proj.weight",
+    "layers.0.mlp_moe_gen.up_proj.weight",
+    "layers.0.self_attn.add_k_proj.weight",
+    "layers.0.self_attn.add_q_proj.weight",
+    "layers.0.self_attn.add_v_proj.weight",
+    "layers.0.self_attn.to_k.weight",
+    "layers.0.self_attn.to_out.weight",
+    "layers.0.self_attn.to_q.weight",
+    "layers.0.self_attn.to_v.weight",
+    "layers.1.mlp.down_proj.weight",
+    "layers.1.mlp.gate_proj.weight",
+    "layers.1.mlp.up_proj.weight",
+    "layers.1.mlp_moe_gen.down_proj.weight",
+    "layers.1.mlp_moe_gen.gate_proj.weight",
+    "layers.1.mlp_moe_gen.up_proj.weight",
+    "layers.1.self_attn.add_k_proj.weight",
+    "layers.1.self_attn.add_q_proj.weight",
+    "layers.1.self_attn.add_v_proj.weight",
+    "layers.1.self_attn.to_k.weight",
+    "layers.1.self_attn.to_out.weight",
+    "layers.1.self_attn.to_q.weight",
+    "layers.1.self_attn.to_v.weight",
+    "layers.2.mlp.down_proj.weight",
+    "layers.2.mlp.gate_proj.weight",
+    "layers.2.mlp.up_proj.weight",
+    "layers.2.mlp_moe_gen.down_proj.weight",
+    "layers.2.mlp_moe_gen.gate_proj.weight",
+    "layers.2.mlp_moe_gen.up_proj.weight",
+    "layers.2.self_attn.add_k_proj.weight",
+    "layers.2.self_attn.add_q_proj.weight",
+    "layers.2.self_attn.add_v_proj.weight",
+    "layers.2.self_attn.to_k.weight",
+    "layers.2.self_attn.to_out.weight",
+    "layers.2.self_attn.to_q.weight",
+    "layers.2.self_attn.to_v.weight",
+    "layers.3.mlp.down_proj.weight",
+    "layers.3.mlp.gate_proj.weight",
+    "layers.3.mlp.up_proj.weight",
+    "layers.3.mlp_moe_gen.down_proj.weight",
+    "layers.3.mlp_moe_gen.gate_proj.weight",
+    "layers.3.mlp_moe_gen.up_proj.weight",
+    "layers.3.self_attn.add_k_proj.weight",
+    "layers.3.self_attn.add_q_proj.weight",
+    "layers.3.self_attn.add_v_proj.weight",
+    "layers.3.self_attn.to_k.weight",
+    "layers.3.self_attn.to_out.weight",
+    "layers.3.self_attn.to_q.weight",
+    "layers.3.self_attn.to_v.weight",
+    "layers.4.mlp.down_proj.weight",
+    "layers.4.mlp.gate_proj.weight",
+    "layers.4.mlp.up_proj.weight",
+    "layers.4.mlp_moe_gen.gate_proj.weight",
+    "layers.4.self_attn.add_k_proj.weight",
+    "layers.4.self_attn.add_q_proj.weight",
+    "layers.4.self_attn.add_v_proj.weight",
+    "layers.4.self_attn.to_k.weight",
+    "layers.4.self_attn.to_out.weight",
+    "layers.4.self_attn.to_q.weight",
+    "layers.4.self_attn.to_v.weight",
+    "layers.10.mlp.down_proj.weight",
+    "layers.10.mlp.gate_proj.weight",
+    "layers.10.mlp.up_proj.weight",
+    "layers.10.mlp_moe_gen.down_proj.weight",
+    "layers.10.mlp_moe_gen.gate_proj.weight",
+    "layers.10.mlp_moe_gen.up_proj.weight",
+    "layers.10.self_attn.add_k_proj.weight",
+    "layers.10.self_attn.add_q_proj.weight",
+    "layers.10.self_attn.add_v_proj.weight",
+    "layers.10.self_attn.to_k.weight",
+    "layers.10.self_attn.to_out.weight",
+    "layers.10.self_attn.to_q.weight",
+    "layers.10.self_attn.to_v.weight",
+    "layers.11.self_attn.add_k_proj.weight",
+    "layers.11.self_attn.add_q_proj.weight",
+    "layers.11.self_attn.add_v_proj.weight",
+    "layers.11.self_attn.to_k.weight",
+    "layers.11.self_attn.to_out.weight",
+    "layers.11.self_attn.to_q.weight",
+    "layers.11.self_attn.to_v.weight",
+    "layers.4.mlp_moe_gen.down_proj.weight",
+    "layers.4.mlp_moe_gen.up_proj.weight",
+    "layers.5.mlp.down_proj.weight",
+    "layers.5.mlp.gate_proj.weight",
+    "layers.5.mlp.up_proj.weight",
+    "layers.5.mlp_moe_gen.down_proj.weight",
+    "layers.5.mlp_moe_gen.gate_proj.weight",
+    "layers.5.mlp_moe_gen.up_proj.weight",
+    "layers.5.self_attn.add_k_proj.weight",
+    "layers.5.self_attn.add_q_proj.weight",
+    "layers.5.self_attn.add_v_proj.weight",
+    "layers.5.self_attn.to_k.weight",
+    "layers.5.self_attn.to_out.weight",
+    "layers.5.self_attn.to_q.weight",
+    "layers.5.self_attn.to_v.weight",
+    "layers.6.mlp.down_proj.weight",
+    "layers.6.mlp.gate_proj.weight",
+    "layers.6.mlp.up_proj.weight",
+    "layers.6.mlp_moe_gen.down_proj.weight",
+    "layers.6.mlp_moe_gen.gate_proj.weight",
+    "layers.6.mlp_moe_gen.up_proj.weight",
+    "layers.6.self_attn.add_k_proj.weight",
+    "layers.6.self_attn.add_q_proj.weight",
+    "layers.6.self_attn.add_v_proj.weight",
+    "layers.6.self_attn.to_k.weight",
+    "layers.6.self_attn.to_out.weight",
+    "layers.6.self_attn.to_q.weight",
+    "layers.6.self_attn.to_v.weight",
+    "layers.7.mlp.down_proj.weight",
+    "layers.7.mlp.gate_proj.weight",
+    "layers.7.mlp.up_proj.weight",
+    "layers.7.mlp_moe_gen.down_proj.weight",
+    "layers.7.mlp_moe_gen.gate_proj.weight",
+    "layers.7.mlp_moe_gen.up_proj.weight",
+    "layers.7.self_attn.add_k_proj.weight",
+    "layers.7.self_attn.add_q_proj.weight",
+    "layers.7.self_attn.add_v_proj.weight",
+    "layers.7.self_attn.to_k.weight",
+    "layers.7.self_attn.to_out.weight",
+    "layers.7.self_attn.to_q.weight",
+    "layers.7.self_attn.to_v.weight",
+    "layers.8.mlp.down_proj.weight",
+    "layers.8.mlp.gate_proj.weight",
+    "layers.8.mlp.up_proj.weight",
+    "layers.8.mlp_moe_gen.down_proj.weight",
+    "layers.8.mlp_moe_gen.gate_proj.weight",
+    "layers.8.mlp_moe_gen.up_proj.weight",
+    "layers.8.self_attn.add_k_proj.weight",
+    "layers.8.self_attn.add_q_proj.weight",
+    "layers.8.self_attn.add_v_proj.weight",
+    "layers.8.self_attn.to_k.weight",
+    "layers.8.self_attn.to_out.weight",
+    "layers.8.self_attn.to_q.weight",
+    "layers.8.self_attn.to_v.weight",
+    "layers.9.mlp.down_proj.weight",
+    "layers.9.mlp.gate_proj.weight",
+    "layers.9.mlp.up_proj.weight",
+    "layers.9.mlp_moe_gen.down_proj.weight",
+    "layers.9.mlp_moe_gen.gate_proj.weight",
+    "layers.9.mlp_moe_gen.up_proj.weight",
+    "layers.9.self_attn.add_k_proj.weight",
+    "layers.9.self_attn.add_q_proj.weight",
+    "layers.9.self_attn.add_v_proj.weight",
+    "layers.9.self_attn.to_k.weight",
+    "layers.9.self_attn.to_out.weight",
+    "layers.9.self_attn.to_q.weight",
+    "layers.9.self_attn.to_v.weight",
+    "layers.11.mlp.down_proj.weight",
+    "layers.11.mlp.gate_proj.weight",
+    "layers.11.mlp.up_proj.weight",
+    "layers.11.mlp_moe_gen.down_proj.weight",
+    "layers.11.mlp_moe_gen.gate_proj.weight",
+    "layers.11.mlp_moe_gen.up_proj.weight",
+    "layers.12.mlp.down_proj.weight",
+    "layers.12.mlp.gate_proj.weight",
+    "layers.12.mlp.up_proj.weight",
+    "layers.12.mlp_moe_gen.down_proj.weight",
+    "layers.12.mlp_moe_gen.gate_proj.weight",
+    "layers.12.mlp_moe_gen.up_proj.weight",
+    "layers.12.self_attn.add_k_proj.weight",
+    "layers.12.self_attn.add_q_proj.weight",
+    "layers.12.self_attn.add_v_proj.weight",
+    "layers.12.self_attn.to_k.weight",
+    "layers.12.self_attn.to_out.weight",
+    "layers.12.self_attn.to_q.weight",
+    "layers.12.self_attn.to_v.weight",
+    "layers.13.mlp.down_proj.weight",
+    "layers.13.mlp.gate_proj.weight",
+    "layers.13.mlp.up_proj.weight",
+    "layers.13.mlp_moe_gen.down_proj.weight",
+    "layers.13.mlp_moe_gen.gate_proj.weight",
+    "layers.13.mlp_moe_gen.up_proj.weight",
+    "layers.13.self_attn.add_k_proj.weight",
+    "layers.13.self_attn.add_q_proj.weight",
+    "layers.13.self_attn.add_v_proj.weight",
+    "layers.13.self_attn.to_k.weight",
+    "layers.13.self_attn.to_out.weight",
+    "layers.13.self_attn.to_q.weight",
+    "layers.13.self_attn.to_v.weight",
+    "layers.14.mlp.down_proj.weight",
+    "layers.14.mlp.gate_proj.weight",
+    "layers.14.mlp.up_proj.weight",
+    "layers.14.mlp_moe_gen.down_proj.weight",
+    "layers.14.mlp_moe_gen.gate_proj.weight",
+    "layers.14.mlp_moe_gen.up_proj.weight",
+    "layers.14.self_attn.add_k_proj.weight",
+    "layers.14.self_attn.add_q_proj.weight",
+    "layers.14.self_attn.add_v_proj.weight",
+    "layers.14.self_attn.to_k.weight",
+    "layers.14.self_attn.to_out.weight",
+    "layers.14.self_attn.to_q.weight",
+    "layers.14.self_attn.to_v.weight",
+    "layers.15.mlp.down_proj.weight",
+    "layers.15.mlp.gate_proj.weight",
+    "layers.15.mlp.up_proj.weight",
+    "layers.15.mlp_moe_gen.down_proj.weight",
+    "layers.15.mlp_moe_gen.gate_proj.weight",
+    "layers.15.mlp_moe_gen.up_proj.weight",
+    "layers.15.self_attn.add_k_proj.weight",
+    "layers.15.self_attn.add_q_proj.weight",
+    "layers.15.self_attn.add_v_proj.weight",
+    "layers.15.self_attn.to_k.weight",
+    "layers.15.self_attn.to_out.weight",
+    "layers.15.self_attn.to_q.weight",
+    "layers.15.self_attn.to_v.weight",
+    "layers.16.mlp.down_proj.weight",
+    "layers.16.mlp.gate_proj.weight",
+    "layers.16.mlp.up_proj.weight",
+    "layers.16.mlp_moe_gen.down_proj.weight",
+    "layers.16.mlp_moe_gen.gate_proj.weight",
+    "layers.16.mlp_moe_gen.up_proj.weight",
+    "layers.16.self_attn.add_k_proj.weight",
+    "layers.16.self_attn.add_q_proj.weight",
+    "layers.16.self_attn.add_v_proj.weight",
+    "layers.16.self_attn.to_k.weight",
+    "layers.16.self_attn.to_out.weight",
+    "layers.16.self_attn.to_q.weight",
+    "layers.16.self_attn.to_v.weight",
+    "layers.17.mlp.down_proj.weight",
+    "layers.17.mlp.gate_proj.weight",
+    "layers.17.mlp.up_proj.weight",
+    "layers.17.self_attn.add_k_proj.weight",
+    "layers.17.self_attn.add_q_proj.weight",
+    "layers.17.self_attn.add_v_proj.weight",
+    "layers.17.self_attn.to_k.weight",
+    "layers.17.self_attn.to_out.weight",
+    "layers.17.self_attn.to_q.weight",
+    "layers.17.self_attn.to_v.weight",
+    "layers.17.mlp_moe_gen.down_proj.weight",
+    "layers.17.mlp_moe_gen.gate_proj.weight",
+    "layers.17.mlp_moe_gen.up_proj.weight",
+    "layers.18.mlp.down_proj.weight",
+    "layers.18.mlp.gate_proj.weight",
+    "layers.18.mlp.up_proj.weight",
+    "layers.18.mlp_moe_gen.down_proj.weight",
+    "layers.18.mlp_moe_gen.gate_proj.weight",
+    "layers.18.mlp_moe_gen.up_proj.weight",
+    "layers.18.self_attn.add_k_proj.weight",
+    "layers.18.self_attn.add_q_proj.weight",
+    "layers.18.self_attn.add_v_proj.weight",
+    "layers.18.self_attn.to_k.weight",
+    "layers.18.self_attn.to_out.weight",
+    "layers.18.self_attn.to_q.weight",
+    "layers.18.self_attn.to_v.weight",
+    "layers.19.mlp.down_proj.weight",
+    "layers.19.mlp.gate_proj.weight",
+    "layers.19.mlp.up_proj.weight",
+    "layers.19.mlp_moe_gen.down_proj.weight",
+    "layers.19.mlp_moe_gen.gate_proj.weight",
+    "layers.19.mlp_moe_gen.up_proj.weight",
+    "layers.19.self_attn.add_k_proj.weight",
+    "layers.19.self_attn.add_q_proj.weight",
+    "layers.19.self_attn.add_v_proj.weight",
+    "layers.19.self_attn.to_k.weight",
+    "layers.19.self_attn.to_out.weight",
+    "layers.19.self_attn.to_q.weight",
+    "layers.19.self_attn.to_v.weight",
+    "layers.20.mlp.down_proj.weight",
+    "layers.20.mlp.gate_proj.weight",
+    "layers.20.mlp.up_proj.weight",
+    "layers.20.mlp_moe_gen.down_proj.weight",
+    "layers.20.mlp_moe_gen.gate_proj.weight",
+    "layers.20.mlp_moe_gen.up_proj.weight",
+    "layers.20.self_attn.add_k_proj.weight",
+    "layers.20.self_attn.add_q_proj.weight",
+    "layers.20.self_attn.add_v_proj.weight",
+    "layers.20.self_attn.to_k.weight",
+    "layers.20.self_attn.to_out.weight",
+    "layers.20.self_attn.to_q.weight",
+    "layers.20.self_attn.to_v.weight",
+    "layers.21.mlp.down_proj.weight",
+    "layers.21.mlp.gate_proj.weight",
+    "layers.21.mlp.up_proj.weight",
+    "layers.21.mlp_moe_gen.down_proj.weight",
+    "layers.21.mlp_moe_gen.gate_proj.weight",
+    "layers.21.mlp_moe_gen.up_proj.weight",
+    "layers.21.self_attn.add_k_proj.weight",
+    "layers.21.self_attn.add_q_proj.weight",
+    "layers.21.self_attn.add_v_proj.weight",
+    "layers.21.self_attn.to_k.weight",
+    "layers.21.self_attn.to_out.weight",
+    "layers.21.self_attn.to_q.weight",
+    "layers.21.self_attn.to_v.weight",
+    "layers.22.mlp.down_proj.weight",
+    "layers.22.mlp.gate_proj.weight",
+    "layers.22.mlp.up_proj.weight",
+    "layers.22.mlp_moe_gen.down_proj.weight",
+    "layers.22.mlp_moe_gen.gate_proj.weight",
+    "layers.22.mlp_moe_gen.up_proj.weight",
+    "layers.22.self_attn.add_k_proj.weight",
+    "layers.22.self_attn.add_q_proj.weight",
+    "layers.22.self_attn.add_v_proj.weight",
+    "layers.22.self_attn.to_k.weight",
+    "layers.22.self_attn.to_out.weight",
+    "layers.22.self_attn.to_q.weight",
+    "layers.22.self_attn.to_v.weight",
+    "layers.23.mlp.down_proj.weight",
+    "layers.23.mlp.gate_proj.weight",
+    "layers.23.mlp.up_proj.weight",
+    "layers.23.mlp_moe_gen.down_proj.weight",
+    "layers.23.mlp_moe_gen.gate_proj.weight",
+    "layers.23.mlp_moe_gen.up_proj.weight",
+    "layers.23.self_attn.add_k_proj.weight",
+    "layers.23.self_attn.add_q_proj.weight",
+    "layers.23.self_attn.add_v_proj.weight",
+    "layers.23.self_attn.to_k.weight",
+    "layers.23.self_attn.to_out.weight",
+    "layers.23.self_attn.to_q.weight",
+    "layers.23.self_attn.to_v.weight",
+    "layers.24.self_attn.to_k.weight",
+    "layers.24.self_attn.to_q.weight",
+    "layers.24.self_attn.to_v.weight",
+    "layers.24.mlp.down_proj.weight",
+    "layers.24.mlp.gate_proj.weight",
+    "layers.24.mlp.up_proj.weight",
+    "layers.24.mlp_moe_gen.down_proj.weight",
+    "layers.24.mlp_moe_gen.gate_proj.weight",
+    "layers.24.mlp_moe_gen.up_proj.weight",
+    "layers.24.self_attn.add_k_proj.weight",
+    "layers.24.self_attn.add_q_proj.weight",
+    "layers.24.self_attn.add_v_proj.weight",
+    "layers.24.self_attn.to_out.weight",
+    "layers.25.mlp.down_proj.weight",
+    "layers.25.mlp.gate_proj.weight",
+    "layers.25.mlp.up_proj.weight",
+    "layers.25.mlp_moe_gen.down_proj.weight",
+    "layers.25.mlp_moe_gen.gate_proj.weight",
+    "layers.25.mlp_moe_gen.up_proj.weight",
+    "layers.25.self_attn.add_k_proj.weight",
+    "layers.25.self_attn.add_q_proj.weight",
+    "layers.25.self_attn.add_v_proj.weight",
+    "layers.25.self_attn.to_k.weight",
+    "layers.25.self_attn.to_out.weight",
+    "layers.25.self_attn.to_q.weight",
+    "layers.25.self_attn.to_v.weight",
+    "layers.26.mlp.down_proj.weight",
+    "layers.26.mlp.gate_proj.weight",
+    "layers.26.mlp.up_proj.weight",
+    "layers.26.mlp_moe_gen.down_proj.weight",
+    "layers.26.mlp_moe_gen.gate_proj.weight",
+    "layers.26.mlp_moe_gen.up_proj.weight",
+    "layers.26.self_attn.add_k_proj.weight",
+    "layers.26.self_attn.add_q_proj.weight",
+    "layers.26.self_attn.add_v_proj.weight",
+    "layers.26.self_attn.to_k.weight",
+    "layers.26.self_attn.to_out.weight",
+    "layers.26.self_attn.to_q.weight",
+    "layers.26.self_attn.to_v.weight",
+    "layers.27.mlp.down_proj.weight",
+    "layers.27.mlp.gate_proj.weight",
+    "layers.27.mlp.up_proj.weight",
+    "layers.27.mlp_moe_gen.down_proj.weight",
+    "layers.27.mlp_moe_gen.gate_proj.weight",
+    "layers.27.mlp_moe_gen.up_proj.weight",
+    "layers.27.self_attn.add_k_proj.weight",
+    "layers.27.self_attn.add_q_proj.weight",
+    "layers.27.self_attn.add_v_proj.weight",
+    "layers.27.self_attn.to_k.weight",
+    "layers.27.self_attn.to_out.weight",
+    "layers.27.self_attn.to_q.weight",
+    "layers.27.self_attn.to_v.weight",
+    "layers.28.mlp.down_proj.weight",
+    "layers.28.mlp.gate_proj.weight",
+    "layers.28.mlp.up_proj.weight",
+    "layers.28.mlp_moe_gen.down_proj.weight",
+    "layers.28.mlp_moe_gen.gate_proj.weight",
+    "layers.28.mlp_moe_gen.up_proj.weight",
+    "layers.28.self_attn.add_k_proj.weight",
+    "layers.28.self_attn.add_q_proj.weight",
+    "layers.28.self_attn.add_v_proj.weight",
+    "layers.28.self_attn.to_k.weight",
+    "layers.28.self_attn.to_out.weight",
+    "layers.28.self_attn.to_q.weight",
+    "layers.28.self_attn.to_v.weight",
+    "layers.29.mlp.down_proj.weight",
+    "layers.29.mlp.gate_proj.weight",
+    "layers.29.mlp.up_proj.weight",
+    "layers.29.mlp_moe_gen.down_proj.weight",
+    "layers.29.mlp_moe_gen.gate_proj.weight",
+    "layers.29.mlp_moe_gen.up_proj.weight",
+    "layers.29.self_attn.add_k_proj.weight",
+    "layers.29.self_attn.add_q_proj.weight",
+    "layers.29.self_attn.add_v_proj.weight",
+    "layers.29.self_attn.to_k.weight",
+    "layers.29.self_attn.to_out.weight",
+    "layers.29.self_attn.to_q.weight",
+    "layers.29.self_attn.to_v.weight",
+    "layers.30.mlp.gate_proj.weight",
+    "layers.30.mlp.up_proj.weight",
+    "layers.30.self_attn.add_k_proj.weight",
+    "layers.30.self_attn.add_q_proj.weight",
+    "layers.30.self_attn.add_v_proj.weight",
+    "layers.30.self_attn.to_k.weight",
+    "layers.30.self_attn.to_out.weight",
+    "layers.30.self_attn.to_q.weight",
+    "layers.30.self_attn.to_v.weight",
+    "layers.30.mlp.down_proj.weight",
+    "layers.30.mlp_moe_gen.down_proj.weight",
+    "layers.30.mlp_moe_gen.gate_proj.weight",
+    "layers.30.mlp_moe_gen.up_proj.weight",
+    "layers.31.mlp.down_proj.weight",
+    "layers.31.mlp.gate_proj.weight",
+    "layers.31.mlp.up_proj.weight",
+    "layers.31.mlp_moe_gen.down_proj.weight",
+    "layers.31.mlp_moe_gen.gate_proj.weight",
+    "layers.31.mlp_moe_gen.up_proj.weight",
+    "layers.31.self_attn.add_k_proj.weight",
+    "layers.31.self_attn.add_q_proj.weight",
+    "layers.31.self_attn.add_v_proj.weight",
+    "layers.31.self_attn.to_k.weight",
+    "layers.31.self_attn.to_out.weight",
+    "layers.31.self_attn.to_q.weight",
+    "layers.31.self_attn.to_v.weight",
+    "layers.32.mlp.down_proj.weight",
+    "layers.32.mlp.gate_proj.weight",
+    "layers.32.mlp.up_proj.weight",
+    "layers.32.mlp_moe_gen.down_proj.weight",
+    "layers.32.mlp_moe_gen.gate_proj.weight",
+    "layers.32.mlp_moe_gen.up_proj.weight",
+    "layers.32.self_attn.add_k_proj.weight",
+    "layers.32.self_attn.add_q_proj.weight",
+    "layers.32.self_attn.add_v_proj.weight",
+    "layers.32.self_attn.to_k.weight",
+    "layers.32.self_attn.to_out.weight",
+    "layers.32.self_attn.to_q.weight",
+    "layers.32.self_attn.to_v.weight",
+    "layers.33.mlp.down_proj.weight",
+    "layers.33.mlp.gate_proj.weight",
+    "layers.33.mlp.up_proj.weight",
+    "layers.33.mlp_moe_gen.down_proj.weight",
+    "layers.33.mlp_moe_gen.gate_proj.weight",
+    "layers.33.mlp_moe_gen.up_proj.weight",
+    "layers.33.self_attn.add_k_proj.weight",
+    "layers.33.self_attn.add_q_proj.weight",
+    "layers.33.self_attn.add_v_proj.weight",
+    "layers.33.self_attn.to_k.weight",
+    "layers.33.self_attn.to_out.weight",
+    "layers.33.self_attn.to_q.weight",
+    "layers.33.self_attn.to_v.weight",
+    "layers.34.mlp.down_proj.weight",
+    "layers.34.mlp.gate_proj.weight",
+    "layers.34.mlp.up_proj.weight",
+    "layers.34.mlp_moe_gen.down_proj.weight",
+    "layers.34.mlp_moe_gen.gate_proj.weight",
+    "layers.34.mlp_moe_gen.up_proj.weight",
+    "layers.34.self_attn.add_k_proj.weight",
+    "layers.34.self_attn.add_q_proj.weight",
+    "layers.34.self_attn.add_v_proj.weight",
+    "layers.34.self_attn.to_k.weight",
+    "layers.34.self_attn.to_out.weight",
+    "layers.34.self_attn.to_q.weight",
+    "layers.34.self_attn.to_v.weight",
+    "layers.35.mlp.down_proj.weight",
+    "layers.35.mlp.gate_proj.weight",
+    "layers.35.mlp.up_proj.weight",
+    "layers.35.mlp_moe_gen.down_proj.weight",
+    "layers.35.mlp_moe_gen.gate_proj.weight",
+    "layers.35.mlp_moe_gen.up_proj.weight",
+    "layers.35.self_attn.add_k_proj.weight",
+    "layers.35.self_attn.add_q_proj.weight",
+    "layers.35.self_attn.add_v_proj.weight",
+    "layers.35.self_attn.to_k.weight",
+    "layers.35.self_attn.to_out.weight",
+    "layers.35.self_attn.to_q.weight",
+    "layers.35.self_attn.to_v.weight"
+  ]
+}

transformer/model-00001-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:33aa7326bc74dba9d1420041eb3b0f6e051befac05082053c80f8ecb1c22f90d
+size 2503129397

transformer/model-00002-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dc369f9576c0cadfe691be1399ef580a5d34f9130fd9ec4d2cc765ac43220389
+size 1724131896

transformer/model-00003-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:79e4356afb93a0a5e01d1844b2d7f6ae1e9250555d0ac4a2bfda1ae152959448
+size 1680055054

transformer/model-00004-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:765416dc6fa5c34504b587c8a5da28789d9e62822d275519eb7dde3dbb53bfff
+size 1695818020

transformer/model-00005-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fa8c7cfe7c709e5ac217e56959d45f2813d49a54f0e4d56459055c1da782049a
+size 1708369224

transformer/model-00006-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:29d5b08926ee17dc43270744a77926b929482e9f19b8ba1206b1e18babc073e2
+size 1447282090

transformer/model-00007-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f36f39ad47fd8b0cb2b43d7117cd9d03784d108fb77c9a13474da6184aa0bf08
+size 1318361139

vae/config.json ADDED Viewed

	@@ -0,0 +1,129 @@

+{
+  "_class_name": "AutoencoderKLWan",
+  "_diffusers_version": "0.37.1",
+  "_name_or_path": "Wan-AI/Wan2.2-TI2V-5B-Diffusers",
+  "attn_scales": [],
+  "base_dim": 160,
+  "clip_output": false,
+  "decoder_base_dim": 256,
+  "dim_mult": [
+    1,
+    2,
+    4,
+    4
+  ],
+  "dropout": 0.0,
+  "in_channels": 12,
+  "is_residual": true,
+  "latents_mean": [
+    -0.2289,
+    -0.0052,
+    -0.1323,
+    -0.2339,
+    -0.2799,
+    0.0174,
+    0.1838,
+    0.1557,
+    -0.1382,
+    0.0542,
+    0.2813,
+    0.0891,
+    0.157,
+    -0.0098,
+    0.0375,
+    -0.1825,
+    -0.2246,
+    -0.1207,
+    -0.0698,
+    0.5109,
+    0.2665,
+    -0.2108,
+    -0.2158,
+    0.2502,
+    -0.2055,
+    -0.0322,
+    0.1109,
+    0.1567,
+    -0.0729,
+    0.0899,
+    -0.2799,
+    -0.123,
+    -0.0313,
+    -0.1649,
+    0.0117,
+    0.0723,
+    -0.2839,
+    -0.2083,
+    -0.052,
+    0.3748,
+    0.0152,
+    0.1957,
+    0.1433,
+    -0.2944,
+    0.3573,
+    -0.0548,
+    -0.1681,
+    -0.0667
+  ],
+  "latents_std": [
+    0.4765,
+    1.0364,
+    0.4514,
+    1.1677,
+    0.5313,
+    0.499,
+    0.4818,
+    0.5013,
+    0.8158,
+    1.0344,
+    0.5894,
+    1.0901,
+    0.6885,
+    0.6165,
+    0.8454,
+    0.4978,
+    0.5759,
+    0.3523,
+    0.7135,
+    0.6804,
+    0.5833,
+    1.4146,
+    0.8986,
+    0.5659,
+    0.7069,
+    0.5338,
+    0.4889,
+    0.4917,
+    0.4069,
+    0.4999,
+    0.6866,
+    0.4093,
+    0.5709,
+    0.6065,
+    0.6415,
+    0.4944,
+    0.5726,
+    1.2042,
+    0.5458,
+    1.6887,
+    0.3971,
+    1.06,
+    0.3943,
+    0.5537,
+    0.5444,
+    0.4089,
+    0.7468,
+    0.7744
+  ],
+  "num_res_blocks": 2,
+  "out_channels": 12,
+  "patch_size": 2,
+  "scale_factor_spatial": 16,
+  "scale_factor_temporal": 4,
+  "temperal_downsample": [
+    false,
+    true,
+    true
+  ],
+  "z_dim": 48
+}

vae/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:230496cb59ff85bc9c040487737c4062480cb61c71e697b197b4c30142f2a0da
+size 1409400600