BiliSakura committed on May 23

Commit

bb3feea

verified ·

1 Parent(s): 4e67e00

Upload folder using huggingface_hub

Browse files

Files changed (27) hide show

.gitattributes +2 -0
README.md +88 -15
__pycache__/pipeline.cpython-312.pyc +0 -0
demo.png +3 -0
demo_inference.py +161 -0
model_index.json +27 -0
pipeline.py +278 -0
scheduler/scheduler_config.json +18 -0
text_encoder/config.json +30 -0
text_encoder/generation_config.json +13 -0
text_encoder/model.safetensors +3 -0
tokenizer/merges.txt +0 -0
tokenizer/tokenizer.json +3 -0
tokenizer/tokenizer_config.json +239 -0
tokenizer/vocab.json +0 -0
transformer/__pycache__/transformer_mvsplit_dit.cpython-312.pyc +0 -0
transformer/config.json +20 -0
transformer/diffusion_pytorch_model-00001-of-00006.safetensors +3 -0
transformer/diffusion_pytorch_model-00002-of-00006.safetensors +3 -0
transformer/diffusion_pytorch_model-00003-of-00006.safetensors +3 -0
transformer/diffusion_pytorch_model-00004-of-00006.safetensors +3 -0
transformer/diffusion_pytorch_model-00005-of-00006.safetensors +3 -0
transformer/diffusion_pytorch_model-00006-of-00006.safetensors +3 -0
transformer/diffusion_pytorch_model.safetensors.index.json +0 -0
transformer/transformer_mvsplit_dit.py +350 -0
vae/config.json +40 -0
vae/diffusion_pytorch_model.safetensors +3 -0

.gitattributes CHANGED Viewed

@@ -35,3 +35,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 MVSplit-DiT-1000L/demo.png filter=lfs diff=lfs merge=lfs -text
 MVSplit-DiT-1000L/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 *tfevents* filter=lfs diff=lfs merge=lfs -text
 MVSplit-DiT-1000L/demo.png filter=lfs diff=lfs merge=lfs -text
 MVSplit-DiT-1000L/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+demo.png filter=lfs diff=lfs merge=lfs -text
+tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -11,40 +11,113 @@ tags:
   - text-to-image
   - flow-matching
   - mvsplit
 widget:
   - text: a red panda climbing a bamboo stalk
     output:
-      url: MVSplit-DiT-1000L/demo.png
 ---
-# BiliSakura/MVSplit-DiT-diffusers
-Diffusers-ready checkpoints for **MVSplit-DiT** (Mean–Variance Split Residual Diffusion Transformers), converted for local/offline use with a project-owned custom `MVSplitDiTPipeline`.
-> **Re-distribution notice:** weights are converted from [`StableKirito/mvsplit-dit-1000l`](https://huggingface.co/StableKirito/mvsplit-dit-1000l). Original work: [Mean Mode Screaming](https://huggingface.co/papers/2605.06169). License: [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0).
-## Available checkpoints
-| Subfolder | Params | Task | Resolution |
-| --- | ---: | --- | ---: |
-| [`MVSplit-DiT-1000L/`](MVSplit-DiT-1000L/) | 1000L | text-to-image | 256×256 |
-Each subfolder is a self-contained Diffusers model repo with `pipeline.py`, `model_index.json`, and component weights.
 ## Demo
-![MVSplit-DiT-1000L demo](MVSplit-DiT-1000L/demo.png)
 Prompt: *a red panda climbing a bamboo stalk* — 256×256, 35 steps, CFG 2.0.
 ## Inference
 ```bash
-cd MVSplit-DiT-1000L
 python demo_inference.py
 ```
-See [`MVSplit-DiT-1000L/README.md`](MVSplit-DiT-1000L/README.md) for full usage and recommended settings.
 ## Citation

   - text-to-image
   - flow-matching
   - mvsplit
+inference: true
 widget:
   - text: a red panda climbing a bamboo stalk
     output:
+      url: demo.png
 ---
+# MVSplit-DiT-1000L
+Self-contained Diffusers checkpoint for **MVSplit-DiT** (1000-layer Diffusion Transformer) with a custom `MVSplitDiTPipeline` (`pipeline.py`).
+> **Re-distribution notice:** weights are converted from [`StableKirito/mvsplit-dit-1000l`](https://huggingface.co/StableKirito/mvsplit-dit-1000l). Original work: [Mean Mode Screaming: Mean–Variance Split Residuals for 1000-Layer Diffusion Transformers](https://huggingface.co/papers/2605.06169). License: [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0).
 ## Demo
+![MVSplit-DiT-1000L demo](demo.png)
 Prompt: *a red panda climbing a bamboo stalk* — 256×256, 35 steps, CFG 2.0.
+## Components
+- `pipeline.py` — `MVSplitDiTPipeline`
+- `model_index.json`
+- `transformer/` — `MVSplitDiTTransformer2DModel` (bf16, 1000 layers)
+- `scheduler/` — `FlowMatchEulerDiscreteScheduler`
+- `text_encoder/` — Qwen3-0.6B (`AutoModel`)
+- `tokenizer/` — Qwen3 tokenizer
+- `vae/` — FLUX2 VAE (`AutoencoderKLFlux2`)
 ## Inference
+Run the bundled demo script:
 ```bash
 python demo_inference.py
 ```
+This writes `demo.png` with the default prompt and settings below.
+```python
+from pathlib import Path
+import importlib.util
+import sys
+import torch
+from diffusers import AutoencoderKLFlux2
+from transformers import AutoModel, AutoTokenizer
+model_dir = Path(".").resolve()
+transformer_path = model_dir / "transformer" / "transformer_mvsplit_dit.py"
+spec = importlib.util.spec_from_file_location("transformer_mvsplit_dit", transformer_path)
+module = importlib.util.module_from_spec(spec)
+sys.modules[spec.name] = module
+spec.loader.exec_module(module)
+pipe_spec = importlib.util.spec_from_file_location("mvsplit_pipeline", model_dir / "pipeline.py")
+pipe_module = importlib.util.module_from_spec(pipe_spec)
+sys.modules[pipe_spec.name] = pipe_module
+pipe_spec.loader.exec_module(pipe_module)
+transformer = module.MVSplitDiTTransformer2DModel.from_pretrained(
+    model_dir / "transformer",
+    torch_dtype=torch.bfloat16,
+    local_files_only=True,
+)
+tokenizer = AutoTokenizer.from_pretrained(model_dir / "tokenizer", local_files_only=True)
+text_encoder = AutoModel.from_pretrained(
+    model_dir / "text_encoder",
+    torch_dtype=torch.bfloat16,
+    local_files_only=True,
+)
+vae = AutoencoderKLFlux2.from_pretrained(
+    model_dir / "vae",
+    torch_dtype=torch.bfloat16,
+    local_files_only=True,
+)
+pipe = pipe_module.MVSplitDiTPipeline(
+    transformer=transformer,
+    vae=vae,
+    text_encoder=text_encoder,
+    tokenizer=tokenizer,
+    time_shift_alpha=4.0,
+)
+pipe.enable_sequential_cpu_offload()
+generator = torch.Generator(device="cpu").manual_seed(42)
+image = pipe(
+    prompt="a red panda climbing a bamboo stalk",
+    height=256,
+    width=256,
+    num_inference_steps=35,
+    guidance_scale=2.0,
+    generator=generator,
+).images[0]
+image.save("demo.png")
+```
+### Recommended settings
+| Parameter | Default | Notes |
+| --- | ---: | --- |
+| `height` / `width` | 256 | Square output resolution |
+| `num_inference_steps` | 35 | Flow-matching Euler steps |
+| `guidance_scale` | 2.0 | Classifier-free guidance |
+| `time_shift_alpha` | 4.0 | Time-shift in the flow schedule (must match training) |
+| `seed` | 42 | Reproducible sampling; seeds the `torch.Generator` passed as `generator` (`--seed` in `demo_inference.py`) |
 ## Citation

__pycache__/pipeline.cpython-312.pyc ADDED Viewed

Binary file (12.9 kB). View file

demo.png ADDED Viewed

Git LFS Details

SHA256: 6e5f8bae051bb3441bfe109f6fc509dd1ed12afbd58e74a0f257729d8a44ce9f
Pointer size: 131 Bytes
Size of remote file: 130 kB

demo_inference.py ADDED Viewed

	@@ -0,0 +1,161 @@

+#!/usr/bin/env python3
+"""Smoke-test MVSplit-DiT inference from the converted Diffusers Hub folder."""
+from __future__ import annotations
+import argparse
+import importlib.util
+import sys
+from pathlib import Path
+import torch
+from diffusers import AutoencoderKLFlux2
+from transformers import AutoModel, AutoTokenizer
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Run MVSplit-DiT inference.")
+    parser.add_argument(
+        "--model",
+        type=Path,
+        default=Path(__file__).resolve().parent,
+        help="Path to MVSplit-DiT-1000L pipeline directory.",
+    )
+    parser.add_argument(
+        "--prompt",
+        type=str,
+        default="a red panda climbing a bamboo stalk",
+        help="Text prompt for generation.",
+    )
+    parser.add_argument("--height", type=int, default=256)
+    parser.add_argument("--width", type=int, default=256)
+    parser.add_argument("--num-inference-steps", type=int, default=35)
+    parser.add_argument("--guidance-scale", type=float, default=2.0)
+    parser.add_argument("--time-shift-alpha", type=float, default=4.0)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=Path(__file__).resolve().parent / "demo.png",
+        help="Output image path. Ignored when --output-type=latent.",
+    )
+    parser.add_argument(
+        "--output-type",
+        choices=("pil", "latent"),
+        default="pil",
+        help="Return decoded image or raw latents.",
+    )
+    parser.add_argument(
+        "--skip-vae",
+        action="store_true",
+        help="Skip VAE decode even when output-type=pil (saves memory).",
+    )
+    parser.add_argument(
+        "--device",
+        choices=("auto", "cuda", "cpu"),
+        default="auto",
+        help="Execution device. auto prefers CUDA when available.",
+    )
+    parser.add_argument(
+        "--cpu-offload",
+        action="store_true",
+        help="Use sequential CPU offload instead of keeping the pipeline on GPU.",
+    )
+    return parser.parse_args()
+def _resolve_device(choice: str) -> torch.device:
+    if choice == "auto":
+        return torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    return torch.device(choice)
+def _load_pipeline_class(model_dir: Path):
+    transformer_path = model_dir / "transformer" / "transformer_mvsplit_dit.py"
+    spec = importlib.util.spec_from_file_location("transformer_mvsplit_dit", transformer_path)
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[spec.name] = module
+    spec.loader.exec_module(module)
+    pipe_spec = importlib.util.spec_from_file_location("mvsplit_pipeline", model_dir / "pipeline.py")
+    pipe_module = importlib.util.module_from_spec(pipe_spec)
+    sys.modules[pipe_spec.name] = pipe_module
+    pipe_spec.loader.exec_module(pipe_module)
+    return module.MVSplitDiTTransformer2DModel, pipe_module.MVSplitDiTPipeline
+def main() -> None:
+    args = parse_args()
+    model_dir = args.model.resolve()
+    device = _resolve_device(args.device)
+    transformer_cls, pipeline_cls = _load_pipeline_class(model_dir)
+    print(f"Loading components on {device}...", flush=True)
+    transformer = transformer_cls.from_pretrained(
+        model_dir / "transformer",
+        torch_dtype=torch.bfloat16,
+        local_files_only=True,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_dir / "tokenizer", local_files_only=True)
+    text_encoder = AutoModel.from_pretrained(
+        model_dir / "text_encoder",
+        torch_dtype=torch.bfloat16,
+        local_files_only=True,
+    )
+    vae = None
+    if not args.skip_vae and args.output_type == "pil":
+        vae = AutoencoderKLFlux2.from_pretrained(
+            model_dir / "vae",
+            torch_dtype=torch.bfloat16,
+            local_files_only=True,
+        )
+    pipe = pipeline_cls(
+        transformer=transformer,
+        scheduler=None,
+        vae=vae,
+        text_encoder=text_encoder,
+        tokenizer=tokenizer,
+        time_shift_alpha=args.time_shift_alpha,
+    )
+    if args.cpu_offload and device.type == "cuda":
+        pipe.enable_sequential_cpu_offload(gpu_id=device.index or 0)
+    else:
+        pipe.to(device)
+    print(
+        f"Running inference ({args.num_inference_steps} steps, {args.height}x{args.width})...",
+        flush=True,
+    )
+    generator_device = "cpu" if args.cpu_offload else device.type
+    generator = torch.Generator(device=generator_device).manual_seed(args.seed)
+    result = pipe(
+        prompt=args.prompt,
+        height=args.height,
+        width=args.width,
+        num_inference_steps=args.num_inference_steps,
+        guidance_scale=args.guidance_scale,
+        generator=generator,
+        output_type=args.output_type,
+    )
+    if args.output_type == "latent":
+        latents = result.images
+        print(f"latent shape={tuple(latents.shape)} dtype={latents.dtype}")
+        print(
+            "latent stats:",
+            f"min={float(latents.min()):.4f}",
+            f"max={float(latents.max()):.4f}",
+            f"mean={float(latents.mean()):.4f}",
+        )
+        return
+    image = result.images[0]
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    image.save(args.output)
+    print(f"Saved image to {args.output}")
+if __name__ == "__main__":
+    main()

model_index.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "_class_name": [
+    "pipeline",
+    "MVSplitDiTPipeline"
+  ],
+  "_diffusers_version": "0.36.0",
+  "scheduler": [
+    "diffusers",
+    "FlowMatchEulerDiscreteScheduler"
+  ],
+  "transformer": [
+    "transformer_mvsplit_dit",
+    "MVSplitDiTTransformer2DModel"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKLFlux2"
+  ],
+  "text_encoder": [
+    "transformers",
+    "AutoModel"
+  ],
+  "tokenizer": [
+    "transformers",
+    "AutoTokenizer"
+  ]
+}

pipeline.py ADDED Viewed

	@@ -0,0 +1,278 @@

+"""Hub custom pipeline: MVSplitDiTPipeline.
+Load with native Hugging Face diffusers and trust_remote_code=True.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+import torch
+from einops import rearrange
+# Prefer the real diffusers building blocks. If importing them fails, define
+# minimal stand-ins with the same names so this module can still be imported;
+# the stand-ins only provide the members this file itself uses.
+# NOTE(review): `except Exception` also hides genuine import-time errors inside
+# diffusers (not just a missing package) — confirm this best-effort is intended.
+try:
+    from diffusers.image_processor import VaeImageProcessor
+    from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+    from diffusers.utils import BaseOutput
+except Exception:
+    # Stand-in output base: a dict that copies the instance attributes into
+    # itself when a dataclass subclass finishes initialising.
+    class BaseOutput(dict):
+        def __post_init__(self):
+            self.update(self.__dict__)
+    # Stand-in pipeline base: stores components as plain attributes, always
+    # reports CPU as the execution device, and has no offload hooks to free.
+    class DiffusionPipeline:
+        def register_modules(self, **kwargs):
+            for name, module in kwargs.items():
+                setattr(self, name, module)
+        @property
+        def _execution_device(self):
+            return torch.device("cpu")
+        def maybe_free_model_hooks(self):
+            pass
+    # Stand-in image processor: returns its input unchanged for every
+    # output_type (no rescaling and no PIL conversion).
+    class VaeImageProcessor:
+        def postprocess(self, image, output_type="pil"):
+            return image
+# DiT operates on packed FLUX2 latents at 1/16 of the image resolution.
+LATENT_DOWNSAMPLE_FACTOR = 16
+@dataclass
+class MVSplitDiTPipelineOutput(BaseOutput):
+    """Return container produced by ``MVSplitDiTPipeline.__call__``."""
+    # The pipeline result for the requested ``output_type``: the latent tensor
+    # for "latent", otherwise whatever the image processor's ``postprocess``
+    # returned.
+    images: Union[torch.FloatTensor, List]
+class MVSplitDiTPipeline(DiffusionPipeline):
+    """
+    Text-to-image pipeline for MVSplit DiT.
+    Sampling follows the official mv-split Euler ODE integrator with time-shift
+    (see https://github.com/erwold/mv-split sample.py).
+    """
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+    _optional_components = ["vae", "text_encoder", "tokenizer"]
+    def __init__(
+        self,
+        transformer,
+        scheduler=None,
+        vae=None,
+        text_encoder=None,
+        tokenizer=None,
+        max_length: int = 256,
+        time_shift_alpha: float = 4.0,
+    ):
+        """Register the pipeline components and sampling settings.
+        Args:
+            transformer: Denoising model; called with the latents and
+                ``encoder_hidden_states`` during sampling.
+            scheduler: Registered as a component; the sampling loop in this
+                file does not call it.
+            vae: Decoder used to turn latents into images; may be ``None``.
+            text_encoder: Model producing the prompt hidden states.
+            tokenizer: Tokenizer paired with ``text_encoder``.
+            max_length: Truncation length (in tokens) applied to prompts.
+            time_shift_alpha: ``alpha`` passed to ``_shift_time`` when building
+                the step sizes of the Euler sampler.
+        """
+        super().__init__()
+        self.register_modules(
+            transformer=transformer,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+        )
+        # Plain attributes: these two settings are not passed to register_modules.
+        self.max_length = max_length
+        self.time_shift_alpha = time_shift_alpha
+        self.image_processor = VaeImageProcessor()
+    @staticmethod
+    def _shift_time(t: float, alpha: float) -> float:
+        return t * alpha / (1.0 + (alpha - 1.0) * t)
+    def _prepare_latents(
+        self,
+        batch_size: int,
+        height: int,
+        width: int,
+        device: torch.device,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]],
+    ) -> torch.Tensor:
+        if height % LATENT_DOWNSAMPLE_FACTOR != 0 or width % LATENT_DOWNSAMPLE_FACTOR != 0:
+            raise ValueError(
+                f"height and width must be divisible by {LATENT_DOWNSAMPLE_FACTOR}."
+            )
+        latent_height = height // LATENT_DOWNSAMPLE_FACTOR
+        latent_width = width // LATENT_DOWNSAMPLE_FACTOR
+        latent_shape = (batch_size, self.transformer.config.in_channels, latent_height, latent_width)
+        gen_device = device
+        if generator is not None and getattr(generator, "device", None) is not None:
+            gen_device = generator.device
+        noise = torch.randn(latent_shape, generator=generator, device=gen_device, dtype=torch.float32)
+        return noise.to(device)
+    def _encode_text(self, text: Union[str, List[str]], device: torch.device) -> torch.Tensor:
+        """Tokenize ``text`` and return the text encoder's final hidden states.
+        Prompts are padded to the longest prompt in the batch and truncated to
+        ``self.max_length`` tokens. If the tokenizer has no pad token, its EOS
+        token is installed as the pad token (this mutates ``self.tokenizer``).
+        Args:
+            text: One prompt or a list of prompts.
+            device: Device the token ids and attention mask are moved to.
+        Raises:
+            ValueError: if the tokenizer or text encoder is missing, or if no
+                hidden states can be found in the encoder output.
+        """
+        if self.tokenizer is None or self.text_encoder is None:
+            raise ValueError("Both tokenizer and text_encoder must be provided for text-to-image inference.")
+        if isinstance(text, str):
+            text = [text]
+        if not self.tokenizer.pad_token:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+        tokens = self.tokenizer(
+            text,
+            padding="longest",
+            truncation=True,
+            max_length=self.max_length,
+            return_tensors="pt",
+        )
+        input_ids = tokens.input_ids.to(device)
+        attention_mask = tokens.attention_mask.to(device)
+        # Use the inner `.model` when the encoder wraps one, otherwise the
+        # encoder itself.
+        text_model = getattr(self.text_encoder, "model", self.text_encoder)
+        embed_tokens = getattr(text_model, "embed_tokens", None)
+        if embed_tokens is None:
+            # No embedding layer exposed: call the encoder as-is and take the
+            # first form of hidden states its output offers.
+            outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
+            if hasattr(outputs, "last_hidden_state") and outputs.last_hidden_state is not None:
+                return outputs.last_hidden_state
+            if hasattr(outputs, "hidden_states") and outputs.hidden_states is not None:
+                return outputs.hidden_states[-1]
+            if isinstance(outputs, (tuple, list)):
+                return outputs[0]
+            raise ValueError("Unable to extract text hidden states from text_encoder output.")
+        # Embedding layer available: embed the ids explicitly and feed the
+        # model through `inputs_embeds` instead of `input_ids`.
+        inputs_embeds = embed_tokens(input_ids)
+        outputs = text_model(
+            input_ids=None,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+        )
+        return outputs.last_hidden_state
+    def _decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
+        """Decode sampler latents with the VAE.
+        Returns ``latents`` unchanged when no VAE is registered. A VAE without
+        a ``bn`` attribute decodes the latents directly. Otherwise the latents
+        are first rescaled with the ``bn`` running statistics and unpacked from
+        channel-packed patches to spatial layout before decoding.
+        Returns:
+            ``decoded.sample`` when the VAE output has that attribute,
+            otherwise the raw ``vae.decode`` result.
+        """
+        if self.vae is None:
+            return latents
+        vae = self.vae
+        if not hasattr(vae, "bn"):
+            decoded = vae.decode(latents)
+            return decoded.sample if hasattr(decoded, "sample") else decoded
+        # NOTE(review): `.float().eval()` is called on the VAE's own `bn`
+        # object; if it is an nn.Module this casts it in place — confirm.
+        bn = vae.bn.float().eval()
+        running_var = bn.running_var.view(1, -1, 1, 1)
+        running_mean = bn.running_mean.view(1, -1, 1, 1)
+        # x * sqrt(var + eps) + mean, computed in float32 and cast back: the
+        # inverse of a standardisation with these running statistics.
+        latents = (latents.float() * torch.sqrt(running_var + bn.eps) + running_mean).to(latents.dtype)
+        patch_size = getattr(vae.config, "patch_size", (2, 2))
+        if isinstance(patch_size, int):
+            patch_size = (patch_size, patch_size)
+        # Move each pi x pj patch out of the channel axis and into the two
+        # spatial axes.
+        latents = rearrange(
+            latents,
+            "... (c pi pj) i j -> ... c (i pi) (j pj)",
+            pi=patch_size[0],
+            pj=patch_size[1],
+        )
+        decoded = vae.decode(latents)
+        return decoded.sample if hasattr(decoded, "sample") else decoded
+    def _euler_sample(
+        self,
+        latents: torch.Tensor,
+        prompt_embeds: torch.Tensor,
+        negative_prompt_embeds: Optional[torch.Tensor],
+        num_inference_steps: int,
+        guidance_scale: float,
+    ) -> torch.Tensor:
+        model_dtype = next(self.transformer.parameters()).dtype
+        alpha = self.time_shift_alpha
+        do_cfg = guidance_scale > 1.0 and negative_prompt_embeds is not None
+        latents = latents.to(torch.float32)
+        for step_index in range(num_inference_steps, 0, -1):
+            t = step_index / num_inference_steps
+            t_next = (step_index - 1) / num_inference_steps
+            t_shifted = self._shift_time(t, alpha)
+            t_next_shifted = self._shift_time(t_next, alpha)
+            dt = t_shifted - t_next_shifted
+            model_input = latents.to(dtype=model_dtype)
+            if do_cfg:
+                velocity_cond = self.transformer(
+                    model_input,
+                    encoder_hidden_states=prompt_embeds.to(dtype=model_dtype),
+                    return_dict=True,
+                ).sample
+                velocity_uncond = self.transformer(
+                    model_input,
+                    encoder_hidden_states=negative_prompt_embeds.to(dtype=model_dtype),
+                    return_dict=True,
+                ).sample
+                velocity = velocity_uncond + guidance_scale * (velocity_cond - velocity_uncond)
+            else:
+                velocity = self.transformer(
+                    model_input,
+                    encoder_hidden_states=prompt_embeds.to(dtype=model_dtype),
+                    return_dict=True,
+                ).sample
+            latents = latents + dt * velocity.to(torch.float32)
+        return latents
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        height: int = 256,
+        width: int = 256,
+        num_inference_steps: int = 35,
+        guidance_scale: float = 2.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        output_type: str = "pil",
+        return_dict: bool = True,
+    ) -> Union[MVSplitDiTPipelineOutput, Tuple]:
+        """Run denoising with the MVSplit Euler sampler and decode the output."""
+        device = self._execution_device
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        batch_size = len(prompt)
+        prompt_embeds = self._encode_text(prompt, device=device)
+        negative_prompt_embeds = None
+        if guidance_scale > 1.0:
+            if negative_prompt is None:
+                negative_prompt = [""] * batch_size
+            elif isinstance(negative_prompt, str):
+                negative_prompt = [negative_prompt] * batch_size
+            elif len(negative_prompt) != batch_size:
+                raise ValueError("negative_prompt must have the same batch size as prompt.")
+            # Match mv-split sample.py: encode cond + uncond in one batch so empty
+            # prompts pick up padding from the conditional sequence length.
+            all_embeds = self._encode_text(list(prompt) + list(negative_prompt), device=device)
+            prompt_embeds, negative_prompt_embeds = all_embeds.chunk(2, dim=0)
+        latents = self._prepare_latents(
+            batch_size=batch_size,
+            height=height,
+            width=width,
+            device=device,
+            generator=generator,
+        )
+        latents = self._euler_sample(
+            latents=latents,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+        )
+        if output_type == "latent":
+            image = latents
+        else:
+            decode_dtype = next(self.vae.parameters()).dtype if self.vae is not None else latents.dtype
+            image = self._decode_latents(latents.to(decode_dtype))
+            image = image.mul(0.5).add(0.5).clamp(0, 1)
+            image = self.image_processor.postprocess(image, output_type=output_type)
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return (image,)
+        return MVSplitDiTPipelineOutput(images=image)

scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "_class_name": "FlowMatchEulerDiscreteScheduler",
+  "_diffusers_version": "0.38.0",
+  "base_image_seq_len": 256,
+  "base_shift": 0.5,
+  "invert_sigmas": false,
+  "max_image_seq_len": 4096,
+  "max_shift": 1.15,
+  "num_train_timesteps": 1000,
+  "shift": 4.0,
+  "shift_terminal": null,
+  "stochastic_sampling": false,
+  "time_shift_type": "exponential",
+  "use_beta_sigmas": false,
+  "use_dynamic_shifting": false,
+  "use_exponential_sigmas": false,
+  "use_karras_sigmas": false
+}

text_encoder/config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 40960,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

text_encoder/generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+    "bos_token_id": 151643,
+    "do_sample": true,
+    "eos_token_id": [
+        151645,
+        151643
+    ],
+    "pad_token_id": 151643,
+    "temperature": 0.6,
+    "top_k": 20,
+    "top_p": 0.95,
+    "transformers_version": "4.51.0"
+}

text_encoder/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f47f71177f32bcd101b7573ec9171e6a57f4f4d31148d38e382306f42996874b
+size 1503300328

tokenizer/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
+size 11422654

tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,239 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0].role == 'system' %}\n        {{- messages[0].content + '\\n\\n' }}\n    {%- endif %}\n    {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0].role == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n    {%- set index = (messages|length - 1) - loop.index0 %}\n    {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n        {%- set ns.multi_step_tool = false %}\n        {%- set ns.last_query_index = index %}\n    {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n    {%- if message.content is string %}\n        {%- set content = message.content %}\n    {%- else %}\n        {%- set content = '' %}\n    {%- endif %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n        {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {%- set reasoning_content = '' %}\n        {%- if message.reasoning_content is string %}\n            {%- set reasoning_content = message.reasoning_content %}\n 
       {%- else %}\n            {%- if '</think>' in content %}\n                {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n                {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n            {%- endif %}\n        {%- endif %}\n        {%- if loop.index0 > ns.last_query_index %}\n            {%- if loop.last or (not loop.last and reasoning_content) %}\n                {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n            {%- else %}\n                {{- '<|im_start|>' + message.role + '\\n' + content }}\n            {%- endif %}\n        {%- else %}\n            {{- '<|im_start|>' + message.role + '\\n' + content }}\n        {%- endif %}\n        {%- if message.tool_calls %}\n            {%- for tool_call in message.tool_calls %}\n                {%- if (loop.first and content) or (not loop.first) %}\n                    {{- '\\n' }}\n                {%- endif %}\n                {%- if tool_call.function %}\n                    {%- set tool_call = tool_call.function %}\n                {%- endif %}\n                {{- '<tool_call>\\n{\"name\": \"' }}\n                {{- tool_call.name }}\n                {{- '\", \"arguments\": ' }}\n                {%- if tool_call.arguments is string %}\n                    {{- tool_call.arguments }}\n                {%- else %}\n                    {{- tool_call.arguments | tojson }}\n                {%- endif %}\n                {{- '}\\n</tool_call>' }}\n            {%- endfor %}\n        {%- endif %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or 
(messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- '<think>\\n\\n</think>\\n\\n' }}\n    {%- endif %}\n{%- endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

tokenizer/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

transformer/__pycache__/transformer_mvsplit_dit.cpython-312.pyc ADDED Viewed

Binary file (21.4 kB). View file

transformer/config.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "_class_name": "MVSplitDiTTransformer2DModel",
+  "_diffusers_version": "0.38.0",
+  "context_dim": 1024,
+  "depth": 1000,
+  "hidden_size": 1024,
+  "in_channels": 128,
+  "init_alpha": 0.0,
+  "init_beta": 0.03,
+  "mlp_hidden_dim": 3072,
+  "norm_eps": 1e-05,
+  "num_heads": 8,
+  "num_kv_heads": 8,
+  "patch_size": 1,
+  "qkv_bias": false,
+  "rope_base": 10000,
+  "trainable_rms": true,
+  "use_rope": true,
+  "torch_dtype": "bfloat16"
+}

transformer/diffusion_pytorch_model-00001-of-00006.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5ebd66315a82685b17dcd82724bd8cb91c5d92af4cec794ab2afa94ac48c0038
+size 4998288504

transformer/diffusion_pytorch_model-00002-of-00006.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b19bf5b84b48ae73e88c039809a63eb60d3f3cb74a541abe0fcba71d387e3839
+size 4993827600

transformer/diffusion_pytorch_model-00003-of-00006.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3d3b16f617d0934d373015d9097661d37abb46c382f747eb006e1070d28bbdbb
+size 4991729616

transformer/diffusion_pytorch_model-00004-of-00006.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:95883e73ca3680ba3ecbe9cdb88ea1d7794fb384ccffa47a9646d2e8e4bbef76
+size 4991729616

transformer/diffusion_pytorch_model-00005-of-00006.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:abb4700307e188f5cfd71b4fc2c1319d22ecee354c1ad56267cc61796a2d0fbe
+size 4991729616

transformer/diffusion_pytorch_model-00006-of-00006.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bf5e9d723915a3db3e6f84e181c96c1237378f4f6f1aff2230ca542cdf42a5af
+size 2310435160

transformer/diffusion_pytorch_model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

transformer/transformer_mvsplit_dit.py ADDED Viewed

	@@ -0,0 +1,350 @@

+from dataclasses import dataclass
+import math
+from typing import Optional, Tuple, Union
+import torch
+import torch.nn.functional as F
+from torch import nn
+from diffusers.models.activations import SwiGLU
+from diffusers.models.embeddings import PatchEmbed, apply_rotary_emb
+from diffusers.models.normalization import RMSNorm
+try:
+    from diffusers.configuration_utils import ConfigMixin, register_to_config
+    from diffusers.models.modeling_utils import ModelMixin
+    from diffusers.utils import BaseOutput
+except Exception:
+    class BaseOutput(dict):
+        def __post_init__(self):
+            self.update(self.__dict__)
+    class _Config(dict):
+        def __getattr__(self, key):
+            try:
+                return self[key]
+            except KeyError as error:
+                raise AttributeError(key) from error
+    class ConfigMixin:
+        config_name = "config.json"
+    class ModelMixin(nn.Module):
+        pass
+    def register_to_config(init):
+        def wrapper(self, *args, **kwargs):
+            import inspect
+            signature = inspect.signature(init)
+            bound = signature.bind(self, *args, **kwargs)
+            bound.apply_defaults()
+            self.config = _Config({key: value for key, value in bound.arguments.items() if key != "self"})
+            init(self, *args, **kwargs)
+        return wrapper
+@dataclass
+class MVSplitDiTTransformer2DModelOutput(BaseOutput):
+    sample: torch.FloatTensor
+class TwoDimRotary(nn.Module):
+    def __init__(self, dim: int, base: int = 10000):
+        super().__init__()
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, dtype=torch.float32) / max(dim, 1)))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+    def forward(
+        self,
+        height: int,
+        width: int,
+        device: torch.device,
+        dtype: torch.dtype,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        pos_h = torch.arange(height, device=device, dtype=self.inv_freq.dtype)
+        pos_w = torch.arange(width, device=device, dtype=self.inv_freq.dtype)
+        freqs_h = torch.outer(pos_h, self.inv_freq).unsqueeze(1).repeat(1, width, 1)
+        freqs_w = torch.outer(pos_w, self.inv_freq).unsqueeze(0).repeat(height, 1, 1)
+        freqs = torch.cat([freqs_h, freqs_w], dim=-1).reshape(height * width, -1)
+        cos = freqs.cos().to(dtype=dtype)
+        sin = freqs.sin().to(dtype=dtype)
+        return cos, sin
+class QKNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6, trainable: bool = False):
+        super().__init__()
+        self.query_norm = RMSNorm(dim, eps=eps, elementwise_affine=trainable)
+        self.key_norm = RMSNorm(dim, eps=eps, elementwise_affine=trainable)
+    def forward(self, query: torch.Tensor, key: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        return self.query_norm(query), self.key_norm(key)
+class FusedMVSplitNorm1(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-5, init_alpha: float = 0.0, init_beta: float = 0.03):
+        super().__init__()
+        self.eps = eps
+        self.alpha = nn.Parameter(torch.full((dim,), init_alpha))
+        self.beta = nn.Parameter(torch.full((dim,), init_beta))
+        self.weight = nn.Parameter(torch.ones(dim))
+    def _rms_norm(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        original_dtype = hidden_states.dtype
+        hidden_states = hidden_states.float()
+        hidden_states = hidden_states * torch.rsqrt(hidden_states.pow(2).mean(dim=-1, keepdim=True) + self.eps)
+        hidden_states = hidden_states * self.weight.float()
+        return hidden_states.to(dtype=original_dtype)
+    def forward(
+        self,
+        residual: torch.Tensor,
+        update: torch.Tensor,
+        l_image_tokens: Optional[int] = None,
+    ) -> torch.Tensor:
+        if l_image_tokens is not None and 0 < l_image_tokens < residual.shape[1]:
+            residual_img, residual_txt = residual[:, :l_image_tokens], residual[:, l_image_tokens:]
+            update_img, update_txt = update[:, :l_image_tokens], update[:, l_image_tokens:]
+            residual_img_mean = residual_img.mean(dim=1, keepdim=True)
+            residual_txt_mean = residual_txt.mean(dim=1, keepdim=True)
+            update_img_mean = update_img.mean(dim=1, keepdim=True)
+            update_txt_mean = update_txt.mean(dim=1, keepdim=True)
+            update_img_var = update_img - update_img_mean
+            update_txt_var = update_txt - update_txt_mean
+            alpha = self.alpha.view(1, 1, -1)
+            beta = self.beta.view(1, 1, -1)
+            var_update = torch.cat([update_img_var * beta, update_txt_var * beta], dim=1)
+            mean_update = torch.cat(
+                [
+                    (alpha * (update_img_mean - residual_img_mean)).expand_as(residual_img),
+                    (alpha * (update_txt_mean - residual_txt_mean)).expand_as(residual_txt),
+                ],
+                dim=1,
+            )
+        else:
+            residual_mean = residual.mean(dim=1, keepdim=True)
+            update_mean = update.mean(dim=1, keepdim=True)
+            var_update = self.beta * (update - update_mean)
+            mean_update = self.alpha * (update_mean - residual_mean).expand_as(residual)
+        return self._rms_norm(residual + var_update + mean_update)
+class Attention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        num_kv_heads: int,
+        qkv_bias: bool,
+        trainable_rms: bool,
+    ):
+        super().__init__()
+        if dim % num_heads != 0:
+            raise ValueError("dim must be divisible by num_heads.")
+        self.num_heads = num_heads
+        self.num_kv_heads = num_kv_heads
+        self.head_dim = dim // num_heads
+        if self.num_heads % self.num_kv_heads != 0:
+            raise ValueError("num_heads must be divisible by num_kv_heads.")
+        self.num_groups = self.num_heads // self.num_kv_heads
+        kv_dim = self.num_kv_heads * self.head_dim
+        self.q_proj = nn.Linear(dim, dim, bias=qkv_bias)
+        self.k_proj = nn.Linear(dim, kv_dim, bias=qkv_bias)
+        self.v_proj = nn.Linear(dim, kv_dim, bias=qkv_bias)
+        self.proj = nn.Linear(dim, dim, bias=False)
+        self.qk_norm = QKNorm(self.head_dim, trainable=trainable_rms)
+        self.scale = 1.0 / math.sqrt(self.head_dim)
+    def forward(self, hidden_states: torch.Tensor, rope: Optional[Tuple[torch.Tensor, torch.Tensor]]) -> torch.Tensor:
+        batch_size, _, _ = hidden_states.shape
+        query = self.q_proj(hidden_states).reshape(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
+        key = self.k_proj(hidden_states).reshape(batch_size, -1, self.num_kv_heads, self.head_dim).transpose(1, 2)
+        value = self.v_proj(hidden_states).reshape(batch_size, -1, self.num_kv_heads, self.head_dim).transpose(1, 2)
+        if rope is not None:
+            query = apply_rotary_emb(query, rope)
+            key = apply_rotary_emb(key, rope)
+        query, key = self.qk_norm(query, key)
+        if self.num_groups > 1:
+            key = torch.repeat_interleave(key, self.num_groups, dim=1)
+            value = torch.repeat_interleave(value, self.num_groups, dim=1)
+        hidden_states = F.scaled_dot_product_attention(query, key, value, scale=self.scale)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
+        return self.proj(hidden_states)
+class DiTBlock(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        mlp_hidden_dim: int,
+        qkv_bias: bool,
+        trainable_rms: bool,
+        norm_eps: float,
+        init_alpha: float,
+        init_beta: float,
+    ):
+        super().__init__()
+        self.attn = Attention(hidden_size, num_heads, num_kv_heads, qkv_bias=qkv_bias, trainable_rms=trainable_rms)
+        self.ffn = nn.Sequential(
+            SwiGLU(hidden_size, mlp_hidden_dim, bias=qkv_bias),
+            nn.Linear(mlp_hidden_dim, hidden_size, bias=qkv_bias),
+        )
+        self.norm1 = FusedMVSplitNorm1(hidden_size, eps=norm_eps, init_alpha=init_alpha, init_beta=init_beta)
+        self.norm2 = FusedMVSplitNorm1(hidden_size, eps=norm_eps, init_alpha=init_alpha, init_beta=init_beta)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        rope: Optional[Tuple[torch.Tensor, torch.Tensor]],
+        l_image_tokens: Optional[int],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.attn(hidden_states, rope=rope)
+        hidden_states = self.norm1(residual, hidden_states, l_image_tokens=l_image_tokens)
+        residual = hidden_states
+        hidden_states = self.ffn(hidden_states)
+        hidden_states = self.norm2(residual, hidden_states, l_image_tokens=l_image_tokens)
+        return hidden_states
+class MVSplitDiTTransformer2DModel(ModelMixin, ConfigMixin):
+    config_name = "config.json"
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 128,
+        patch_size: int = 1,
+        hidden_size: int = 1024,
+        depth: int = 1000,
+        num_heads: int = 8,
+        num_kv_heads: int = 8,
+        mlp_hidden_dim: int = 3072,
+        context_dim: int = 1024,
+        qkv_bias: bool = False,
+        trainable_rms: bool = False,
+        use_rope: bool = True,
+        rope_base: int = 10000,
+        norm_eps: float = 1e-5,
+        init_alpha: float = 0.0,
+        init_beta: float = 0.03,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = in_channels
+        self.patch_size = patch_size
+        self.hidden_size = hidden_size
+        self.use_rope = use_rope
+        self.rope_dim = hidden_size // (2 * num_heads)
+        self.patch_embed = PatchEmbed(
+            height=1,
+            width=1,
+            patch_size=patch_size,
+            in_channels=in_channels,
+            embed_dim=hidden_size,
+            layer_norm=False,
+            flatten=True,
+            bias=True,
+            pos_embed_type=None,
+        )
+        self.norm_img_input = RMSNorm(hidden_size, eps=norm_eps, elementwise_affine=trainable_rms)
+        self.norm_text_input = RMSNorm(hidden_size, eps=norm_eps, elementwise_affine=trainable_rms)
+        self.context_proj = nn.Identity() if context_dim == hidden_size else nn.Linear(context_dim, hidden_size, bias=False)
+        self.rope = TwoDimRotary(self.rope_dim, base=rope_base) if use_rope else None
+        self.blocks = nn.ModuleList(
+            [
+                DiTBlock(
+                    hidden_size=hidden_size,
+                    num_heads=num_heads,
+                    num_kv_heads=num_kv_heads,
+                    mlp_hidden_dim=mlp_hidden_dim,
+                    qkv_bias=qkv_bias,
+                    trainable_rms=trainable_rms,
+                    norm_eps=norm_eps,
+                    init_alpha=init_alpha,
+                    init_beta=init_beta,
+                )
+                for _ in range(depth)
+            ]
+        )
+        self.final_proj = nn.Linear(hidden_size, patch_size * patch_size * self.out_channels, bias=True)
+    def _unpatchify(
+        self,
+        hidden_states: torch.Tensor,
+        batch_size: int,
+        height_tokens: int,
+        width_tokens: int,
+    ) -> torch.Tensor:
+        patch = self.patch_size
+        hidden_states = hidden_states.reshape(
+            batch_size, height_tokens, width_tokens, patch, patch, self.out_channels
+        )
+        hidden_states = hidden_states.permute(0, 5, 1, 3, 2, 4).reshape(
+            batch_size, self.out_channels, height_tokens * patch, width_tokens * patch
+        )
+        return hidden_states
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        timestep: Optional[Union[torch.Tensor, float]] = None,
+        return_dict: bool = True,
+    ) -> Union[MVSplitDiTTransformer2DModelOutput, Tuple[torch.Tensor]]:
+        del timestep
+        if hidden_states.ndim != 4:
+            raise ValueError("hidden_states must have shape [B, C, H, W].")
+        if encoder_hidden_states.ndim != 3:
+            raise ValueError("encoder_hidden_states must have shape [B, L_text, context_dim].")
+        batch_size, channels, height, width = hidden_states.shape
+        if channels != self.in_channels:
+            raise ValueError(f"Expected {self.in_channels} latent channels, got {channels}.")
+        if height % self.patch_size != 0 or width % self.patch_size != 0:
+            raise ValueError("Latent height and width must be divisible by patch_size.")
+        height_tokens = height // self.patch_size
+        width_tokens = width // self.patch_size
+        image_tokens = self.norm_img_input(self.patch_embed(hidden_states))
+        l_image_tokens = image_tokens.shape[1]
+        text_tokens = self.norm_text_input(self.context_proj(encoder_hidden_states))
+        sequence = torch.cat([image_tokens, text_tokens], dim=1)
+        rope = None
+        if self.use_rope and self.rope is not None:
+            cos_image, sin_image = self.rope(height_tokens, width_tokens, sequence.device, sequence.dtype)
+            text_length = text_tokens.shape[1]
+            rope_width = cos_image.shape[-1]
+            if text_length > 0:
+                cos_text = torch.ones((text_length, rope_width), device=sequence.device, dtype=sequence.dtype)
+                sin_text = torch.zeros((text_length, rope_width), device=sequence.device, dtype=sequence.dtype)
+                rope = (torch.cat([cos_image, cos_text], dim=0), torch.cat([sin_image, sin_text], dim=0))
+            else:
+                rope = (cos_image, sin_image)
+        for block in self.blocks:
+            sequence = block(sequence, rope=rope, l_image_tokens=l_image_tokens)
+        sequence = self.final_proj(sequence[:, :l_image_tokens, :])
+        sequence = self._unpatchify(sequence, batch_size=batch_size, height_tokens=height_tokens, width_tokens=width_tokens)
+        if not return_dict:
+            return (sequence,)
+        return MVSplitDiTTransformer2DModelOutput(sample=sequence)

vae/config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "_class_name": "AutoencoderKLFlux2",
+  "_diffusers_version": "0.37.0.dev0",
+  "_name_or_path": "black-forest-labs/FLUX.2-dev",
+  "act_fn": "silu",
+  "batch_norm_eps": 0.0001,
+  "batch_norm_momentum": 0.1,
+  "block_out_channels": [
+    128,
+    256,
+    512,
+    512
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D"
+  ],
+  "force_upcast": true,
+  "in_channels": 3,
+  "latent_channels": 32,
+  "layers_per_block": 2,
+  "mid_block_add_attention": true,
+  "norm_num_groups": 32,
+  "out_channels": 3,
+  "patch_size": [
+    2,
+    2
+  ],
+  "sample_size": 1024,
+  "up_block_types": [
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ],
+  "use_post_quant_conv": true,
+  "use_quant_conv": true
+}

vae/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ca70d2202afe6415bdbcb8793ba8cd99fd159cfe6192381504d6c4d3036e0f04
+size 168120878