lsmpp committed on
Commit
4960ef6
·
verified ·
1 Parent(s): d926b4c

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .venv/.gitignore +1 -0
  2. .venv/CACHEDIR.TAG +1 -0
  3. .venv/pyvenv.cfg +6 -0
  4. arch/README.md +278 -0
  5. arch/__init__.py +59 -0
  6. arch/adapter.py +86 -0
  7. arch/data_loader.py +658 -0
  8. arch/example_train.py +377 -0
  9. arch/model_loader.py +325 -0
  10. arch/pipeline.py +348 -0
  11. arch/text_encoder.py +155 -0
  12. arch/training.py +307 -0
  13. diffusers/.github/PULL_REQUEST_TEMPLATE.md +61 -0
  14. diffusers/docs/README.md +268 -0
  15. diffusers/docs/TRANSLATING.md +69 -0
  16. diffusers/scripts/conversion_ldm_uncond.py +56 -0
  17. diffusers/scripts/convert_animatediff_motion_lora_to_diffusers.py +69 -0
  18. diffusers/scripts/convert_cogvideox_to_diffusers.py +346 -0
  19. diffusers/scripts/convert_consistency_decoder.py +1128 -0
  20. diffusers/scripts/convert_dance_diffusion_to_diffusers.py +346 -0
  21. diffusers/scripts/convert_dcae_to_diffusers.py +323 -0
  22. diffusers/scripts/convert_diffusers_sdxl_lora_to_webui.py +56 -0
  23. diffusers/scripts/convert_flux_xlabs_ipadapter_to_diffusers.py +97 -0
  24. diffusers/scripts/convert_hunyuandit_controlnet_to_diffusers.py +241 -0
  25. diffusers/scripts/convert_i2vgen_to_diffusers.py +510 -0
  26. diffusers/scripts/convert_if.py +1250 -0
  27. diffusers/scripts/convert_lora_safetensor_to_diffusers.py +128 -0
  28. diffusers/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py +185 -0
  29. diffusers/scripts/convert_omnigen_to_diffusers.py +203 -0
  30. diffusers/scripts/convert_original_audioldm2_to_diffusers.py +1135 -0
  31. diffusers/scripts/convert_original_musicldm_to_diffusers.py +1056 -0
  32. diffusers/scripts/convert_pixart_sigma_to_diffusers.py +223 -0
  33. diffusers/scripts/convert_sana_to_diffusers.py +456 -0
  34. diffusers/scripts/convert_stable_cascade.py +218 -0
  35. diffusers/scripts/convert_vae_pt_to_diffusers.py +177 -0
  36. diffusers/scripts/convert_wuerstchen.py +115 -0
  37. illustrious_generated/low_quality_images.json +0 -0
  38. illustrious_generated/natural_caption_generation_report.txt +14 -0
  39. illustrious_generated/optimization_final_results.json +0 -0
  40. illustrious_generated/optimization_summary_report.txt +20 -0
  41. illustrious_generated/regeneration_results.json +0 -0
  42. peft/.gitignore +145 -0
  43. peft/.pre-commit-config.yaml +13 -0
  44. peft/LICENSE +201 -0
  45. peft/Makefile +66 -0
  46. peft/README.md +189 -0
  47. peft/pyproject.toml +50 -0
  48. peft/requirements.txt +15 -0
  49. peft/setup.py +110 -0
  50. sentence-transformers/.gitignore +69 -0
.venv/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *
.venv/CACHEDIR.TAG ADDED
@@ -0,0 +1 @@
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
.venv/pyvenv.cfg ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ home = /home/ubuntu/.local/share/uv/python/cpython-3.12.10-linux-x86_64-gnu/bin
2
+ implementation = CPython
3
+ uv = 0.7.3
4
+ version_info = 3.12.10
5
+ include-system-site-packages = false
6
+ prompt = QwenIllustrious
arch/README.md ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Qwen-SDXL Architecture Components
2
+
3
+ 本目录包含了 Qwen-SDXL 项目的解耦架构组件,便于训练和推理的模块化使用。
4
+
5
+ ## 📁 组件结构
6
+
7
+ ```
8
+ arch/
9
+ ├── __init__.py # 组件导入和导出
10
+ ├── adapter.py # Qwen 嵌入适配器
11
+ ├── text_encoder.py # Qwen 文本编码器
12
+ ├── model_loader.py # 模型加载工具
13
+ ├── pipeline.py # 推理管道
14
+ ├── training.py # 训练工具和损失函数
15
+ ├── data_loader.py # 数据加载器
16
+ ├── example_train.py # 示例训练脚本
17
+ └── README.md # 本文件
18
+ ```
19
+
20
+ ## 🧩 核心组件
21
+
22
+ ### 1. QwenEmbeddingAdapter (`adapter.py`)
23
+ 将 Qwen3 的 1024 维嵌入投影到 SDXL 兼容的维度:
24
+ - 文本嵌入: 1024 → 2048 (用于 encoder_hidden_states)
25
+ - 池化嵌入: 1024 → 1280 (用于 text_embeds)
26
+
27
+ ```python
28
+ from arch import QwenEmbeddingAdapter
29
+
30
+ adapter = QwenEmbeddingAdapter()
31
+ text_embeddings = adapter.forward_text_embeddings(qwen_embeddings)
32
+ pooled_embeddings = adapter.forward_pooled_embeddings(qwen_pooled)
33
+ ```
34
+
35
+ ### 2. QwenTextEncoder (`text_encoder.py`)
36
+ 封装 Qwen3 模型的文本编码功能:
37
+
38
+ ```python
39
+ from arch import QwenTextEncoder
40
+
41
+ text_encoder = QwenTextEncoder(model_path="path/to/qwen3")
42
+ text_emb, pooled_emb = text_encoder.encode_prompts(
43
+ ["a beautiful landscape"],
44
+ ["low quality"]
45
+ )
46
+ ```
47
+
48
+ ### 3. 模型加载器 (`model_loader.py`)
49
+ 提供各种模型组件的加载功能:
50
+
51
+ ```python
52
+ from arch import load_unet_from_safetensors, load_vae_from_safetensors
53
+
54
+ unet = load_unet_from_safetensors("unet.safetensors", "unet_config.json")
55
+ vae = load_vae_from_safetensors("vae.safetensors", "vae_config.json")
56
+ ```
57
+
58
+ ### 4. 推理管道 (`pipeline.py`)
59
+ 完整的 Qwen-SDXL 推理管道:
60
+
61
+ ```python
62
+ from arch import QwenIllustriousInference
63
+
64
+ pipeline = QwenIllustriousInference(
65
+ qwen_model_path="path/to/qwen3",
66
+ adapter_path="path/to/trained/adapter.safetensors" # 可选
67
+ )
68
+
69
+ images = pipeline.generate(
70
+ prompt="a beautiful landscape",
71
+ height=1024,
72
+ width=1024
73
+ )
74
+ ```
75
+
76
+ ## 🎓 训练组件
77
+
78
+ ### 1. DiffusionLoss (`training.py`)
79
+ 扩散训练损失函数,支持多种损失类型和 SNR 加权:
80
+
81
+ ```python
82
+ from arch import DiffusionLoss
83
+
84
+ loss_fn = DiffusionLoss(
85
+ noise_scheduler=scheduler,
86
+ loss_type="mse",
87
+ snr_gamma=5.0 # Min-SNR weighting
88
+ )
89
+ ```
90
+
91
+ ### 2. AdapterTrainingStep (`training.py`)
92
+ 适配器训练步骤,自动处理前向传播和损失计算:
93
+
94
+ ```python
95
+ from arch import AdapterTrainingStep
96
+
97
+ training_step = AdapterTrainingStep(
98
+ unet=unet,
99
+ vae=vae,
100
+ text_encoder=text_encoder,
101
+ adapter=adapter,
102
+ noise_scheduler=scheduler,
103
+ loss_fn=loss_fn
104
+ )
105
+
106
+ result = training_step.training_step(images, prompts)
107
+ loss = result["loss"]
108
+ ```
109
+
110
+ ### 3. 数据加载器 (`data_loader.py`)
111
+
112
+ #### 基础数据集
113
+ ```python
114
+ from arch import ImageCaptionDataset, create_dataloader
115
+
116
+ dataset = ImageCaptionDataset(
117
+ data_root="/path/to/images",
118
+ annotations_file="captions.jsonl"
119
+ )
120
+
121
+ dataloader = create_dataloader(dataset, batch_size=4)
122
+ ```
123
+
124
+ #### 多长宽比数据集
125
+ ```python
126
+ from arch import MultiAspectDataset
127
+
128
+ dataset = MultiAspectDataset(
129
+ data_root="/path/to/images",
130
+ annotations_file="captions.jsonl",
131
+ aspect_ratios=[(1024, 1024), (1152, 896), (896, 1152)]
132
+ )
133
+ ```
134
+
135
+ ## 🚀 使用示例
136
+
137
+ ### 1. 快速推理
138
+
139
+ ```python
140
+ from arch import QwenIllustriousInference
141
+
142
+ # 初始化管道
143
+ pipeline = QwenIllustriousInference()
144
+
145
+ # 生成图像
146
+ images = pipeline.generate(
147
+ prompt="a serene mountain landscape at sunset",
148
+ negative_prompt="low quality, blurry",
149
+ height=1024,
150
+ width=1024,
151
+ num_inference_steps=50,
152
+ guidance_scale=7.5
153
+ )
154
+
155
+ # 保存图像
156
+ images[0].save("generated_image.png")
157
+ ```
158
+
159
+ ### 2. 训练适配器
160
+
161
+ 参考 `example_train.py` 中的完整训练脚本:
162
+
163
+ ```bash
164
+ python arch/example_train.py \
165
+ --data_root /path/to/images \
166
+ --annotations_file /path/to/captions.jsonl \
167
+ --batch_size 4 \
168
+ --learning_rate 1e-4 \
169
+ --num_epochs 10 \
170
+ --output_dir ./checkpoints \
171
+ --use_wandb
172
+ ```
173
+
174
+ ### 3. 使用训练好的适配器
175
+
176
+ ```python
177
+ from arch import QwenIllustriousInference
178
+
179
+ # 加载带有训练好的适配器的管道
180
+ pipeline = QwenIllustriousInference(
181
+ adapter_path="checkpoints/adapter_epoch_10_step_5000.safetensors"
182
+ )
183
+
184
+ images = pipeline.generate("your prompt here")
185
+ ```
186
+
187
+ ## 📊 训练配置
188
+
189
+ ### 推荐的训练设置
190
+
191
+ - **学习率**: 1e-4 (AdamW)
192
+ - **批量大小**: 4-8 (根据 GPU 内存调整)
193
+ - **梯度累积**: 如果内存不足可使用
194
+ - **损失函数**: MSE 或 Huber
195
+ - **SNR 加权**: gamma=5.0 (Min-SNR)
196
+ - **EMA**: decay=0.9999
197
+ - **学习率调度**: 余弦退火 + 预热
198
+
199
+ ### 数据格式
200
+
201
+ 支持 JSON 和 JSONL 格式的标注文件:
202
+
203
+ ```jsonl
204
+ {"image": "image1.jpg", "caption": "A beautiful landscape"}
205
+ {"image": "image2.jpg", "caption": "A cute cat"}
206
+ ```
207
+
208
+ ## 🔧 自定义和扩展
209
+
210
+ ### 1. 自定义适配器架构
211
+
212
+ 修改 `adapter.py` 中的 `QwenEmbeddingAdapter` 类:
213
+
214
+ ```python
215
+ class CustomAdapter(QwenEmbeddingAdapter):
216
+ def __init__(self, qwen_dim=1024, sdxl_text_dim=2048, sdxl_pooled_dim=1280):
217
+ super().__init__(qwen_dim, sdxl_text_dim, sdxl_pooled_dim)
218
+ # 添加自定义层
219
+ self.custom_layer = nn.Linear(sdxl_text_dim, sdxl_text_dim)
220
+ ```
221
+
222
+ ### 2. 自定义损失函数
223
+
224
+ 继承 `DiffusionLoss` 类:
225
+
226
+ ```python
227
+ class CustomLoss(DiffusionLoss):
228
+ def forward(self, model_pred, target, timesteps, mask=None):
229
+ # 自定义损失计算
230
+ base_loss = super().forward(model_pred, target, timesteps, mask)
231
+ # 添加额外的正则化项
232
+ return base_loss + custom_regularization
233
+ ```
234
+
235
+ ### 3. 自定义数据加载
236
+
237
+ 继承数据集类:
238
+
239
+ ```python
240
+ class CustomDataset(ImageCaptionDataset):
241
+ def __getitem__(self, idx):
242
+ sample = super().__getitem__(idx)
243
+ # 添加自定义数据增强或预处理
244
+ return sample
245
+ ```
246
+
247
+ ## ⚠️ 注意事项
248
+
249
+ 1. **内存管理**: 使用大模型时注意 GPU 内存使用
250
+ 2. **数据类型**: 推荐使用 bfloat16 以平衡性能和精度
251
+ 3. **检查点保存**: 定期保存检查点以防止训练中断
252
+ 4. **验证集**: 使用验证集监控训练进度
253
+ 5. **梯度裁剪**: 防止梯度爆炸
254
+
255
+ ## 🐛 故障排除
256
+
257
+ ### 常见问题
258
+
259
+ 1. **CUDA 内存不足**: 减小批量大小或使用梯度累积
260
+ 2. **模型加载失败**: 检查模型路径和配置文件
261
+ 3. **生成图像质量差**: 调整学习率、损失函数或训练更多步数
262
+ 4. **训练不稳定**: 使用梯度裁剪和 EMA
263
+
264
+ ### 调试模式
265
+
266
+ 可以在各个组件中启用调试输出:
267
+
268
+ ```python
269
+ # 启用详细日志
270
+ import logging
271
+ logging.basicConfig(level=logging.DEBUG)
272
+ ```
273
+
274
+ ## 📚 参考资料
275
+
276
+ - [Stable Diffusion XL](https://arxiv.org/abs/2307.01952)
277
+ - [Qwen3 Embedding](https://github.com/QwenLM/Qwen)
278
+ - [Diffusers Library](https://github.com/huggingface/diffusers)
arch/__init__.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Architecture components for Qwen-SDXL
3
+ Qwen-SDXL 架构组件
4
+ """
5
+
6
+ # Core components
7
+ from .adapter import QwenEmbeddingAdapter
8
+ from .text_encoder import QwenTextEncoder, encode_text_with_qwen
9
+ from .model_loader import (
10
+ load_qwen_model,
11
+ load_unet_from_safetensors,
12
+ load_vae_from_safetensors,
13
+ create_scheduler,
14
+ save_model_components,
15
+ load_checkpoint
16
+ )
17
+ from .pipeline import QwenIllustriousInference
18
+
19
+ # Training components
20
+ from .training import (
21
+ DiffusionLoss,
22
+ AdapterTrainingStep,
23
+ get_cosine_schedule_with_warmup,
24
+ EMAModel
25
+ )
26
+
27
+ # Data loading components
28
+ from .data_loader import (
29
+ ImageCaptionDataset,
30
+ MultiAspectDataset,
31
+ collate_fn,
32
+ create_dataloader
33
+ )
34
+
35
+ __all__ = [
36
+ # Core components
37
+ "QwenEmbeddingAdapter",
38
+ "QwenTextEncoder",
39
+ "encode_text_with_qwen",
40
+ "load_qwen_model",
41
+ "load_unet_from_safetensors",
42
+ "load_vae_from_safetensors",
43
+ "create_scheduler",
44
+ "save_model_components",
45
+ "load_checkpoint",
46
+ "QwenIllustriousInference",
47
+
48
+ # Training components
49
+ "DiffusionLoss",
50
+ "AdapterTrainingStep",
51
+ "get_cosine_schedule_with_warmup",
52
+ "EMAModel",
53
+
54
+ # Data loading components
55
+ "ImageCaptionDataset",
56
+ "MultiAspectDataset",
57
+ "collate_fn",
58
+ "create_dataloader"
59
+ ]
arch/adapter.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Qwen Embedding Adapter
Projects Qwen3 embeddings into SDXL-compatible dimensions.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F


class QwenEmbeddingAdapter(nn.Module):
    """
    Adapter that projects Qwen3 embeddings to SDXL-compatible dimensions.

    - Text embeddings: ``qwen_dim -> sdxl_text_dim`` (default 1024 -> 2048),
      consumed as ``encoder_hidden_states``.
    - Pooled embeddings: ``qwen_dim -> sdxl_pooled_dim`` (default 1024 -> 1280),
      consumed as ``text_embeds`` in ``added_cond_kwargs``.

    Args:
        qwen_dim: Dimensionality of the incoming Qwen embeddings.
        sdxl_text_dim: Target dimensionality for per-token text embeddings.
        sdxl_pooled_dim: Target dimensionality for the pooled embedding.
        dropout: Dropout probability inside the pooled-embedding MLP.
            Previously hard-coded to 0.1; that value remains the default,
            so existing callers are unaffected.
    """

    def __init__(self, qwen_dim: int = 1024, sdxl_text_dim: int = 2048,
                 sdxl_pooled_dim: int = 1280, dropout: float = 0.1):
        super().__init__()
        self.qwen_dim = qwen_dim
        self.sdxl_text_dim = sdxl_text_dim
        self.sdxl_pooled_dim = sdxl_pooled_dim

        # Per-token projection (for encoder_hidden_states).
        self.text_projection = nn.Linear(qwen_dim, sdxl_text_dim)
        self.text_layer_norm = nn.LayerNorm(sdxl_text_dim)
        self.text_activation = nn.GELU()

        # MLP for the pooled embedding (text_embeds in added_cond_kwargs).
        self.pooled_mlp = nn.Sequential(
            nn.Linear(qwen_dim, qwen_dim * 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(qwen_dim * 2, sdxl_pooled_dim),
            nn.LayerNorm(sdxl_pooled_dim)
        )

        # Initialize weights for better training stability.
        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform init for all linear weights, zero init for biases."""
        nn.init.xavier_uniform_(self.text_projection.weight)
        nn.init.zeros_(self.text_projection.bias)

        for module in self.pooled_mlp:
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                nn.init.zeros_(module.bias)

    def forward_text_embeddings(self, qwen_embeddings: torch.Tensor) -> torch.Tensor:
        """
        Project per-token embeddings for encoder_hidden_states.

        Args:
            qwen_embeddings: Tensor of shape [batch_size, seq_len, qwen_dim].
        Returns:
            Tensor of shape [batch_size, seq_len, sdxl_text_dim].
        """
        projected = self.text_projection(qwen_embeddings)
        projected = self.text_activation(projected)
        return self.text_layer_norm(projected)

    def forward_pooled_embeddings(self, qwen_embeddings: torch.Tensor) -> torch.Tensor:
        """
        Project the pooled embedding for text_embeds (via the MLP).

        Args:
            qwen_embeddings: Tensor of shape [batch_size, qwen_dim].
        Returns:
            Tensor of shape [batch_size, sdxl_pooled_dim].
        """
        return self.pooled_mlp(qwen_embeddings)

    def forward(self, text_embeddings: torch.Tensor, pooled_embeddings: torch.Tensor):
        """
        Project both embedding kinds in one call.

        Args:
            text_embeddings: Tensor of shape [batch_size, seq_len, qwen_dim].
            pooled_embeddings: Tensor of shape [batch_size, qwen_dim].
        Returns:
            tuple: (projected_text_embeddings, projected_pooled_embeddings)
        """
        projected_text = self.forward_text_embeddings(text_embeddings)
        projected_pooled = self.forward_pooled_embeddings(pooled_embeddings)
        return projected_text, projected_pooled
arch/data_loader.py ADDED
@@ -0,0 +1,658 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Loading Utilities for QwenIllustrious
3
+ 数据加载工具 - 处理训练数据的加载和预处理,支持预计算嵌入加速
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import torch
9
+ from torch.utils.data import Dataset
10
+ from PIL import Image
11
+ import torchvision.transforms as transforms
12
+ from pathlib import Path
13
+ from typing import Dict, List, Optional, Tuple
14
+ import pickle
15
+ from tqdm import tqdm
16
+
17
+
class QwenIllustriousDataset(Dataset):
    """
    Dataset for QwenIllustrious training.

    Supported features:
    - Load images and annotations from per-image metadata .json files
    - Image preprocessing (resize/normalize)
    - On-disk caching of Qwen text embeddings
    - On-disk caching of VAE latents
    - Optional in-memory precomputation to speed up training
    """

    def __init__(
        self,
        dataset_path: str,
        qwen_text_encoder=None,
        vae=None,
        image_size: int = 1024,
        cache_dir: Optional[str] = None,
        precompute_embeddings: bool = False
    ):
        """
        Args:
            dataset_path: Root directory containing images and a "metadata" subdir.
            qwen_text_encoder: Optional encoder with an ``encode_prompts`` method;
                when None, dummy zero embeddings are returned.
            vae: Optional VAE with ``encode``; when None, dummy zero latents are returned.
            image_size: Square side length images are resized to.
            cache_dir: Optional directory for on-disk embedding/latent caches.
            precompute_embeddings: When True, __getitem__ serves tensors from
                the in-memory store filled by ``precompute_all``.
        """
        self.dataset_path = Path(dataset_path)
        self.qwen_text_encoder = qwen_text_encoder
        self.vae = vae
        self.image_size = image_size
        self.cache_dir = Path(cache_dir) if cache_dir else None
        self.precompute_embeddings = precompute_embeddings

        # Setup image transforms: square resize, then normalize to [-1, 1].
        self.image_transforms = transforms.Compose([
            transforms.Resize((image_size, image_size), interpolation=transforms.InterpolationMode.LANCZOS),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5])  # Normalize to [-1, 1]
        ])

        # Load metadata (one record per image).
        self.metadata = self._load_metadata()

        # Setup cache directories.
        # NOTE(review): text_cache_dir/vae_cache_dir only exist when cache_dir
        # is set; the _get_*_cache_path helpers are only reached behind
        # ``if self.cache_dir:`` guards, so that invariant must be preserved.
        if self.cache_dir:
            self.cache_dir.mkdir(exist_ok=True)
            self.text_cache_dir = self.cache_dir / "text_embeddings"
            self.vae_cache_dir = self.cache_dir / "vae_latents"
            self.text_cache_dir.mkdir(exist_ok=True)
            self.vae_cache_dir.mkdir(exist_ok=True)

        # In-memory store filled by precompute_all(), keyed by filename_hash.
        self.precomputed_data = {}

    def _load_metadata(self) -> List[Dict]:
        """Load all metadata JSON files from <dataset_path>/metadata.

        Returns one dict per file, augmented with 'metadata_file' (source path)
        and 'image_file' (derived as <dataset_path>/<filename_hash>.png).
        Files that fail to parse are skipped with a printed error.
        """
        metadata_dir = self.dataset_path / "metadata"
        if not metadata_dir.exists():
            raise ValueError(f"Metadata directory not found: {metadata_dir}")

        metadata_files = list(metadata_dir.glob("*.json"))

        metadata_list = []
        print(f"Loading metadata from {len(metadata_files)} files...")

        for file_path in tqdm(metadata_files, desc="Loading metadata"):
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # Add file path info; images are assumed to live next to the
                # metadata dir and be named <filename_hash>.png.
                data['metadata_file'] = str(file_path)
                data['image_file'] = str(self.dataset_path / f"{data['filename_hash']}.png")
                metadata_list.append(data)
            except Exception as e:
                print(f"Error loading {file_path}: {e}")
                continue

        print(f"Successfully loaded {len(metadata_list)} metadata files")
        return metadata_list

    def _get_text_cache_path(self, filename_hash: str) -> Path:
        """Get path for cached text embeddings (requires cache_dir to be set)."""
        return self.text_cache_dir / f"{filename_hash}_text.pt"

    def _get_vae_cache_path(self, filename_hash: str) -> Path:
        """Get path for cached VAE latents (requires cache_dir to be set)."""
        return self.vae_cache_dir / f"{filename_hash}_vae.pt"

    def _compute_text_embeddings(self, prompt: str, device='cpu') -> Dict[str, torch.Tensor]:
        """Compute text embeddings using the Qwen text encoder.

        Returns a dict with 'text_embeddings' and 'pooled_embeddings' on CPU.
        When no encoder is configured, returns zero placeholders.
        """
        if not self.qwen_text_encoder:
            # Return dummy embeddings.
            # NOTE(review): the dummy text embedding is [1, 2048] with no
            # seq_len dimension, unlike a real encoder output — confirm
            # downstream consumers tolerate this shape.
            return {
                'text_embeddings': torch.zeros(1, 2048),  # SDXL text embedding size
                'pooled_embeddings': torch.zeros(1, 1280)  # SDXL pooled embedding size
            }

        with torch.no_grad():
            # Move to device temporarily for computation.
            original_device = next(self.qwen_text_encoder.parameters()).device
            self.qwen_text_encoder.to(device)

            embeddings = self.qwen_text_encoder.encode_prompts([prompt])

            # Move back to original device.
            self.qwen_text_encoder.to(original_device)

            # NOTE(review): when the encoder returns a single tensor, the
            # text embedding is reused as the pooled embedding — verify this
            # fallback is intended.
            return {
                'text_embeddings': embeddings[0].cpu(),
                'pooled_embeddings': embeddings[1].cpu() if len(embeddings) > 1 else embeddings[0].cpu()
            }

    def _compute_vae_latents(self, image: torch.Tensor, device='cpu') -> torch.Tensor:
        """Encode an image tensor to scaled VAE latents (returned on CPU).

        Args:
            image: [C, H, W] or [B, C, H, W] tensor in [-1, 1].
        """
        if not self.vae:
            # Return dummy latents (SDXL latent space: 4 channels, 8x downscale).
            return torch.zeros(1, 4, self.image_size // 8, self.image_size // 8)

        with torch.no_grad():
            # Move to device temporarily for computation.
            original_device = next(self.vae.parameters()).device
            self.vae.to(device)

            # Add batch dimension if needed.
            if image.dim() == 3:
                image = image.unsqueeze(0)

            image = image.to(device).to(self.vae.dtype)
            latents = self.vae.encode(image).latent_dist.sample()
            # Scale by the VAE's configured scaling factor, as expected by
            # the diffusion model.
            latents = latents * self.vae.config.scaling_factor

            # Move back to original device.
            self.vae.to(original_device)

            return latents.cpu()

    def _load_or_compute_text_embeddings(self, prompt: str, filename_hash: str, device='cpu') -> Dict[str, torch.Tensor]:
        """Load cached text embeddings or compute (and cache) new ones."""
        if self.cache_dir:
            cache_path = self._get_text_cache_path(filename_hash)

            # Try to load from cache; fall through to recompute on failure.
            if cache_path.exists():
                try:
                    return torch.load(cache_path, map_location='cpu')
                except Exception as e:
                    print(f"Error loading cached text embeddings {cache_path}: {e}")

        # Compute new embeddings.
        embeddings = self._compute_text_embeddings(prompt, device)

        # Cache the embeddings (cache_path was bound above whenever
        # cache_dir is set).
        if self.cache_dir:
            try:
                torch.save(embeddings, cache_path)
            except Exception as e:
                print(f"Error saving text embeddings cache {cache_path}: {e}")

        return embeddings

    def _load_or_compute_vae_latents(self, image_path: str, filename_hash: str, device='cpu') -> torch.Tensor:
        """Load cached VAE latents or load the image, encode, and cache."""
        if self.cache_dir:
            cache_path = self._get_vae_cache_path(filename_hash)

            # Try to load from cache; fall through to recompute on failure.
            if cache_path.exists():
                try:
                    return torch.load(cache_path, map_location='cpu')
                except Exception as e:
                    print(f"Error loading cached VAE latents {cache_path}: {e}")

        # Load and process image; on failure, substitute a black image so
        # one bad file does not abort the epoch.
        try:
            image = Image.open(image_path).convert('RGB')
            image = self.image_transforms(image)
        except Exception as e:
            print(f"Error loading image {image_path}: {e}")
            image = torch.zeros(3, self.image_size, self.image_size)

        # Compute latents.
        latents = self._compute_vae_latents(image, device)

        # Cache the latents.
        if self.cache_dir:
            try:
                torch.save(latents, cache_path)
            except Exception as e:
                print(f"Error saving VAE latents cache {cache_path}: {e}")

        return latents

    def precompute_all(self, device='cuda'):
        """Precompute all embeddings and latents for faster training.

        Fills ``self.precomputed_data`` (keyed by filename_hash) so that
        __getitem__ can serve tensors without touching disk or models.
        """
        print("Precomputing all embeddings and latents...")

        for idx in tqdm(range(len(self.metadata)), desc="Precomputing"):
            metadata = self.metadata[idx]
            filename_hash = metadata['filename_hash']

            # Get prompt: prefer the natural caption, fall back to the
            # original positive prompt.
            prompt = metadata.get('natural_caption_data', {}).get('natural_caption', '')
            if not prompt:
                prompt = metadata.get('original_prompt_data', {}).get('positive_prompt', '')

            # Precompute text embeddings.
            text_embeddings = self._load_or_compute_text_embeddings(prompt, filename_hash, device)

            # Precompute VAE latents.
            vae_latents = self._load_or_compute_vae_latents(metadata['image_file'], filename_hash, device)

            # Store in memory for fast access (batch dims squeezed off).
            self.precomputed_data[filename_hash] = {
                'text_embeddings': text_embeddings['text_embeddings'].squeeze(0),
                'pooled_embeddings': text_embeddings['pooled_embeddings'].squeeze(0),
                'latents': vae_latents.squeeze(0),
                'prompt': prompt
            }

        print(f"Precomputation completed for {len(self.precomputed_data)} items")

    def __len__(self):
        # One sample per metadata record.
        return len(self.metadata)

    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        """Return one sample.

        Precomputed samples contain 'latents'; on-the-fly samples contain
        'images' instead (collate_fn distinguishes the two formats).
        """
        metadata = self.metadata[idx]
        filename_hash = metadata['filename_hash']

        if self.precompute_embeddings and filename_hash in self.precomputed_data:
            # Use precomputed data.
            data = self.precomputed_data[filename_hash]
            return {
                'text_embeddings': data['text_embeddings'],
                'pooled_embeddings': data['pooled_embeddings'],
                'latents': data['latents'],
                'prompts': data['prompt'],
                'filename_hash': filename_hash,
                'metadata': metadata
            }
        else:
            # Load data on-the-fly.

            # Load image; substitute a black image on failure.
            image_path = metadata['image_file']
            try:
                image = Image.open(image_path).convert('RGB')
                image = self.image_transforms(image)
            except Exception as e:
                print(f"Error loading image {image_path}: {e}")
                image = torch.zeros(3, self.image_size, self.image_size)

            # Get prompt (same preference order as precompute_all).
            prompt = metadata.get('natural_caption_data', {}).get('natural_caption', '')
            if not prompt:
                prompt = metadata.get('original_prompt_data', {}).get('positive_prompt', '')

            # Get text embeddings (will use cache if available).
            text_embeddings = self._load_or_compute_text_embeddings(prompt, filename_hash)

            return {
                'images': image,
                'prompts': prompt,
                'text_embeddings': text_embeddings['text_embeddings'].squeeze(0),
                'pooled_embeddings': text_embeddings['pooled_embeddings'].squeeze(0),
                'filename_hash': filename_hash,
                'metadata': metadata
            }
279
+
280
+
def collate_fn(examples: List[Dict]) -> Dict[str, torch.Tensor]:
    """Collate dataset samples into a training batch.

    Tensor fields are stacked along a new batch dimension; string and
    metadata fields are gathered into plain lists. Precomputed samples carry
    'latents', on-the-fly samples carry 'images' — the two formats are
    distinguished by the presence of 'latents' in the first example.
    """
    def stacked(key: str) -> torch.Tensor:
        return torch.stack([sample[key] for sample in examples])

    # Pick the image-side tensor field according to the sample format.
    tensor_key = 'latents' if 'latents' in examples[0] else 'images'

    batch = {
        tensor_key: stacked(tensor_key),
        'text_embeddings': stacked('text_embeddings'),
        'pooled_embeddings': stacked('pooled_embeddings'),
        'prompts': [sample['prompts'] for sample in examples],
        'filename_hash': [sample['filename_hash'] for sample in examples],
        'metadata': [sample['metadata'] for sample in examples],
    }
    return batch
303
+
304
+ import torch
305
+ from torch.utils.data import Dataset, DataLoader
306
+ from PIL import Image
307
+ import json
308
+ import os
309
+ from typing import List, Dict, Any, Optional, Tuple, Union
310
+ import torchvision.transforms as transforms
311
+ import random
312
+
313
+
class ImageCaptionDataset(Dataset):
    """Image-caption pair dataset backed by a JSON/JSONL annotation file.

    Each sample yields a preprocessed image tensor normalized to [-1, 1],
    its caption (truncated to ``max_caption_length`` characters), and the
    absolute image path.
    """

    def __init__(
        self,
        data_root: str,
        annotations_file: str,
        image_size: int = 1024,
        center_crop: bool = True,
        random_flip: bool = True,
        caption_column: str = "caption",
        image_column: str = "image",
        max_caption_length: int = 512
    ):
        self.data_root = data_root
        self.image_size = image_size
        self.caption_column = caption_column
        self.image_column = image_column
        self.max_caption_length = max_caption_length

        # Annotation records, filtered down to usable samples.
        self.annotations = self._load_annotations(annotations_file)

        # Preprocessing pipeline applied to every loaded image.
        self.image_transforms = self._setup_transforms(image_size, center_crop, random_flip)

        print(f"📚 数据集加载完成: {len(self.annotations)} 个样本")

    def _load_annotations(self, annotations_file: str) -> List[Dict]:
        """Read the annotation file and keep only records with a usable caption."""
        if annotations_file.endswith('.json'):
            with open(annotations_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        elif annotations_file.endswith('.jsonl'):
            with open(annotations_file, 'r', encoding='utf-8') as f:
                data = [json.loads(line) for line in f if line.strip()]
        else:
            raise ValueError(f"Unsupported annotation file format: {annotations_file}")

        # A record is valid when both columns are present and the caption is
        # a non-blank string.
        valid_data = [
            item for item in data
            if self.caption_column in item
            and self.image_column in item
            and isinstance(item[self.caption_column], str)
            and item[self.caption_column].strip()
        ]

        print(f"📋 有效样本数: {len(valid_data)} / {len(data)}")
        return valid_data

    def _setup_transforms(self, size: int, center_crop: bool, random_flip: bool):
        """Build the torchvision preprocessing pipeline."""
        if center_crop:
            # Resize the short side, then crop a centered square.
            steps = [
                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
                transforms.CenterCrop(size),
            ]
        else:
            # Squash directly to a square (aspect ratio may change).
            steps = [
                transforms.Resize((size, size), interpolation=transforms.InterpolationMode.BILINEAR),
            ]

        if random_flip:
            steps.append(transforms.RandomHorizontalFlip(p=0.5))

        # Tensor conversion plus mapping [0, 1] -> [-1, 1].
        steps += [
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5]),
        ]
        return transforms.Compose(steps)

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """Return one sample: {'images', 'captions', 'image_paths'}."""
        annotation = self.annotations[idx]
        image_path = os.path.join(self.data_root, annotation[self.image_column])

        try:
            image = Image.open(image_path)
            if image.mode != 'RGB':
                image = image.convert('RGB')
        except Exception as e:
            # Fall back to a black placeholder so one bad file cannot abort
            # a whole training epoch.
            print(f"⚠️ 加载图像失败 {image_path}: {e}")
            image = Image.new('RGB', (self.image_size, self.image_size), (0, 0, 0))

        pixels = self.image_transforms(image)

        # Truncate over-long captions; shorter captions pass through unchanged.
        caption = annotation[self.caption_column][:self.max_caption_length]

        return {
            "images": pixels,
            "captions": caption,
            "image_paths": image_path,
        }
427
+
428
+
429
class MultiAspectDataset(Dataset):
    """
    Dataset that supports multiple aspect ratios
    支持多种长宽比的数据集

    Each image is assigned to the aspect-ratio "bucket" whose width/height
    ratio is closest to the image's own ratio; at fetch time the image is
    resized to that bucket's resolution.

    Args:
        data_root: Root directory that image paths are relative to.
        annotations_file: JSON (list of dicts) or JSONL annotations file.
        base_size: Nominal base resolution (kept for API compatibility).
        aspect_ratios: List of (width, height) buckets; defaults to the
            standard SDXL bucket set.
        bucket_tolerance: Stored tolerance value (not currently used by the
            bucket-assignment logic).
        caption_column: Key holding the caption in each annotation dict.
        image_column: Key holding the relative image path.
        max_caption_length: Captions longer than this are truncated.
    """

    def __init__(
        self,
        data_root: str,
        annotations_file: str,
        base_size: int = 1024,
        aspect_ratios: List[Tuple[int, int]] = None,
        bucket_tolerance: float = 0.1,
        caption_column: str = "caption",
        image_column: str = "image",
        max_caption_length: int = 512
    ):
        self.data_root = data_root
        self.base_size = base_size
        self.caption_column = caption_column
        self.image_column = image_column
        self.max_caption_length = max_caption_length

        # Default aspect ratios for SDXL
        if aspect_ratios is None:
            aspect_ratios = [
                (1024, 1024),  # 1:1
                (1152, 896),   # 9:7
                (896, 1152),   # 7:9
                (1216, 832),   # 3:2
                (832, 1216),   # 2:3
                (1344, 768),   # 7:4
                (768, 1344),   # 4:7
                (1536, 640),   # 12:5
                (640, 1536),   # 5:12
            ]

        self.aspect_ratios = aspect_ratios
        self.bucket_tolerance = bucket_tolerance

        # Load and bucket annotations
        self.annotations = self._load_and_bucket_annotations(annotations_file)

        print(f"📚 多长宽比数据集加载完成: {len(self.annotations)} 个样本")
        self._print_bucket_stats()

    def _load_and_bucket_annotations(self, annotations_file: str) -> List[Dict]:
        """Load annotations and assign each entry to an aspect-ratio bucket.

        Raises:
            ValueError: If the annotations file is neither .json nor .jsonl.
                (Previously an unsupported extension fell through and caused an
                opaque NameError on the undefined ``data`` variable.)
        """
        if annotations_file.endswith('.json'):
            with open(annotations_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        elif annotations_file.endswith('.jsonl'):
            data = []
            with open(annotations_file, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():
                        data.append(json.loads(line))
        else:
            raise ValueError(
                f"Unsupported annotations file format: {annotations_file} "
                "(expected .json or .jsonl)"
            )

        bucketed_data = []

        for item in data:
            # Skip entries missing required columns or with empty captions.
            if self.caption_column not in item or self.image_column not in item:
                continue

            caption = item[self.caption_column]
            if not isinstance(caption, str) or not caption.strip():
                continue

            # Try to get image dimensions to assign a bucket.
            image_path = os.path.join(self.data_root, item[self.image_column])
            try:
                with Image.open(image_path) as img:
                    width, height = img.size
                    aspect_ratio = width / height

                    # Find best matching bucket for this image's ratio.
                    best_bucket = self._find_best_bucket(aspect_ratio)

                    item_copy = item.copy()
                    item_copy['bucket_width'] = best_bucket[0]
                    item_copy['bucket_height'] = best_bucket[1]
                    item_copy['original_width'] = width
                    item_copy['original_height'] = height

                    bucketed_data.append(item_copy)

            except Exception as e:
                print(f"⚠️ 无法获取图像尺寸 {image_path}: {e}")
                # Fall back to the default 1:1 bucket so the sample is kept.
                item_copy = item.copy()
                item_copy['bucket_width'] = 1024
                item_copy['bucket_height'] = 1024
                item_copy['original_width'] = 1024
                item_copy['original_height'] = 1024
                bucketed_data.append(item_copy)

        return bucketed_data

    def _find_best_bucket(self, aspect_ratio: float) -> Tuple[int, int]:
        """Return the (width, height) bucket whose ratio is closest to ``aspect_ratio``."""
        best_bucket = self.aspect_ratios[0]
        best_diff = float('inf')

        for bucket_w, bucket_h in self.aspect_ratios:
            bucket_ratio = bucket_w / bucket_h
            diff = abs(aspect_ratio - bucket_ratio)

            if diff < best_diff:
                best_diff = diff
                best_bucket = (bucket_w, bucket_h)

        return best_bucket

    def _print_bucket_stats(self):
        """Print statistics about bucket distribution."""
        bucket_counts = {}
        for item in self.annotations:
            bucket = (item['bucket_width'], item['bucket_height'])
            bucket_counts[bucket] = bucket_counts.get(bucket, 0) + 1

        print("📊 长宽比分布:")
        for bucket, count in sorted(bucket_counts.items()):
            ratio = bucket[0] / bucket[1]
            print(f"  {bucket[0]}×{bucket[1]} (比例 {ratio:.2f}): {count} 个样本")

    def _get_transforms(self, target_width: int, target_height: int):
        """Build the per-sample transform pipeline for a specific bucket size."""
        return transforms.Compose([
            transforms.Resize((target_height, target_width), interpolation=transforms.InterpolationMode.BILINEAR),
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5])  # scale pixels to [-1, 1]
        ])

    def __len__(self) -> int:
        """Return the number of bucketed samples."""
        return len(self.annotations)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """Return one sample resized to its bucket resolution."""
        annotation = self.annotations[idx]

        # Target dimensions come from the pre-assigned bucket.
        target_width = annotation['bucket_width']
        target_height = annotation['bucket_height']

        # Load and transform image; black fallback keeps training alive.
        image_path = os.path.join(self.data_root, annotation[self.image_column])
        try:
            image = Image.open(image_path)
            if image.mode != 'RGB':
                image = image.convert('RGB')
        except Exception as e:
            print(f"⚠️ 加载图像失败 {image_path}: {e}")
            image = Image.new('RGB', (target_width, target_height), (0, 0, 0))

        transforms_fn = self._get_transforms(target_width, target_height)
        image = transforms_fn(image)

        # Truncate over-long captions.
        caption = annotation[self.caption_column]
        if len(caption) > self.max_caption_length:
            caption = caption[:self.max_caption_length]

        return {
            "images": image,
            "captions": caption,
            "image_paths": image_path,
            "width": target_width,
            "height": target_height
        }
601
+
602
+
603
+ # def collate_fn(batch: List[Dict[str, Any]]) -> Dict[str, Any]:
604
+ # """
605
+ # Custom collate function for batching
606
+ # 自定义批处理整理函数
607
+ # """
608
+ # # Check if all images have the same size
609
+ # sizes = [(item["images"].shape[-2], item["images"].shape[-1]) for item in batch]
610
+ # if len(set(sizes)) == 1:
611
+ # # All same size, can batch normally
612
+ # images = torch.stack([item["images"] for item in batch])
613
+ # captions = [item["captions"] for item in batch]
614
+
615
+ # result = {
616
+ # "images": images,
617
+ # "captions": captions,
618
+ # "image_paths": [item["image_paths"] for item in batch]
619
+ # }
620
+
621
+ # # Add width/height if available
622
+ # if "width" in batch[0]:
623
+ # result["widths"] = [item["width"] for item in batch]
624
+ # result["heights"] = [item["height"] for item in batch]
625
+
626
+ # return result
627
+ # else:
628
+ # # Different sizes, return as list
629
+ # return {
630
+ # "images": [item["images"] for item in batch],
631
+ # "captions": [item["captions"] for item in batch],
632
+ # "image_paths": [item["image_paths"] for item in batch],
633
+ # "widths": [item.get("width", item["images"].shape[-1]) for item in batch],
634
+ # "heights": [item.get("height", item["images"].shape[-2]) for item in batch]
635
+ # }
636
+
637
+
638
def create_dataloader(
    dataset: Dataset,
    batch_size: int = 4,
    shuffle: bool = True,
    num_workers: int = 4,
    pin_memory: bool = True,
    drop_last: bool = True,
    collate_fn=None
) -> DataLoader:
    """
    Create dataloader with appropriate settings
    创建具有适当设置的数据加载器

    Args:
        dataset: Dataset to wrap.
        batch_size: Samples per batch.
        shuffle: Whether to reshuffle every epoch.
        num_workers: Worker subprocesses for loading.
        pin_memory: Pin host memory for faster GPU transfer.
        drop_last: Drop the final incomplete batch.
        collate_fn: Optional custom batching function; ``None`` uses the
            PyTorch default collation.

    Returns:
        Configured ``DataLoader``.
    """
    # BUGFIX: this previously passed the module-level name ``collate_fn``,
    # whose only definition in this file is commented out, raising NameError
    # at call time. It is now an explicit optional parameter.
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory,
        drop_last=drop_last,
        collate_fn=collate_fn
    )
arch/example_train.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Example Training Script using Arch Components
3
+ 使用架构组件的示例训练脚本
4
+ """
5
+
6
+ import torch
7
+ import torch.optim as optim
8
+ from torch.utils.data import DataLoader
9
+ import os
10
+ import argparse
11
+ from tqdm import tqdm
12
+ import wandb
13
+ from typing import Optional
14
+
15
+ # Import arch components
16
+ from arch import (
17
+ QwenTextEncoder,
18
+ QwenEmbeddingAdapter,
19
+ load_unet_from_safetensors,
20
+ load_vae_from_safetensors,
21
+ create_scheduler,
22
+ DiffusionLoss,
23
+ AdapterTrainingStep,
24
+ get_cosine_schedule_with_warmup,
25
+ EMAModel,
26
+ ImageCaptionDataset,
27
+ MultiAspectDataset,
28
+ create_dataloader
29
+ )
30
+
31
+
32
def parse_args():
    """Parse command-line options for adapter training.

    Only ``--data_root`` and ``--annotations_file`` are required; every other
    option carries a default matching the project layout.
    """
    p = argparse.ArgumentParser(description="Train Qwen-SDXL Adapter")

    # -- Model paths ------------------------------------------------------
    p.add_argument("--qwen_model_path", type=str, default="models/Qwen3-Embedding-0.6B")
    p.add_argument("--unet_path", type=str, default="models/extracted_components/waiNSFWIllustrious_v140_unet.safetensors")
    p.add_argument("--unet_config_path", type=str, default="models/extracted_components/waiNSFWIllustrious_v140_unet_config.json")
    p.add_argument("--vae_path", type=str, default="models/extracted_components/waiNSFWIllustrious_v140_vae.safetensors")
    p.add_argument("--vae_config_path", type=str, default="models/extracted_components/waiNSFWIllustrious_v140_vae_config.json")

    # -- Data -------------------------------------------------------------
    p.add_argument("--data_root", type=str, required=True, help="Root directory of training images")
    p.add_argument("--annotations_file", type=str, required=True, help="Path to annotations file (JSON/JSONL)")
    p.add_argument("--caption_column", type=str, default="caption")
    p.add_argument("--image_column", type=str, default="image")
    p.add_argument("--use_multi_aspect", action="store_true", help="Use multi-aspect ratio dataset")

    # -- Training ---------------------------------------------------------
    p.add_argument("--batch_size", type=int, default=4)
    p.add_argument("--learning_rate", type=float, default=1e-4)
    p.add_argument("--num_epochs", type=int, default=10)
    p.add_argument("--warmup_steps", type=int, default=500)
    p.add_argument("--gradient_accumulation_steps", type=int, default=1)
    p.add_argument("--max_grad_norm", type=float, default=1.0)

    # -- Loss -------------------------------------------------------------
    p.add_argument("--loss_type", type=str, default="mse", choices=["mse", "l1", "huber"])
    p.add_argument("--snr_gamma", type=float, default=None, help="SNR gamma for loss weighting")
    p.add_argument("--use_v_parameterization", action="store_true")

    # -- Optimization -----------------------------------------------------
    p.add_argument("--optimizer", type=str, default="adamw", choices=["adamw", "adam"])
    p.add_argument("--weight_decay", type=float, default=0.01)
    p.add_argument("--use_ema", action="store_true", help="Use EMA for adapter")
    p.add_argument("--ema_decay", type=float, default=0.9999)

    # -- Checkpointing ----------------------------------------------------
    p.add_argument("--output_dir", type=str, default="./checkpoints")
    p.add_argument("--save_steps", type=int, default=1000)
    p.add_argument("--resume_from_checkpoint", type=str, default=None)

    # -- Logging ----------------------------------------------------------
    p.add_argument("--logging_steps", type=int, default=50)
    p.add_argument("--use_wandb", action="store_true")
    p.add_argument("--wandb_project", type=str, default="qwen-sdxl-training")
    p.add_argument("--wandb_run_name", type=str, default=None)

    # -- Hardware ---------------------------------------------------------
    p.add_argument("--device", type=str, default="cuda")
    p.add_argument("--dtype", type=str, default="bfloat16", choices=["float32", "float16", "bfloat16"])
    p.add_argument("--num_workers", type=int, default=4)

    return p.parse_args()
85
+
86
+
87
def setup_models(args):
    """Instantiate every model component required for training.

    Returns:
        Tuple of (text_encoder, adapter, unet, vae, noise_scheduler, dtype).
    """
    print("🚀 设置模型组件...")

    # Resolve the requested precision once; argparse restricts the choices,
    # so a direct lookup is safe.
    dtype = {
        "float32": torch.float32,
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
    }[args.dtype]

    # Frozen Qwen text encoder (only the adapter is trained).
    print("📝 加载 Qwen 文本编码器...")
    text_encoder = QwenTextEncoder(
        model_path=args.qwen_model_path,
        device=args.device,
        freeze_encoder=True,
    )

    # Trainable embedding adapter.
    print("🔧 初始化适配器...")
    adapter = QwenEmbeddingAdapter()
    adapter.to(args.device, dtype)

    # Diffusion backbone.
    print("🏗️ 加载 UNet...")
    unet = load_unet_from_safetensors(
        args.unet_path,
        args.unet_config_path,
        args.device,
        dtype,
    )

    # Latent autoencoder.
    print("🎨 加载 VAE...")
    vae = load_vae_from_safetensors(
        args.vae_path,
        args.vae_config_path,
        args.device,
        dtype,
    )

    # DDPM noise schedule for training.
    print("⏰ 创建调度器...")
    noise_scheduler = create_scheduler("DDPM")

    return text_encoder, adapter, unet, vae, noise_scheduler, dtype
135
+
136
+
137
def setup_data(args):
    """Build the training dataset (single- or multi-aspect) and its DataLoader."""
    print("📚 设置数据加载器...")

    # Both dataset classes share the same constructor signature, so choose
    # the class first and construct once.
    dataset_cls = MultiAspectDataset if args.use_multi_aspect else ImageCaptionDataset
    dataset = dataset_cls(
        data_root=args.data_root,
        annotations_file=args.annotations_file,
        caption_column=args.caption_column,
        image_column=args.image_column,
    )

    return create_dataloader(
        dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True,
    )
166
+
167
+
168
def setup_training(args, adapter, noise_scheduler):
    """Create the loss function, optimizer, and (optionally) an EMA tracker.

    Returns:
        Tuple of (loss_fn, optimizer, ema) where ``ema`` is None unless
        ``--use_ema`` was passed.
    """
    print("🎯 设置训练组件...")

    # Diffusion training loss configured from the CLI flags.
    loss_fn = DiffusionLoss(
        noise_scheduler=noise_scheduler,
        loss_type=args.loss_type,
        snr_gamma=args.snr_gamma,
        use_v_parameterization=args.use_v_parameterization,
    )

    # Only the adapter's parameters are optimized.
    if args.optimizer == "adamw":
        optimizer = optim.AdamW(
            adapter.parameters(),
            lr=args.learning_rate,
            weight_decay=args.weight_decay,
            betas=(0.9, 0.999),
        )
    else:
        optimizer = optim.Adam(
            adapter.parameters(),
            lr=args.learning_rate,
            weight_decay=args.weight_decay,
        )

    ema = EMAModel(adapter, decay=args.ema_decay) if args.use_ema else None

    return loss_fn, optimizer, ema
201
+
202
+
203
def train_step(training_step_fn, batch, optimizer, args, ema=None):
    """Run one optimization step over a batch and return the average loss.

    Handles two batch layouts: a stacked tensor batch (uniform image sizes)
    and a list-of-tensors batch (mixed sizes from the multi-aspect dataset),
    which is processed sample-by-sample.

    NOTE(review): although the loss is divided by
    ``args.gradient_accumulation_steps``, ``optimizer.step()`` and
    ``zero_grad()`` run on every call, so gradients never actually
    accumulate across calls — with accumulation > 1 this effectively just
    scales the learning rate down. Confirm intended behavior.
    """
    # Handle different batch formats
    if isinstance(batch["images"], list):
        # Multi-size batch, train one by one (each sample becomes a batch of 1)
        total_loss = 0
        num_samples = 0

        for i in range(len(batch["images"])):
            images = batch["images"][i].unsqueeze(0)
            captions = [batch["captions"][i]]

            step_output = training_step_fn.training_step(images, captions)
            loss = step_output["loss"] / args.gradient_accumulation_steps

            # Gradients from each sample add up in .grad until optimizer.step()
            loss.backward()
            total_loss += loss.item()
            num_samples += 1

        avg_loss = total_loss / num_samples if num_samples > 0 else 0
    else:
        # Regular batch
        images = batch["images"]
        captions = batch["captions"]

        step_output = training_step_fn.training_step(images, captions)
        loss = step_output["loss"] / args.gradient_accumulation_steps

        loss.backward()
        avg_loss = loss.item()

    # Gradient clipping and optimization step (adapter params only)
    torch.nn.utils.clip_grad_norm_(training_step_fn.adapter.parameters(), args.max_grad_norm)
    optimizer.step()
    optimizer.zero_grad()

    # Update EMA shadow weights after the parameter update
    if ema is not None:
        ema.update()

    return avg_loss
244
+
245
+
246
def save_checkpoint(adapter, optimizer, ema, epoch, step, args):
    """Persist adapter weights (plus an EMA copy) and resumable training state."""
    os.makedirs(args.output_dir, exist_ok=True)
    tag = f"epoch_{epoch}_step_{step}"

    # Adapter weights: prefer the model's own serializer when it has one.
    adapter_path = os.path.join(args.output_dir, f"adapter_{tag}.safetensors")
    if hasattr(adapter, 'save_adapter'):
        adapter.save_adapter(adapter_path)
    else:
        import safetensors.torch
        safetensors.torch.save_file(adapter.state_dict(), adapter_path)

    # EMA weights: swap in the shadow params, dump them, then restore.
    if ema is not None:
        ema.apply_shadow()
        import safetensors.torch
        safetensors.torch.save_file(
            adapter.state_dict(),
            os.path.join(args.output_dir, f"adapter_ema_{tag}.safetensors"),
        )
        ema.restore()

    # Optimizer state + bookkeeping so training can resume.
    torch.save(
        {
            "epoch": epoch,
            "step": step,
            "optimizer_state_dict": optimizer.state_dict(),
            "args": args,
        },
        os.path.join(args.output_dir, f"training_state_{tag}.pt"),
    )

    print(f"💾 检查点已保存: epoch {epoch}, step {step}")
276
+
277
+
278
def main():
    """Entry point: wire up models, data, and optimization, then run the
    training loop with periodic logging and checkpointing."""
    args = parse_args()

    # Setup wandb (optional experiment tracking)
    if args.use_wandb:
        wandb.init(
            project=args.wandb_project,
            name=args.wandb_run_name,
            config=vars(args)
        )

    # Setup models (frozen encoder/UNet/VAE; only the adapter trains)
    text_encoder, adapter, unet, vae, noise_scheduler, dtype = setup_models(args)

    # Setup data
    dataloader = setup_data(args)

    # Setup training (loss, optimizer, optional EMA)
    loss_fn, optimizer, ema = setup_training(args, adapter, noise_scheduler)

    # Create training step function bundling all frozen components
    training_step_fn = AdapterTrainingStep(
        unet=unet,
        vae=vae,
        text_encoder=text_encoder,
        adapter=adapter,
        noise_scheduler=noise_scheduler,
        loss_fn=loss_fn,
        device=args.device,
        dtype=dtype
    )

    # Setup learning rate scheduler (cosine decay after linear warmup)
    total_steps = len(dataloader) * args.num_epochs // args.gradient_accumulation_steps
    lr_scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=total_steps
    )

    print(f"🎓 开始训练: {args.num_epochs} epochs, {len(dataloader)} steps/epoch")
    print(f"📊 总训练步数: {total_steps}")

    # Training loop
    global_step = 0

    for epoch in range(args.num_epochs):
        adapter.train()
        epoch_loss = 0

        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{args.num_epochs}")

        for step, batch in enumerate(progress_bar):
            step_loss = train_step(training_step_fn, batch, optimizer, args, ema)
            epoch_loss += step_loss

            # Update learning rate once per optimizer step
            lr_scheduler.step()

            global_step += 1

            # Logging (progress bar always; wandb when enabled)
            if global_step % args.logging_steps == 0:
                avg_loss = epoch_loss / (step + 1)
                current_lr = lr_scheduler.get_last_lr()[0]

                progress_bar.set_postfix({
                    "loss": f"{step_loss:.4f}",
                    "avg_loss": f"{avg_loss:.4f}",
                    "lr": f"{current_lr:.2e}"
                })

                if args.use_wandb:
                    wandb.log({
                        "train/loss": step_loss,
                        "train/avg_loss": avg_loss,
                        "train/learning_rate": current_lr,
                        "train/epoch": epoch,
                        "train/step": global_step
                    })

            # Save checkpoint at a fixed step interval
            if global_step % args.save_steps == 0:
                save_checkpoint(adapter, optimizer, ema, epoch, global_step, args)

        # End of epoch
        avg_epoch_loss = epoch_loss / len(dataloader)
        print(f"📈 Epoch {epoch+1} 完成,平均损失: {avg_epoch_loss:.4f}")

        # Save epoch checkpoint (note: epoch+1 labels the *completed* epoch)
        save_checkpoint(adapter, optimizer, ema, epoch+1, global_step, args)

    print("🎉 训练完成!")

    if args.use_wandb:
        wandb.finish()
374
+
375
+
376
# Script entry point: run training only when executed directly.
if __name__ == "__main__":
    main()
arch/model_loader.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Model Loader Utilities
3
+ 模型加载工具 - 用于加载各种模型组件
4
+ """
5
+
6
+ import torch
7
+ import json
8
+ import safetensors.torch
9
+ from typing import Optional
10
+
11
+
12
def load_unet_from_safetensors(unet_path: str, config_path: str, device: str = "cuda", dtype: torch.dtype = torch.bfloat16):
    """
    Load UNet from safetensors file
    从 safetensors 文件加载 UNet

    Args:
        unet_path: Path to UNet safetensors file
        config_path: Path to UNet config JSON file
        device: Device to load model on
        dtype: Data type for model weights

    Returns:
        UNet2DConditionModel, or None if any stage of loading fails
    """
    try:
        from diffusers import UNet2DConditionModel

        # Build the architecture from the saved config, then fill in weights.
        with open(config_path, 'r') as f:
            config = json.load(f)
        model = UNet2DConditionModel.from_config(config)

        model.load_state_dict(safetensors.torch.load_file(unet_path))
        model.to(device, dtype)
        return model
    except Exception as e:
        # Best-effort loader: callers check for None rather than catching.
        print(f"Error loading UNet: {e}")
        return None
45
+
46
+
47
def load_vae_from_safetensors(vae_path: str, config_path: str, device: str = "cuda", dtype: torch.dtype = torch.bfloat16):
    """
    Load VAE from safetensors file
    从 safetensors 文件加载 VAE

    Args:
        vae_path: Path to VAE safetensors file
        config_path: Path to VAE config JSON file
        device: Device to load model on
        dtype: Data type for model weights

    Returns:
        AutoencoderKL, or None if any stage of loading fails
    """
    try:
        from diffusers import AutoencoderKL

        # Build the architecture from the saved config, then fill in weights.
        with open(config_path, 'r') as f:
            config = json.load(f)
        model = AutoencoderKL.from_config(config)

        model.load_state_dict(safetensors.torch.load_file(vae_path))
        model.to(device, dtype)
        return model
    except Exception as e:
        # Best-effort loader: callers check for None rather than catching.
        print(f"Error loading VAE: {e}")
        return None
80
+
81
+
82
def create_scheduler(scheduler_type: str = "EulerAncestral", model_id: str = "stabilityai/stable-diffusion-xl-base-1.0"):
    """
    Create scheduler for diffusion process
    创建扩散过程调度器

    Args:
        scheduler_type: One of "DDPM", "DDIM", "DPMSolverMultistep",
            "EulerAncestral"; any other value falls back to DDPM.
        model_id: Model ID whose "scheduler" subfolder holds the config.

    Returns:
        Scheduler object, or None if creation fails
    """
    try:
        # Resolve the scheduler class; unknown names fall back to DDPM.
        if scheduler_type == "DDPM":
            from diffusers import DDPMScheduler as scheduler_cls
        elif scheduler_type == "DDIM":
            from diffusers import DDIMScheduler as scheduler_cls
        elif scheduler_type == "DPMSolverMultistep":
            from diffusers import DPMSolverMultistepScheduler as scheduler_cls
        elif scheduler_type == "EulerAncestral":
            from diffusers import EulerAncestralDiscreteScheduler as scheduler_cls
        else:
            print(f"Unsupported scheduler type: {scheduler_type}, using DDPM")
            from diffusers import DDPMScheduler as scheduler_cls

        return scheduler_cls.from_pretrained(model_id, subfolder="scheduler")
    except Exception as e:
        print(f"Error creating scheduler: {e}")
        return None
116
+
117
+
118
def load_qwen_model(model_path: str, device: str = "cuda"):
    """
    Load Qwen3 embedding model
    加载 Qwen3 嵌入模型

    Args:
        model_path: Path to Qwen model
        device: Device to load model on

    Returns:
        SentenceTransformer model, or None when the dependency is missing
        or loading fails
    """
    try:
        from sentence_transformers import SentenceTransformer
        encoder = SentenceTransformer(model_path)
        encoder.to(device)
        return encoder
    except ImportError:
        # Optional dependency: callers fall back to mock embeddings.
        print("Warning: sentence-transformers not available. Using mock embeddings.")
        return None
    except Exception as e:
        print(f"Error loading Qwen model: {e}")
        return None
141
+
142
+
143
def save_model_components(
    unet,
    vae,
    adapter,
    text_encoder,
    save_dir: str,
    save_format: str = "safetensors"
):
    """
    Save model components for training checkpoints
    保存模型组件用于训练检查点

    Args:
        unet: UNet model (skipped when None)
        vae: VAE model (skipped when None)
        adapter: Qwen embedding adapter (skipped when None)
        text_encoder: Qwen text encoder (not serialized here)
        save_dir: Directory to save components into (created if missing)
        save_format: "safetensors" or anything else for PyTorch .pt files
    """
    import os
    os.makedirs(save_dir, exist_ok=True)

    try:
        # text_encoder is intentionally excluded: it stays frozen upstream.
        components = (("unet", unet), ("vae", vae), ("adapter", adapter))

        if save_format == "safetensors":
            for name, model in components:
                if model is not None:
                    safetensors.torch.save_file(
                        model.state_dict(),
                        os.path.join(save_dir, f"{name}.safetensors"),
                    )
        else:  # PyTorch format
            for name, model in components:
                if model is not None:
                    torch.save(model.state_dict(), os.path.join(save_dir, f"{name}.pt"))

        print(f"Model components saved to {save_dir}")

    except Exception as e:
        print(f"Error saving model components: {e}")
201
+
202
+
203
def load_unet_with_lora(
    unet_path: str,
    unet_config_path: str,
    lora_weights_path: Optional[str] = None,
    lora_config_path: Optional[str] = None,
    device: str = "cuda",
    dtype: torch.dtype = torch.bfloat16
):
    """
    Load UNet with optional LoRA weights
    加载带有可选LoRA权重的UNet

    Args:
        unet_path: Path to base UNet safetensors file
        unet_config_path: Path to the matching UNet config JSON file
        lora_weights_path: Optional path to LoRA weights (safetensors or .pt file)
        lora_config_path: Optional path to LoRA config directory
        device: Device to load model on
        dtype: Data type for model weights

    Returns:
        UNet model with LoRA applied if both LoRA paths are given,
        or None if loading fails
    """
    try:
        from diffusers import UNet2DConditionModel
        from peft import PeftModel, LoraConfig

        # Load base UNet from the extracted safetensors + config pair
        unet = load_unet_from_safetensors(unet_path, unet_config_path, device, dtype)

        # Apply LoRA if provided (both the weights and config are required)
        if lora_weights_path and lora_config_path:
            print(f"Loading LoRA weights from {lora_weights_path}")

            # Load LoRA weights
            if lora_weights_path.endswith(".safetensors"):
                import safetensors.torch
                lora_state_dict = safetensors.torch.load_file(lora_weights_path)
            else:
                lora_state_dict = torch.load(lora_weights_path, map_location=device)

            # Load LoRA config
            lora_config = LoraConfig.from_pretrained(lora_config_path)

            # Apply LoRA to UNet (wraps it in a PEFT model)
            from peft import get_peft_model, set_peft_model_state_dict
            unet = get_peft_model(unet, lora_config)
            set_peft_model_state_dict(unet, lora_state_dict)

            print("LoRA weights applied to UNet")

        unet.to(device, dtype)
        return unet

    except Exception as e:
        # Best-effort loader: callers check for None rather than catching.
        print(f"Error loading UNet with LoRA: {e}")
        return None
270
+
271
+
272
def load_fused_unet(
    fused_unet_path: str,
    device: str = "cuda",
    dtype: torch.dtype = torch.bfloat16
):
    """
    Load UNet with fused LoRA weights
    加载融合了LoRA权重的UNet

    Args:
        fused_unet_path: Path to fused UNet model directory
        device: Device to load model on
        dtype: Data type for model weights

    Returns:
        UNet model with fused LoRA weights, or None on failure
    """
    try:
        from diffusers import UNet2DConditionModel

        model = UNet2DConditionModel.from_pretrained(fused_unet_path, torch_dtype=dtype)
        model.to(device, dtype)

        print(f"Fused UNet loaded from {fused_unet_path}")
        return model

    except Exception as e:
        # Best-effort loader: callers check for None rather than catching.
        print(f"Error loading fused UNet: {e}")
        return None
304
+
305
+
306
def load_checkpoint(checkpoint_path: str, device: str = "cuda"):
    """
    Load training checkpoint
    加载训练检查点

    Args:
        checkpoint_path: Path to checkpoint file (.safetensors or torch .pt)
        device: Device to load on

    Returns:
        Dictionary containing checkpoint data, or None on failure
    """
    try:
        # Dispatch on extension: safetensors for weight dumps, torch.load
        # for anything else (optimizer/bookkeeping state).
        if checkpoint_path.endswith(".safetensors"):
            return safetensors.torch.load_file(checkpoint_path, device=device)
        return torch.load(checkpoint_path, map_location=device)
    except Exception as e:
        print(f"Error loading checkpoint: {e}")
        return None
arch/pipeline.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Qwen-SDXL Inference Pipeline
3
+ Qwen-SDXL 推理管道 - 使用 Qwen3 嵌入模型替代 CLIP 文本编码器的 SDXL 推理管道
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import numpy as np
9
+ from PIL import Image
10
+ from typing import List, Optional, Union, Tuple
11
+
12
+ from .adapter import QwenEmbeddingAdapter
13
+ from .text_encoder import QwenTextEncoder
14
+ from .model_loader import load_qwen_model, load_unet_from_safetensors, load_vae_from_safetensors, create_scheduler
15
+
16
+
17
class QwenIllustriousInference:
    """
    Qwen-SDXL inference pipeline.

    An SDXL pipeline whose CLIP text encoders are replaced by a Qwen3
    embedding model plus a trained adapter that projects the 1024-dim Qwen
    embeddings into the dimensions the SDXL UNet expects (2048-dim sequence
    embeddings and a 1280-dim pooled embedding).

    Fixes over the previous revision:
      - repaired mojibake in the init-failure message;
      - a single negative prompt is now broadcast over a multi-prompt batch
        in :meth:`generate` (previously the CFG batch sizes could mismatch);
      - removed an unused local in :meth:`encode_prompts`.
    """

    def __init__(
        self,
        qwen_model_path: str = "models/Qwen3-Embedding-0.6B",
        unet_path: str = "models/extracted_components/waiNSFWIllustrious_v140_unet.safetensors",
        unet_config_path: str = "models/extracted_components/waiNSFWIllustrious_v140_unet_config.json",
        vae_path: str = "models/extracted_components/waiNSFWIllustrious_v140_vae.safetensors",
        vae_config_path: str = "models/extracted_components/waiNSFWIllustrious_v140_vae_config.json",
        adapter_path: Optional[str] = "/home/ubuntu/lyl/QwenIllustrious/qwen_illustrious_output/adapter/adapter.safetensors",
        lora_weights_path: Optional[str] = "/home/ubuntu/lyl/QwenIllustrious/qwen_illustrious_output/lora_weights/lora_weights.safetensors",
        lora_config_path: Optional[str] = "/home/ubuntu/lyl/QwenIllustrious/qwen_illustrious_output/lora_weights/adapter_config.json",
        use_fused_unet: bool = False,
        fused_unet_path: Optional[str] = None,
        device: str = "cuda",
        dtype: torch.dtype = torch.bfloat16,
        scheduler_type: str = "DDPM"
    ):
        self.device = device
        self.dtype = dtype
        self.vae_scale_factor = 8  # SDXL default: latents are 1/8 of image resolution

        print("🚀 初始化 Qwen-SDXL 推理管道...")

        # Frozen Qwen3 text encoder.
        print("📝 初始化 Qwen 文本编码器...")
        self.text_encoder = QwenTextEncoder(
            model_path=qwen_model_path,
            device=device,
            freeze_encoder=True
        )

        # Adapter projecting Qwen embeddings to SDXL dimensions.
        print("🔧 初始化适配器层...")
        self.adapter = QwenEmbeddingAdapter()
        self.adapter.to(device, dtype)

        # Optionally restore trained adapter weights.
        if adapter_path is not None:
            print(f"📥 加载适配器权重: {adapter_path}")
            try:
                if adapter_path.endswith(".safetensors"):
                    import safetensors.torch
                    adapter_state = safetensors.torch.load_file(adapter_path)
                else:
                    adapter_state = torch.load(adapter_path, map_location=device)
                self.adapter.load_state_dict(adapter_state)
            except Exception as e:
                print(f"⚠️ 加载适配器权重失败: {e}")

        # UNet: fused-LoRA checkpoint, base + separate LoRA weights, or plain.
        print("🏗️ 加载 UNet 模型...")
        from .model_loader import load_unet_with_lora, load_fused_unet

        if use_fused_unet and fused_unet_path:
            # UNet with LoRA weights already merged into the base parameters.
            print("📦 使用融合LoRA权重的UNet...")
            self.unet = load_fused_unet(fused_unet_path, device, dtype)
        elif lora_weights_path and lora_config_path:
            # Base UNet with separate LoRA weights applied via PEFT.
            print("🔧 加载UNet并应用LoRA权重...")
            self.unet = load_unet_with_lora(
                unet_path=unet_path,
                unet_config_path=unet_config_path,
                lora_weights_path=lora_weights_path,
                lora_config_path=lora_config_path,
                device=device,
                dtype=dtype
            )
        else:
            # Plain UNet from safetensors.
            self.unet = load_unet_from_safetensors(unet_path, unet_config_path, device, dtype)

        # VAE for latent decoding.
        print("🎨 加载 VAE 模型...")
        self.vae = load_vae_from_safetensors(vae_path, vae_config_path, device, dtype)

        # Noise scheduler.
        print(f"⏰ 创建调度器 ({scheduler_type})...")
        self.scheduler = create_scheduler(scheduler_type)

        # The pipeline is usable only when every component loaded.
        self.is_ready = all(
            component is not None
            for component in (self.text_encoder, self.adapter, self.unet, self.vae, self.scheduler)
        )

        if self.is_ready:
            print("✅ 管道初始化完成!")
        else:
            print("❌ 管道初始化失败,某些组件加载失败")

    def encode_prompts(
        self,
        prompts: List[str],
        negative_prompts: Optional[List[str]] = None,
        do_classifier_free_guidance: bool = True
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Encode prompts with Qwen3 and project them through the adapter.

        Args:
            prompts: Positive prompts, one per image.
            negative_prompts: Optional negative prompts for CFG.
            do_classifier_free_guidance: When True, negative embeddings are
                prepended along the batch dimension by the text encoder.

        Returns:
            tuple: ``(prompt_embeds, pooled_prompt_embeds)`` with shapes
            ``[B, 512, 2048]`` and ``[B, 1280]`` (B doubled under CFG).
        """
        # Raw 1024-dim Qwen embeddings.
        text_embeddings, pooled_embeddings = self.text_encoder.encode_prompts(
            prompts, negative_prompts, do_classifier_free_guidance
        )

        # Broadcast the per-prompt embedding over a 512-token sequence axis
        # (this pipeline uses 512 tokens for SDXL conditioning).
        seq_len = 512
        text_embeddings_seq = text_embeddings.unsqueeze(1).expand(-1, seq_len, -1)  # [B, 512, 1024]

        # Project to SDXL dimensions using the adapter.
        prompt_embeds = self.adapter.forward_text_embeddings(text_embeddings_seq.to(self.dtype))  # [B, 512, 2048]
        pooled_prompt_embeds = self.adapter.forward_pooled_embeddings(pooled_embeddings.to(self.dtype))  # [B, 1280]

        return prompt_embeds, pooled_prompt_embeds

    def prepare_latents(
        self,
        batch_size: int,
        height: int,
        width: int,
        generator: Optional[torch.Generator] = None
    ) -> torch.Tensor:
        """
        Prepare the initial noise latents for the denoising loop.

        Args:
            batch_size: Number of images to generate.
            height: Target image height in pixels.
            width: Target image width in pixels.
            generator: Optional RNG for reproducible noise.

        Returns:
            Noise tensor of shape ``[B, C, H/8, W/8]`` scaled by the
            scheduler's initial noise sigma.
        """
        if self.unet is None:
            # Mock latents for testing when no UNet is loaded.
            shape = (batch_size, 4, height // self.vae_scale_factor, width // self.vae_scale_factor)
            return torch.randn(shape, device=self.device, dtype=self.dtype)

        shape = (
            batch_size,
            self.unet.config.in_channels,
            height // self.vae_scale_factor,
            width // self.vae_scale_factor,
        )

        try:
            from diffusers.utils import randn_tensor
            latents = randn_tensor(shape, generator=generator, device=self.device, dtype=self.dtype)
        except ImportError:
            latents = torch.randn(shape, device=self.device, dtype=self.dtype, generator=generator)

        # Scale initial noise to the scheduler's expected magnitude.
        if self.scheduler is not None:
            latents = latents * self.scheduler.init_noise_sigma

        return latents

    def get_time_ids(
        self,
        height: int,
        width: int,
        original_size: Tuple[int, int],
        crops_coords_top_left: Tuple[int, int] = (0, 0),
        target_size: Optional[Tuple[int, int]] = None
    ) -> torch.Tensor:
        """
        Build the SDXL micro-conditioning time IDs.

        The IDs are the concatenation of original size, crop coordinates and
        target size, as expected by the SDXL UNet's ``added_cond_kwargs``.

        Returns:
            Tensor of shape ``[1, 6]`` on the pipeline device/dtype.
        """
        if target_size is None:
            target_size = (height, width)

        add_time_ids = list(original_size + crops_coords_top_left + target_size)
        add_time_ids = torch.tensor([add_time_ids], dtype=self.dtype, device=self.device)

        return add_time_ids

    @torch.no_grad()
    def generate(
        self,
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
        height: int = 1024,
        width: int = 1024,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        generator: Optional[torch.Generator] = None,
        return_type: str = "pil"
    ) -> List[Image.Image]:
        """
        Generate images with the Qwen-SDXL pipeline.

        Args:
            prompt: A prompt or list of prompts.
            negative_prompt: Optional negative prompt(s); a single string is
                broadcast across the whole batch.
            height: Output image height in pixels.
            width: Output image width in pixels.
            num_inference_steps: Number of denoising steps.
            guidance_scale: CFG scale; values > 1 enable guidance.
            generator: Optional RNG for reproducible generation.
            return_type: ``"pil"`` for PIL images, anything else returns the
                raw numpy array batch.

        Returns:
            List of generated images (empty when the pipeline is not ready).
        """
        if not self.is_ready:
            print("❌ 管道未准备就绪,无法生成图像")
            return []

        # Normalize prompt arguments to lists.
        if isinstance(prompt, str):
            prompt = [prompt]
        if isinstance(negative_prompt, str):
            negative_prompt = [negative_prompt]

        batch_size = len(prompt)
        do_classifier_free_guidance = guidance_scale > 1.0

        # Broadcast a single negative prompt across the whole batch so the
        # CFG halves of the embedding batch line up.
        if negative_prompt is not None and len(negative_prompt) == 1 and batch_size > 1:
            negative_prompt = negative_prompt * batch_size

        print(f"🎯 开始生成 {batch_size} 张图像...")
        print(f"📏 尺寸: {width}x{height}")
        print(f"🔄 推理步数: {num_inference_steps}")
        print(f"🎚️ 引导强度: {guidance_scale}")

        # 1. Encode prompts.
        print("📝 编码提示词...")
        prompt_embeds, pooled_prompt_embeds = self.encode_prompts(
            prompt, negative_prompt, do_classifier_free_guidance
        )

        # 2. Prepare timesteps.
        print("⏰ 准备时间步...")
        if self.scheduler is not None:
            self.scheduler.set_timesteps(num_inference_steps, device=self.device)
            timesteps = self.scheduler.timesteps
        else:
            timesteps = torch.linspace(1000, 0, num_inference_steps, device=self.device)

        # 3. Prepare latents.
        print("🌀 准备潜在变量...")
        latents = self.prepare_latents(batch_size, height, width, generator)

        # 4. Prepare SDXL micro-conditioning time IDs.
        original_size = (height, width)
        target_size = (height, width)
        add_time_ids = self.get_time_ids(height, width, original_size, target_size=target_size)

        if do_classifier_free_guidance:
            add_time_ids = add_time_ids.repeat(2, 1)
        add_time_ids = add_time_ids.repeat(batch_size, 1)

        # 5. Denoising loop.
        print("🔄 开始去噪过程...")
        for i, t in enumerate(timesteps):
            # Duplicate latents for the unconditional/conditional CFG halves.
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents

            if self.scheduler is not None:
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # Predict noise with the UNet.
            if self.unet is not None:
                added_cond_kwargs = {
                    "text_embeds": pooled_prompt_embeds,
                    "time_ids": add_time_ids
                }

                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
                    added_cond_kwargs=added_cond_kwargs,
                    return_dict=False,
                )[0]

                # Classifier-free guidance.
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                # Step the scheduler.
                if self.scheduler is not None:
                    latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]

            if (i + 1) % 5 == 0:
                print(f" 步骤 {i+1}/{len(timesteps)} 完成")

        # 6. Decode latents to image space.
        print("🎨 解码生成图像...")
        if self.vae is not None:
            latents = latents / self.vae.config.scaling_factor
            images = self.vae.decode(latents, return_dict=False)[0]
        else:
            # Mock image generation for testing.
            images = torch.randn(batch_size, 3, height, width, device=self.device)

        # 7. Convert [-1, 1] tensors to PIL images.
        images = (images / 2 + 0.5).clamp(0, 1)
        images = images.cpu().permute(0, 2, 3, 1).float().numpy()

        if return_type == "pil":
            images = [Image.fromarray((img * 255).astype(np.uint8)) for img in images]

        print("✅ 图像生成完成!")
        return images

    def save_adapter(self, save_path: str):
        """
        Save the adapter weights to *save_path* (safetensors or torch format,
        chosen by file extension).
        """
        try:
            if save_path.endswith(".safetensors"):
                import safetensors.torch
                safetensors.torch.save_file(self.adapter.state_dict(), save_path)
            else:
                torch.save(self.adapter.state_dict(), save_path)
            print(f"✅ 适配器权重已保存到: {save_path}")
        except Exception as e:
            print(f"❌ 保存适配器权重失败: {e}")

    def load_adapter(self, load_path: str):
        """
        Load adapter weights from *load_path* (safetensors or torch format,
        chosen by file extension).
        """
        try:
            if load_path.endswith(".safetensors"):
                import safetensors.torch
                state_dict = safetensors.torch.load_file(load_path)
            else:
                state_dict = torch.load(load_path, map_location=self.device)

            self.adapter.load_state_dict(state_dict)
            print(f"✅ 适配器权重已从 {load_path} 加载")
        except Exception as e:
            print(f"❌ 加载适配器权重失败: {e}")
arch/text_encoder.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Qwen Text Encoder
3
+ Qwen 文本编码器 - 使用 Qwen3 模型进行文本编码
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from typing import List, Optional, Union, Tuple
9
+
10
+
11
def load_qwen_model(model_path: str, device: str = "cuda"):
    """
    Load the Qwen3 embedding model via sentence-transformers.

    Args:
        model_path: Local path or hub id of the Qwen3 embedding model.
        device: Device to move the model onto.

    Returns:
        The loaded ``SentenceTransformer`` model, or ``None`` when the
        sentence-transformers package is not installed (callers then fall
        back to mock embeddings).
    """
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError:
        print("Warning: sentence-transformers not available. Using mock embeddings.")
        return None

    encoder = SentenceTransformer(model_path)
    encoder.to(device)
    return encoder
24
+
25
+
26
def encode_text_with_qwen(
    qwen_model,
    texts: List[str],
    device: str = "cuda",
    max_length: int = 512,
    use_query_mode: bool = False
) -> torch.Tensor:
    """
    Encode a batch of texts with the Qwen3 embedding model.

    Args:
        qwen_model: Loaded sentence-transformers model, or ``None`` to fall
            back to random mock embeddings (used when the package is absent).
        texts: Text strings to encode.
        device: Device to run the encoder on.
        max_length: Maximum sequence length passed to the encoder.
        use_query_mode: When True, use the "query" prompt and return a pooled
            sentence embedding; otherwise return stacked token embeddings.

    Returns:
        Embedding tensor; shape ``[batch, 1024]`` on the mock path.
    """
    if qwen_model is None:
        # Mock path for environments without sentence-transformers.
        return torch.randn(len(texts), 1024, device=device, dtype=torch.float32)

    with torch.no_grad():
        encoded = qwen_model.encode(
            texts,
            prompt_name="query" if use_query_mode else None,
            convert_to_tensor=True,
            device=device,
            max_seq_length=max_length,
            output_value="sentence_embedding" if use_query_mode else "token_embeddings",
        )

    if use_query_mode:
        return encoded
    # Token-embedding mode yields one tensor per text; stack into a batch.
    # NOTE(review): stacking assumes equal sequence lengths across texts —
    # confirm this holds for the configured encoder.
    return torch.stack(encoded, dim=0)
60
+
61
+
62
class QwenTextEncoder(nn.Module):
    """
    Wrapper around the Qwen3 embedding model for training and inference.

    Produces per-prompt text embeddings (normal mode) and pooled embeddings
    (query mode), with optional classifier-free-guidance concatenation of
    negative-prompt embeddings in front along the batch dimension.
    """

    def __init__(
        self,
        model_path: str = "models/Qwen3-Embedding-0.6B",
        device: str = "cuda",
        max_length: int = 512,
        freeze_encoder: bool = True
    ):
        super().__init__()
        self.device = device
        self.max_length = max_length
        self.freeze_encoder = freeze_encoder

        # Underlying sentence-transformers model (None when unavailable).
        self.qwen_model = load_qwen_model(model_path, device)

        # Freeze the encoder so only downstream modules train.
        if self.qwen_model is not None and self.freeze_encoder:
            for weight in self.qwen_model.parameters():
                weight.requires_grad = False

    def _encode(self, texts: List[str], use_query_mode: bool) -> torch.Tensor:
        """Encode *texts* in the requested mode using this encoder's settings."""
        return encode_text_with_qwen(
            self.qwen_model, texts, self.device,
            max_length=self.max_length, use_query_mode=use_query_mode
        )

    def encode_prompts(
        self,
        prompts: List[str],
        negative_prompts: Optional[List[str]] = None,
        do_classifier_free_guidance: bool = False
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Encode prompts (and optionally negative prompts) with Qwen3.

        Returns:
            tuple: ``(text_embeddings, pooled_embeddings)``, each
            ``[batch, 1024]``; with CFG enabled the negative embeddings are
            concatenated in front, doubling the batch dimension.
        """
        text_embeddings = self._encode(prompts, use_query_mode=False)
        pooled_embeddings = self._encode(prompts, use_query_mode=True)

        if do_classifier_free_guidance:
            # Default to empty negatives when none were supplied.
            negatives = negative_prompts if negative_prompts is not None else [""] * len(prompts)

            neg_text = self._encode(negatives, use_query_mode=False)
            neg_pooled = self._encode(negatives, use_query_mode=True)

            # Unconditional embeddings come first, matching the CFG split.
            text_embeddings = torch.cat([neg_text, text_embeddings], dim=0)
            pooled_embeddings = torch.cat([neg_pooled, pooled_embeddings], dim=0)

        return text_embeddings, pooled_embeddings

    def forward(self, prompts: List[str], negative_prompts: Optional[List[str]] = None):
        """
        Encode prompts; CFG is enabled exactly when negative prompts are given.

        Returns:
            tuple: ``(text_embeddings, pooled_embeddings)``.
        """
        return self.encode_prompts(
            prompts, negative_prompts,
            do_classifier_free_guidance=(negative_prompts is not None)
        )

    def train(self, mode: bool = True):
        """Switch train mode while keeping a frozen encoder in eval mode."""
        super().train(mode)
        if self.freeze_encoder and self.qwen_model is not None:
            self.qwen_model.eval()
        return self
arch/training.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Training Utilities for Qwen-SDXL
3
+ Qwen-SDXL 训练工具 - 包含损失函数、训练步骤等
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from typing import Dict, Any, Optional, Tuple
10
+ import math
11
+
12
+
13
class DiffusionLoss(nn.Module):
    """
    Diffusion training loss for SDXL with Qwen embeddings.

    Supports MSE / L1 / Huber element losses and optional SNR-based loss
    weighting. The ``use_v_parameterization`` flag is stored for the training
    step to select the target parameterization.
    """

    def __init__(
        self,
        noise_scheduler,
        loss_type: str = "mse",
        snr_gamma: Optional[float] = None,
        use_v_parameterization: bool = False
    ):
        super().__init__()
        self.noise_scheduler = noise_scheduler
        self.loss_type = loss_type
        self.snr_gamma = snr_gamma
        self.use_v_parameterization = use_v_parameterization

        # Element-wise losses; reduction happens in forward().
        factories = {
            "mse": lambda: nn.MSELoss(reduction="none"),
            "l1": lambda: nn.L1Loss(reduction="none"),
            "huber": lambda: nn.HuberLoss(reduction="none", delta=0.1),
        }
        if loss_type not in factories:
            raise ValueError(f"Unsupported loss type: {loss_type}")
        self.loss_fn = factories[loss_type]()

    def compute_snr(self, timesteps):
        """
        Signal-to-noise ratio at the given timesteps, used for loss weighting.

        SNR(t) = alpha_cumprod(t) / (1 - alpha_cumprod(t)).
        """
        alphas = self.noise_scheduler.alphas_cumprod.to(timesteps.device)[timesteps]
        signal_scale = alphas ** 0.5
        noise_scale = (1.0 - alphas) ** 0.5
        return (signal_scale / noise_scale) ** 2

    def forward(
        self,
        model_pred: torch.Tensor,
        target: torch.Tensor,
        timesteps: torch.Tensor,
        mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """
        Compute the (optionally masked and SNR-weighted) diffusion loss.

        Args:
            model_pred: Model prediction (noise or v-parameterization).
            target: Training target of the same shape.
            timesteps: Diffusion timesteps, one per batch element.
            mask: Optional element-wise mask applied before reduction.

        Returns:
            Scalar loss tensor.
        """
        per_element = self.loss_fn(model_pred, target)

        if mask is not None:
            per_element = per_element * mask

        # Mean over all non-batch dimensions -> one loss value per sample.
        per_sample = per_element.mean(dim=list(range(1, per_element.dim())))

        if self.snr_gamma is not None:
            snr = self.compute_snr(timesteps)
            if self.snr_gamma >= 1.0:
                # Min-SNR weighting: clip the SNR at gamma.
                weight = torch.minimum(snr, torch.full_like(snr, self.snr_gamma))
            else:
                # Power-law SNR weighting for gamma < 1.
                weight = snr ** self.snr_gamma
            per_sample = per_sample * weight

        return per_sample.mean()
98
+
99
+
100
class AdapterTrainingStep:
    """
    Training step for adapter-only training: the UNet, VAE and text encoder
    are frozen and only the Qwen->SDXL adapter receives gradients.
    """

    def __init__(
        self,
        unet,
        vae,
        text_encoder,
        adapter,
        noise_scheduler,
        loss_fn: DiffusionLoss,
        device: str = "cuda",
        dtype: torch.dtype = torch.bfloat16
    ):
        # Frozen diffusion components.
        self.unet = unet
        self.vae = vae
        self.text_encoder = text_encoder
        # The only trainable module.
        self.adapter = adapter
        self.noise_scheduler = noise_scheduler
        self.loss_fn = loss_fn
        self.device = device
        self.dtype = dtype

        # Freeze components except adapter.
        self._freeze_components()

    def _freeze_components(self):
        """Freeze all components except the adapter."""
        if self.unet is not None:
            for param in self.unet.parameters():
                param.requires_grad = False

        if self.vae is not None:
            for param in self.vae.parameters():
                param.requires_grad = False

        # Text encoder is already frozen in QwenTextEncoder.
        # Only adapter parameters should be trainable.
        for param in self.adapter.parameters():
            param.requires_grad = True

    def prepare_inputs(
        self,
        images: torch.Tensor,
        prompts: list,
        negative_prompts: Optional[list] = None
    ) -> Dict[str, torch.Tensor]:
        """
        Prepare one training batch: VAE-encode images, add scheduler noise at
        random timesteps, and project Qwen text embeddings through the adapter.

        Args:
            images: Image batch ``[B, C, H, W]`` — presumably normalized to
                the VAE's expected range; TODO confirm against the data loader.
            prompts: Positive text prompts, one per image.
            negative_prompts: Unused under CFG-free training (passed through).

        Returns:
            Dict with noisy latents, timesteps, projected conditioning
            tensors, SDXL time IDs, and the sampled noise (the default target).
        """
        batch_size = images.shape[0]

        # Encode images to latents (frozen VAE, no gradients needed).
        with torch.no_grad():
            latents = self.vae.encode(images.to(self.dtype)).latent_dist.sample()
            latents = latents * self.vae.config.scaling_factor

        # Add noise to latents at uniformly sampled timesteps.
        noise = torch.randn_like(latents)
        timesteps = torch.randint(
            0, self.noise_scheduler.config.num_train_timesteps,
            (batch_size,), device=self.device
        ).long()

        noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps)

        # Encode text (no CFG during training).
        text_embeddings, pooled_embeddings = self.text_encoder.encode_prompts(
            prompts, negative_prompts, do_classifier_free_guidance=False
        )

        # Broadcast the per-prompt embedding over a token-sequence axis and
        # project through the adapter.
        # NOTE(review): seq_len is 77 here but the inference pipeline uses
        # 512 — confirm this train/inference mismatch is intentional.
        seq_len = 77
        text_embeddings_seq = text_embeddings.unsqueeze(1).expand(-1, seq_len, -1)

        encoder_hidden_states = self.adapter.forward_text_embeddings(text_embeddings_seq.to(self.dtype))
        pooled_prompt_embeds = self.adapter.forward_pooled_embeddings(pooled_embeddings.to(self.dtype))

        # SDXL micro-conditioning time IDs (simplified for training: the
        # original and target sizes are the input size, no cropping).
        height, width = images.shape[-2:]
        original_size = (height, width)
        target_size = (height, width)
        crops_coords_top_left = (0, 0)

        add_time_ids = list(original_size + crops_coords_top_left + target_size)
        add_time_ids = torch.tensor([add_time_ids] * batch_size, dtype=self.dtype, device=self.device)

        return {
            "noisy_latents": noisy_latents,
            "timesteps": timesteps,
            "encoder_hidden_states": encoder_hidden_states,
            "pooled_prompt_embeds": pooled_prompt_embeds,
            "add_time_ids": add_time_ids,
            "noise": noise
        }

    def training_step(
        self,
        images: torch.Tensor,
        prompts: list,
        negative_prompts: Optional[list] = None
    ) -> Dict[str, Any]:
        """
        Execute one training step: UNet forward pass on a noisy batch and
        diffusion loss against the noise (or v-parameterization) target.

        Returns:
            Dict with the scalar loss plus detached prediction/target tensors
            and the sampled timesteps, for logging.
        """
        # Prepare inputs.
        inputs = self.prepare_inputs(images, prompts, negative_prompts)

        # Forward pass through the UNet with SDXL added conditioning.
        added_cond_kwargs = {
            "text_embeds": inputs["pooled_prompt_embeds"],
            "time_ids": inputs["add_time_ids"]
        }

        model_pred = self.unet(
            inputs["noisy_latents"],
            inputs["timesteps"],
            encoder_hidden_states=inputs["encoder_hidden_states"],
            added_cond_kwargs=added_cond_kwargs,
            return_dict=False,
        )[0]

        # Build the training target.
        if self.loss_fn.use_v_parameterization:
            # v-parameterization target: v = sqrt(a)*noise - sqrt(1-a)*x_t.
            alphas_cumprod = self.noise_scheduler.alphas_cumprod[inputs["timesteps"]]
            sqrt_alphas_cumprod = alphas_cumprod**0.5
            sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5

            target = sqrt_alphas_cumprod.view(-1, 1, 1, 1) * inputs["noise"] - \
                sqrt_one_minus_alphas_cumprod.view(-1, 1, 1, 1) * inputs["noisy_latents"]
        else:
            # Standard epsilon (noise) prediction.
            target = inputs["noise"]

        loss = self.loss_fn(model_pred, target, inputs["timesteps"])

        return {
            "loss": loss,
            "model_pred": model_pred.detach(),
            "target": target.detach(),
            "timesteps": inputs["timesteps"]
        }
248
+
249
+
250
def get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps: int,
    num_training_steps: int,
    num_cycles: float = 0.5,
    last_epoch: int = -1,
):
    """
    Build a LambdaLR schedule: linear warmup followed by cosine decay.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        num_warmup_steps: Steps of linear warmup from 0 to the base LR.
        num_training_steps: Total number of training steps.
        num_cycles: Number of cosine half-cycles over the decay phase.
        last_epoch: Index of the last epoch when resuming training.

    Returns:
        A ``torch.optim.lr_scheduler.LambdaLR`` instance.
    """
    from torch.optim.lr_scheduler import LambdaLR

    def lr_lambda(step):
        if step < num_warmup_steps:
            # Linear warmup; max(1, ...) guards against zero warmup steps.
            return step / max(1, num_warmup_steps)
        # Fraction of the decay phase completed, in [0, 1].
        progress = (step - num_warmup_steps) / max(1, num_training_steps - num_warmup_steps)
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * num_cycles * 2.0 * progress)))

    return LambdaLR(optimizer, lr_lambda, last_epoch)
270
+
271
+
272
class EMAModel:
    """
    Exponential moving average (EMA) of a model's trainable parameters.

    Keeps a shadow copy updated as
    ``shadow = decay * shadow + (1 - decay) * param`` and can temporarily
    swap the shadow weights into the model via ``apply_shadow`` /
    ``restore``.
    """

    def __init__(self, model, decay: float = 0.9999):
        self.model = model
        self.decay = decay
        # Snapshot the initial trainable parameters as the EMA state.
        self.shadow = {
            name: param.data.clone()
            for name, param in model.named_parameters()
            if param.requires_grad
        }
        self.backup = {}

    def update(self):
        """Blend the current trainable parameters into the shadow copy."""
        d = self.decay
        for name, param in self.model.named_parameters():
            if param.requires_grad and name in self.shadow:
                self.shadow[name] = d * self.shadow[name] + (1 - d) * param.data

    def apply_shadow(self):
        """Swap EMA weights into the model, backing up the live weights."""
        for name, param in self.model.named_parameters():
            if param.requires_grad and name in self.shadow:
                self.backup[name] = param.data.clone()
                param.data = self.shadow[name]

    def restore(self):
        """Undo ``apply_shadow`` by putting the backed-up weights back."""
        for name, param in self.model.named_parameters():
            if param.requires_grad and name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
diffusers/.github/PULL_REQUEST_TEMPLATE.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # What does this PR do?
2
+
3
+ <!--
4
+ Congratulations! You've made it this far! You're not quite done yet though.
5
+
6
+ Once merged, your PR is going to appear in the release notes with the title you set, so make sure it's a great title that fully reflects the extent of your awesome contribution.
7
+
8
+ Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change.
9
+
10
+ Once you're done, someone will review your PR shortly (see the section "Who can review?" below to tag some potential reviewers). They may suggest changes to make the code even better. If no one reviewed your PR after a week has passed, don't hesitate to post a new comment @-mentioning the same persons---sometimes notifications get lost.
11
+ -->
12
+
13
+ <!-- Remove if not applicable -->
14
+
15
+ Fixes # (issue)
16
+
17
+
18
+ ## Before submitting
19
+ - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
20
+ - [ ] Did you read the [contributor guideline](https://github.com/huggingface/diffusers/blob/main/CONTRIBUTING.md)?
21
+ - [ ] Did you read our [philosophy doc](https://github.com/huggingface/diffusers/blob/main/PHILOSOPHY.md) (important for complex PRs)?
22
+ - [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63)? Please add a link to it if that's the case.
23
+ - [ ] Did you make sure to update the documentation with your changes? Here are the
24
+ [documentation guidelines](https://github.com/huggingface/diffusers/tree/main/docs), and
25
+ [here are tips on formatting docstrings](https://github.com/huggingface/diffusers/tree/main/docs#writing-source-documentation).
26
+ - [ ] Did you write any new necessary tests?
27
+
28
+
29
+ ## Who can review?
30
+
31
+ Anyone in the community is free to review the PR once the tests have passed. Feel free to tag
32
+ members/contributors who may be interested in your PR.
33
+
34
+ <!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @.
35
+
36
+ If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.
37
+ Please tag fewer than 3 people.
38
+
39
+ Core library:
40
+
41
+ - Schedulers: @yiyixuxu
42
+ - Pipelines and pipeline callbacks: @yiyixuxu and @asomoza
43
+ - Training examples: @sayakpaul
44
+ - Docs: @stevhliu and @sayakpaul
45
+ - JAX and MPS: @pcuenca
46
+ - Audio: @sanchit-gandhi
47
+ - General functionalities: @sayakpaul @yiyixuxu @DN6
48
+
49
+ Integrations:
50
+
51
+ - deepspeed: HF Trainer/Accelerate: @SunMarc
52
+ - PEFT: @sayakpaul @BenjaminBossan
53
+
54
+ HF projects:
55
+
56
+ - accelerate: [different repo](https://github.com/huggingface/accelerate)
57
+ - datasets: [different repo](https://github.com/huggingface/datasets)
58
+ - transformers: [different repo](https://github.com/huggingface/transformers)
59
+ - safetensors: [different repo](https://github.com/huggingface/safetensors)
60
+
61
+ -->
diffusers/docs/README.md ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!---
2
+ Copyright 2024- The HuggingFace Team. All rights reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ -->
16
+
17
+ # Generating the documentation
18
+
19
+ To generate the documentation, you first have to build it. Several packages are necessary to build the doc,
20
+ you can install them with the following command, at the root of the code repository:
21
+
22
+ ```bash
23
+ pip install -e ".[docs]"
24
+ ```
25
+
26
+ Then you need to install our open source documentation builder tool:
27
+
28
+ ```bash
29
+ pip install git+https://github.com/huggingface/doc-builder
30
+ ```
31
+
32
+ ---
33
+ **NOTE**
34
+
35
+ You only need to generate the documentation to inspect it locally (if you're planning changes and want to
36
+ check how they look before committing for instance). You don't have to commit the built documentation.
37
+
38
+ ---
39
+
40
+ ## Previewing the documentation
41
+
42
+ To preview the docs, first install the `watchdog` module with:
43
+
44
+ ```bash
45
+ pip install watchdog
46
+ ```
47
+
48
+ Then run the following command:
49
+
50
+ ```bash
51
+ doc-builder preview {package_name} {path_to_docs}
52
+ ```
53
+
54
+ For example:
55
+
56
+ ```bash
57
+ doc-builder preview diffusers docs/source/en
58
+ ```
59
+
60
+ The docs will be viewable at [http://localhost:3000](http://localhost:3000). You can also preview the docs once you have opened a PR. You will see a bot add a comment to a link where the documentation with your changes lives.
61
+
62
+ ---
63
+ **NOTE**
64
+
65
+ The `preview` command only works with existing doc files. When you add a completely new file, you need to update `_toctree.yml` & restart `preview` command (`ctrl-c` to stop it & call `doc-builder preview ...` again).
66
+
67
+ ---
68
+
69
+ ## Adding a new element to the navigation bar
70
+
71
+ Accepted files are Markdown (.md).
72
+
73
+ Create a file with its extension and put it in the source directory. You can then link it to the toc-tree by putting
74
+ the filename without the extension in the [`_toctree.yml`](https://github.com/huggingface/diffusers/blob/main/docs/source/en/_toctree.yml) file.
75
+
76
+ ## Renaming section headers and moving sections
77
+
78
+ It helps to keep the old links working when renaming the section header and/or moving sections from one document to another. This is because the old links are likely to be used in Issues, Forums, and Social media and it'd make for a much more superior user experience if users reading those months later could still easily navigate to the originally intended information.
79
+
80
+ Therefore, we simply keep a little map of moved sections at the end of the document where the original section was. The key is to preserve the original anchor.
81
+
82
+ So if you renamed a section from: "Section A" to "Section B", then you can add at the end of the file:
83
+
84
+ ```md
85
+ Sections that were moved:
86
+
87
+ [ <a href="#section-b">Section A</a><a id="section-a"></a> ]
88
+ ```
89
+ and of course, if you moved it to another file, then:
90
+
91
+ ```md
92
+ Sections that were moved:
93
+
94
+ [ <a href="../new-file#section-b">Section A</a><a id="section-a"></a> ]
95
+ ```
96
+
97
+ Use the relative style to link to the new file so that the versioned docs continue to work.
98
+
99
+ For an example of a rich moved section set please see the very end of [the transformers Trainer doc](https://github.com/huggingface/transformers/blob/main/docs/source/en/main_classes/trainer.md).
100
+
101
+
102
+ ## Writing Documentation - Specification
103
+
104
+ The `huggingface/diffusers` documentation follows the
105
+ [Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style for docstrings,
106
+ although we can write them directly in Markdown.
107
+
108
+ ### Adding a new tutorial
109
+
110
+ Adding a new tutorial or section is done in two steps:
111
+
112
+ - Add a new Markdown (.md) file under `docs/source/<languageCode>`.
113
+ - Link that file in `docs/source/<languageCode>/_toctree.yml` on the correct toc-tree.
114
+
115
+ Make sure to put your new file under the proper section. It's unlikely to go in the first section (*Get Started*), so
116
+ depending on the intended targets (beginners, more advanced users, or researchers) it should go in sections two, three, or four.
117
+
118
+ ### Adding a new pipeline/scheduler
119
+
120
+ When adding a new pipeline:
121
+
122
+ - Create a file `xxx.md` under `docs/source/<languageCode>/api/pipelines` (don't hesitate to copy an existing file as template).
123
+ - Link that file in (*Diffusers Summary*) section in `docs/source/api/pipelines/overview.md`, along with the link to the paper, and a colab notebook (if available).
124
+ - Write a short overview of the diffusion model:
125
+ - Overview with paper & authors
126
+ - Paper abstract
127
+ - Tips and tricks and how to use it best
128
+ - Possibly an end-to-end example of how to use it
129
+ - Add all the pipeline classes that should be linked in the diffusion model. These classes should be added using our Markdown syntax. By default as follows:
130
+
131
+ ```
132
+ [[autodoc]] XXXPipeline
133
+ - all
134
+ - __call__
135
+ ```
136
+
137
+ This will include every public method of the pipeline that is documented, as well as the `__call__` method that is not documented by default. If you just want to add additional methods that are not documented, you can put the list of all methods to add in a list that contains `all`.
138
+
139
+ ```
140
+ [[autodoc]] XXXPipeline
141
+ - all
142
+ - __call__
143
+ - enable_attention_slicing
144
+ - disable_attention_slicing
145
+ - enable_xformers_memory_efficient_attention
146
+ - disable_xformers_memory_efficient_attention
147
+ ```
148
+
149
+ You can follow the same process to create a new scheduler under the `docs/source/<languageCode>/api/schedulers` folder.
150
+
151
+ ### Writing source documentation
152
+
153
+ Values that should be put in `code` should either be surrounded by backticks: \`like so\`. Note that argument names
154
+ and objects like True, None, or any strings should usually be put in `code`.
155
+
156
+ When mentioning a class, function, or method, it is recommended to use our syntax for internal links so that our tool
157
+ adds a link to its documentation with this syntax: \[\`XXXClass\`\] or \[\`function\`\]. This requires the class or
158
+ function to be in the main package.
159
+
160
+ If you want to create a link to some internal class or function, you need to
161
+ provide its path. For instance: \[\`pipelines.ImagePipelineOutput\`\]. This will be converted into a link with
162
+ `pipelines.ImagePipelineOutput` in the description. To get rid of the path and only keep the name of the object you are
163
+ linking to in the description, add a ~: \[\`~pipelines.ImagePipelineOutput\`\] will generate a link with `ImagePipelineOutput` in the description.
164
+
165
+ The same works for methods so you can either use \[\`XXXClass.method\`\] or \[\`~XXXClass.method\`\].
166
+
167
+ #### Defining arguments in a method
168
+
169
+ Arguments should be defined with the `Args:` (or `Arguments:` or `Parameters:`) prefix, followed by a line return and
170
+ an indentation. The argument should be followed by its type, with its shape if it is a tensor, a colon, and its
171
+ description:
172
+
173
+ ```
174
+ Args:
175
+ n_layers (`int`): The number of layers of the model.
176
+ ```
177
+
178
+ If the description is too long to fit in one line, another indentation is necessary before writing the description
179
+ after the argument.
180
+
181
+ Here's an example showcasing everything so far:
182
+
183
+ ```
184
+ Args:
185
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
186
+ Indices of input sequence tokens in the vocabulary.
187
+
188
+ Indices can be obtained using [`AlbertTokenizer`]. See [`~PreTrainedTokenizer.encode`] and
189
+ [`~PreTrainedTokenizer.__call__`] for details.
190
+
191
+ [What are input IDs?](../glossary#input-ids)
192
+ ```
193
+
194
+ For optional arguments or arguments with defaults we follow the following syntax: imagine we have a function with the
195
+ following signature:
196
+
197
+ ```py
198
+ def my_function(x: str=None, a: float=3.14):
199
+ ```
200
+
201
+ then its documentation should look like this:
202
+
203
+ ```
204
+ Args:
205
+ x (`str`, *optional*):
206
+ This argument controls ...
207
+ a (`float`, *optional*, defaults to `3.14`):
208
+ This argument is used to ...
209
+ ```
210
+
211
+ Note that we always omit the "defaults to \`None\`" when None is the default for any argument. Also note that even
212
+ if the first line describing your argument type and its default gets long, you can't break it on several lines. You can
213
+ however write as many lines as you want in the indented description (see the example above with `input_ids`).
214
+
215
+ #### Writing a multi-line code block
216
+
217
+ Multi-line code blocks can be useful for displaying examples. They are done between two lines of three backticks as usual in Markdown:
218
+
219
+
220
+ ````
221
+ ```
222
+ # first line of code
223
+ # second line
224
+ # etc
225
+ ```
226
+ ````
227
+
228
+ #### Writing a return block
229
+
230
+ The return block should be introduced with the `Returns:` prefix, followed by a line return and an indentation.
231
+ The first line should be the type of the return, followed by a line return. No need to indent further for the elements
232
+ building the return.
233
+
234
+ Here's an example of a single value return:
235
+
236
+ ```
237
+ Returns:
238
+ `List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
239
+ ```
240
+
241
+ Here's an example of a tuple return, comprising several objects:
242
+
243
+ ```
244
+ Returns:
245
+ `tuple(torch.Tensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs:
246
+ - **loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.Tensor` of shape `(1,)` --
247
+ Total loss is the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
248
+ - **prediction_scores** (`torch.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`) --
249
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
250
+ ```
251
+
252
+ #### Adding an image
253
+
254
+ Due to the rapidly growing repository, it is important to make sure that no files that would significantly weigh down the repository are added. This includes images, videos, and other non-text files. We prefer to leverage a hf.co hosted `dataset` like
255
+ the ones hosted on [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) in which to place these files and reference
256
+ them by URL. We recommend putting them in the following dataset: [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images).
257
+ If an external contribution, feel free to add the images to your PR and ask a Hugging Face member to migrate your images
258
+ to this dataset.
259
+
260
+ ## Styling the docstring
261
+
262
+ We have an automatic script running with the `make style` command that will make sure that:
263
+ - the docstrings fully take advantage of the line width
264
+ - all code examples are formatted using black, like the code of the Transformers library
265
+
266
+ This script may have some weird failures if you made a syntax mistake or if you uncover a bug. Therefore, it's
267
+ recommended to commit your changes before running `make style`, so you can revert the changes done by that script
268
+ easily.
diffusers/docs/TRANSLATING.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!--Copyright 2025 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+ -->
12
+
13
+ ### Translating the Diffusers documentation into your language
14
+
15
+ As part of our mission to democratize machine learning, we'd love to make the Diffusers library available in many more languages! Follow the steps below if you want to help translate the documentation into your language 🙏.
16
+
17
+ **🗞️ Open an issue**
18
+
19
+ To get started, navigate to the [Issues](https://github.com/huggingface/diffusers/issues) page of this repo and check if anyone else has opened an issue for your language. If not, open a new issue by selecting the "🌐 Translating a New Language?" from the "New issue" button.
20
+
21
+ Once an issue exists, post a comment to indicate which chapters you'd like to work on, and we'll add your name to the list.
22
+
23
+
24
+ **🍴 Fork the repository**
25
+
26
+ First, you'll need to [fork the Diffusers repo](https://docs.github.com/en/get-started/quickstart/fork-a-repo). You can do this by clicking on the **Fork** button on the top-right corner of this repo's page.
27
+
28
+ Once you've forked the repo, you'll want to get the files on your local machine for editing. You can do that by cloning the fork with Git as follows:
29
+
30
+ ```bash
31
+ git clone https://github.com/<YOUR-USERNAME>/diffusers.git
32
+ ```
33
+
34
+ **📋 Copy-paste the English version with a new language code**
35
+
36
+ The documentation files are in one leading directory:
37
+
38
+ - [`docs/source`](https://github.com/huggingface/diffusers/tree/main/docs/source): All the documentation materials are organized here by language.
39
+
40
+ You'll only need to copy the files in the [`docs/source/en`](https://github.com/huggingface/diffusers/tree/main/docs/source/en) directory, so first navigate to your fork of the repo and run the following:
41
+
42
+ ```bash
43
+ cd ~/path/to/diffusers/docs
44
+ cp -r source/en source/<LANG-ID>
45
+ ```
46
+
47
+ Here, `<LANG-ID>` should be one of the ISO 639-1 or ISO 639-2 language codes -- see [here](https://www.loc.gov/standards/iso639-2/php/code_list.php) for a handy table.
48
+
49
+ **✍️ Start translating**
50
+
51
+ The fun part comes - translating the text!
52
+
53
+ The first thing we recommend is translating the part of the `_toctree.yml` file that corresponds to your doc chapter. This file is used to render the table of contents on the website.
54
+
55
+ > 🙋 If the `_toctree.yml` file doesn't yet exist for your language, you can create one by copy-pasting from the English version and deleting the sections unrelated to your chapter. Just make sure it exists in the `docs/source/<LANG-ID>/` directory!
56
+
57
+ The fields you should add are `local` (with the name of the file containing the translation; e.g. `autoclass_tutorial`), and `title` (with the title of the doc in your language; e.g. `Load pretrained instances with an AutoClass`) -- as a reference, here is the `_toctree.yml` for [English](https://github.com/huggingface/diffusers/blob/main/docs/source/en/_toctree.yml):
58
+
59
+ ```yaml
60
+ - sections:
61
+ - local: pipeline_tutorial # Do not change this! Use the same name for your .md file
62
+ title: Pipelines for inference # Translate this!
63
+ ...
64
+ title: Tutorials # Translate this!
65
+ ```
66
+
67
+ Once you have translated the `_toctree.yml` file, you can start translating the [MDX](https://mdxjs.com/) files associated with your docs chapter.
68
+
69
+ > 🙋 If you'd like others to help you with the translation, you should [open an issue](https://github.com/huggingface/diffusers/issues) and tag @patrickvonplaten.
diffusers/scripts/conversion_ldm_uncond.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ import torch
4
+ import yaml
5
+
6
+ from diffusers import DDIMScheduler, LDMPipeline, UNetLDMModel, VQModel
7
+
8
+
9
def convert_ldm_original(checkpoint_path, config_path, output_path):
    """Convert an original unconditional latent-diffusion checkpoint to a diffusers `LDMPipeline`.

    Args:
        checkpoint_path (`str`): Path to the original `.ckpt`/`.pt` checkpoint (expects a "model" key).
        config_path (`str`): Path to the original YAML training configuration file.
        output_path (`str`): Directory where the converted pipeline is saved.
    """
    # Bug fix: `yaml.safe_load` expects YAML text or a stream, not a filesystem path.
    # Passing the path string would parse the *path itself* as a YAML scalar and the
    # subsequent `config["model"]` lookups would fail.
    with open(config_path) as f:
        config = yaml.safe_load(f)
    state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
    keys = list(state_dict.keys())

    # extract state_dict for VQVAE (keys prefixed with "first_stage_model.")
    first_stage_dict = {}
    first_stage_key = "first_stage_model."
    for key in keys:
        if key.startswith(first_stage_key):
            first_stage_dict[key.replace(first_stage_key, "")] = state_dict[key]

    # extract state_dict for UNetLDM (keys prefixed with "model.diffusion_model.")
    unet_state_dict = {}
    unet_key = "model.diffusion_model."
    for key in keys:
        if key.startswith(unet_key):
            unet_state_dict[key.replace(unet_key, "")] = state_dict[key]

    vqvae_init_args = config["model"]["params"]["first_stage_config"]["params"]
    unet_init_args = config["model"]["params"]["unet_config"]["params"]

    vqvae = VQModel(**vqvae_init_args).eval()
    vqvae.load_state_dict(first_stage_dict)

    unet = UNetLDMModel(**unet_init_args).eval()
    unet.load_state_dict(unet_state_dict)

    # Scheduler hyperparameters are taken directly from the original training config.
    noise_scheduler = DDIMScheduler(
        timesteps=config["model"]["params"]["timesteps"],
        beta_schedule="scaled_linear",
        beta_start=config["model"]["params"]["linear_start"],
        beta_end=config["model"]["params"]["linear_end"],
        clip_sample=False,
    )

    pipeline = LDMPipeline(vqvae, unet, noise_scheduler)
    pipeline.save_pretrained(output_path)
47
+
48
+
49
if __name__ == "__main__":
    # CLI entry point: three required path arguments, then run the conversion.
    arg_parser = argparse.ArgumentParser()
    for flag in ("--checkpoint_path", "--config_path", "--output_path"):
        arg_parser.add_argument(flag, type=str, required=True)
    cli_args = arg_parser.parse_args()

    convert_ldm_original(cli_args.checkpoint_path, cli_args.config_path, cli_args.output_path)
diffusers/scripts/convert_animatediff_motion_lora_to_diffusers.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+
4
+ import torch
5
+ from huggingface_hub import create_repo, upload_folder
6
+ from safetensors.torch import load_file, save_file
7
+
8
+
9
def convert_motion_module(original_state_dict):
    """Rename AnimateDiff motion-module keys to the diffusers naming scheme.

    Positional-encoding entries (keys containing "pos_encoder") are dropped;
    every other key is rewritten through a fixed sequence of substring
    substitutions. Values are passed through unchanged.
    """
    renames = (
        (".norms.0", ".norm1"),
        (".norms.1", ".norm2"),
        (".ff_norm", ".norm3"),
        (".attention_blocks.0", ".attn1"),
        (".attention_blocks.1", ".attn2"),
        (".temporal_transformer", ""),
    )

    converted_state_dict = {}
    for name, value in original_state_dict.items():
        # Positional encodings are not used by the diffusers implementation.
        if "pos_encoder" in name:
            continue
        for old, new in renames:
            name = name.replace(old, new)
        converted_state_dict[name] = value

    return converted_state_dict
26
+
27
+
28
def get_args():
    """Build and parse the command-line arguments for this conversion script."""
    argument_parser = argparse.ArgumentParser()
    argument_parser.add_argument("--ckpt_path", type=str, required=True, help="Path to checkpoint")
    argument_parser.add_argument("--output_path", type=str, required=True, help="Path to output directory")
    argument_parser.add_argument(
        "--push_to_hub",
        action="store_true",
        default=False,
        help="Whether to push the converted model to the HF or not",
    )
    return argument_parser.parse_args()
40
+
41
+
42
if __name__ == "__main__":
    args = get_args()

    # Load the raw checkpoint; safetensors files get the dedicated loader.
    if args.ckpt_path.endswith(".safetensors"):
        state_dict = load_file(args.ckpt_path)
    else:
        state_dict = torch.load(args.ckpt_path, map_location="cpu")

    # Some checkpoints nest the actual weights under a "state_dict" key.
    if "state_dict" in state_dict:
        state_dict = state_dict["state_dict"]

    conv_state_dict = convert_motion_module(state_dict)

    # Keep tensor entries only, namespaced under "unet." for diffusers.
    output_dict = {
        f"unet.{module_name}": params
        for module_name, params in conv_state_dict.items()
        if type(params) is torch.Tensor
    }

    os.makedirs(args.output_path, exist_ok=True)
    save_file(output_dict, os.path.join(args.output_path, "diffusion_pytorch_model.safetensors"))

    if args.push_to_hub:
        repo_id = create_repo(args.output_path, exist_ok=True).repo_id
        upload_folder(repo_id=repo_id, folder_path=args.output_path, repo_type="model")
diffusers/scripts/convert_cogvideox_to_diffusers.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from typing import Any, Dict
3
+
4
+ import torch
5
+ from transformers import T5EncoderModel, T5Tokenizer
6
+
7
+ from diffusers import (
8
+ AutoencoderKLCogVideoX,
9
+ CogVideoXDDIMScheduler,
10
+ CogVideoXImageToVideoPipeline,
11
+ CogVideoXPipeline,
12
+ CogVideoXTransformer3DModel,
13
+ )
14
+
15
+
16
def reassign_query_key_value_inplace(key: str, state_dict: Dict[str, Any]):
    """Split a fused QKV projection entry into separate to_q/to_k/to_v entries (in place)."""
    fused = state_dict.pop(key)
    # The original checkpoint stacks the Q, K and V projections along dim 0.
    query, key_proj, value = torch.chunk(fused, chunks=3, dim=0)
    state_dict[key.replace("query_key_value", "to_q")] = query
    state_dict[key.replace("query_key_value", "to_k")] = key_proj
    state_dict[key.replace("query_key_value", "to_v")] = value
25
+
26
+
27
def reassign_query_key_layernorm_inplace(key: str, state_dict: Dict[str, Any]):
    """Relocate a query/key layernorm parameter under attn1.norm_q / attn1.norm_k (in place).

    Callers only dispatch keys from "query_layernorm_list" or "key_layernorm_list",
    so exactly one of the two branches below is taken.
    """
    layer_id, weight_or_bias = key.split(".")[-2:]

    if "query" in key:
        target = f"transformer_blocks.{layer_id}.attn1.norm_q.{weight_or_bias}"
    elif "key" in key:
        target = f"transformer_blocks.{layer_id}.attn1.norm_k.{weight_or_bias}"

    state_dict[target] = state_dict.pop(key)
36
+
37
+
38
def reassign_adaln_norm_inplace(key: str, state_dict: Dict[str, Any]):
    """Split a fused adaLN modulation tensor into norm1/norm2 linear params (in place).

    The original checkpoint packs 12 equal chunks along dim 0: chunks 0-2 and 6-8
    belong to norm1.linear, chunks 3-5 and 9-11 to norm2.linear.
    """
    layer_id, _, weight_or_bias = key.split(".")[-3:]

    chunks = state_dict.pop(key).chunk(12, dim=0)
    state_dict[f"transformer_blocks.{layer_id}.norm1.linear.{weight_or_bias}"] = torch.cat(
        chunks[0:3] + chunks[6:9]
    )
    state_dict[f"transformer_blocks.{layer_id}.norm2.linear.{weight_or_bias}"] = torch.cat(
        chunks[3:6] + chunks[9:12]
    )
52
+
53
+
54
def remove_keys_inplace(key: str, state_dict: Dict[str, Any]):
    """Drop `key` from `state_dict`; used for entries diffusers does not need."""
    del state_dict[key]
56
+
57
+
58
def replace_up_keys_inplace(key: str, state_dict: Dict[str, Any]):
    """Rename decoder "up.<i>" keys to "up_blocks.<3-i>" (in place).

    The original VAE numbers its four up blocks in the opposite order from
    diffusers, so index i becomes 4 - 1 - i.
    """
    parts = key.split(".")
    original_index = int(parts[2])

    parts[1] = "up_blocks"
    parts[2] = str(4 - 1 - original_index)

    state_dict[".".join(parts)] = state_dict.pop(key)
68
+
69
+
70
# Plain substring renames applied (in insertion order) to every transformer key.
# NOTE: order matters — "transformer.final_layernorm" must match before the bare
# "transformer" -> "transformer_blocks" rename consumes it.
TRANSFORMER_KEYS_RENAME_DICT = {
    "transformer.final_layernorm": "norm_final",
    "transformer": "transformer_blocks",
    "attention": "attn1",
    "mlp": "ff.net",
    "dense_h_to_4h": "0.proj",
    "dense_4h_to_h": "2",
    ".layers": "",
    "dense": "to_out.0",
    "input_layernorm": "norm1.norm",
    "post_attn1_layernorm": "norm2.norm",
    "time_embed.0": "time_embedding.linear_1",
    "time_embed.2": "time_embedding.linear_2",
    "ofs_embed.0": "ofs_embedding.linear_1",
    "ofs_embed.2": "ofs_embedding.linear_2",
    "mixins.patch_embed": "patch_embed",
    "mixins.final_layer.norm_final": "norm_out.norm",
    "mixins.final_layer.linear": "proj_out",
    "mixins.final_layer.adaLN_modulation.1": "norm_out.linear",
    "mixins.pos_embed.pos_embedding": "patch_embed.pos_embedding",  # Specific to CogVideoX-5b-I2V
}

# Transformer keys that need structural rewrites rather than a 1:1 rename;
# each handler mutates the state dict in place (splits, moves, or drops keys).
TRANSFORMER_SPECIAL_KEYS_REMAP = {
    "query_key_value": reassign_query_key_value_inplace,
    "query_layernorm_list": reassign_query_key_layernorm_inplace,
    "key_layernorm_list": reassign_query_key_layernorm_inplace,
    "adaln_layer.adaLN_modulations": reassign_adaln_norm_inplace,
    "embed_tokens": remove_keys_inplace,
    "freqs_sin": remove_keys_inplace,
    "freqs_cos": remove_keys_inplace,
    "position_embedding": remove_keys_inplace,
}

# Plain substring renames mapping original VAE keys to the diffusers layout.
VAE_KEYS_RENAME_DICT = {
    "block.": "resnets.",
    "down.": "down_blocks.",
    "downsample": "downsamplers.0",
    "upsample": "upsamplers.0",
    "nin_shortcut": "conv_shortcut",
    "encoder.mid.block_1": "encoder.mid_block.resnets.0",
    "encoder.mid.block_2": "encoder.mid_block.resnets.1",
    "decoder.mid.block_1": "decoder.mid_block.resnets.0",
    "decoder.mid.block_2": "decoder.mid_block.resnets.1",
}

# VAE keys needing special handling: training-only "loss" weights are dropped,
# and decoder "up." blocks are re-indexed (diffusers numbers them in reverse).
VAE_SPECIAL_KEYS_REMAP = {
    "loss": remove_keys_inplace,
    "up.": replace_up_keys_inplace,
}

# Maximum T5 token sequence length used for CogVideoX prompts.
TOKENIZER_MAX_LENGTH = 226
121
+
122
+
123
def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
    """Unwrap the actual weights from common checkpoint wrapper layers.

    NOTE(review): each membership test is against the *top-level* dict while the
    lookup descends into the already-unwrapped one — preserved exactly as in the
    original script, but it looks suspicious; confirm against real checkpoints.
    """
    state_dict = saved_dict
    for wrapper in ("model", "module", "state_dict"):
        if wrapper in saved_dict:
            state_dict = state_dict[wrapper]
    return state_dict
132
+
133
+
134
def update_state_dict_inplace(state_dict: Dict[str, Any], old_key: str, new_key: str) -> Dict[str, Any]:
    """Rename `old_key` to `new_key` inside `state_dict` (mutates; returns None)."""
    value = state_dict.pop(old_key)
    state_dict[new_key] = value
136
+
137
+
138
def convert_transformer(
    ckpt_path: str,
    num_layers: int,
    num_attention_heads: int,
    use_rotary_positional_embeddings: bool,
    i2v: bool,
    dtype: torch.dtype,
    init_kwargs: Dict[str, Any],
):
    """Load an original CogVideoX transformer checkpoint and convert it to diffusers.

    Args:
        ckpt_path: Path to the original transformer checkpoint.
        num_layers: Number of transformer blocks (30 for 2B, 42 for 5B).
        num_attention_heads: Number of attention heads (30 for 2B, 48 for 5B).
        use_rotary_positional_embeddings: Whether the variant uses RoPE.
        i2v: Image-to-video variant (doubles in_channels for the image latents).
        dtype: Target dtype for the converted weights.
        init_kwargs: Version-specific kwargs from `get_transformer_init_kwargs`.
    """
    prefix = "model.diffusion_model."

    original_state_dict = get_state_dict(torch.load(ckpt_path, map_location="cpu", mmap=True))
    transformer = CogVideoXTransformer3DModel(
        in_channels=32 if i2v else 16,
        num_layers=num_layers,
        num_attention_heads=num_attention_heads,
        use_rotary_positional_embeddings=use_rotary_positional_embeddings,
        ofs_embed_dim=512 if (i2v and init_kwargs["patch_size_t"] is not None) else None,  # CogVideoX1.5-5B-I2V
        use_learned_positional_embeddings=i2v and init_kwargs["patch_size_t"] is None,  # CogVideoX-5B-I2V
        **init_kwargs,
    ).to(dtype=dtype)

    # First pass: strip the checkpoint prefix and apply the plain substring renames.
    for old_key in list(original_state_dict.keys()):
        renamed = old_key[len(prefix):]
        for source, target in TRANSFORMER_KEYS_RENAME_DICT.items():
            renamed = renamed.replace(source, target)
        update_state_dict_inplace(original_state_dict, old_key, renamed)

    # Second pass: dispatch keys that need structural rewrites (QKV split, drops, ...).
    for current_key in list(original_state_dict.keys()):
        for pattern, handler_fn_inplace in TRANSFORMER_SPECIAL_KEYS_REMAP.items():
            if pattern in current_key:
                handler_fn_inplace(current_key, original_state_dict)

    transformer.load_state_dict(original_state_dict, strict=True)
    return transformer
174
+
175
+
176
def convert_vae(ckpt_path: str, scaling_factor: float, version: str, dtype: torch.dtype):
    """Load an original CogVideoX VAE checkpoint and convert it to `AutoencoderKLCogVideoX`.

    Args:
        ckpt_path: Path to the original VAE checkpoint.
        scaling_factor: Latent scaling factor (1.15258426 for 2B, 0.7 for 5B).
        version: "1.0" or "1.5"; 1.5 checkpoints store inverted latent scales.
        dtype: Target dtype for the converted weights.
    """
    init_kwargs = {"scaling_factor": scaling_factor}
    if version == "1.5":
        init_kwargs["invert_scale_latents"] = True

    original_state_dict = get_state_dict(torch.load(ckpt_path, map_location="cpu", mmap=True))
    vae = AutoencoderKLCogVideoX(**init_kwargs).to(dtype=dtype)

    # Plain substring renames first ...
    for old_key in list(original_state_dict.keys()):
        renamed = old_key
        for source, target in VAE_KEYS_RENAME_DICT.items():
            renamed = renamed.replace(source, target)
        update_state_dict_inplace(original_state_dict, old_key, renamed)

    # ... then handlers for keys that need more than a rename (drops, re-indexing).
    for current_key in list(original_state_dict.keys()):
        for pattern, handler_fn_inplace in VAE_SPECIAL_KEYS_REMAP.items():
            if pattern in current_key:
                handler_fn_inplace(current_key, original_state_dict)

    vae.load_state_dict(original_state_dict, strict=True)
    return vae
198
+
199
+
200
def get_transformer_init_kwargs(version: str):
    """Return version-specific constructor kwargs for the CogVideoX transformer.

    Args:
        version: Either "1.0" (CogVideoX) or "1.5" (CogVideoX1.5).

    Raises:
        ValueError: If `version` is not one of the supported values.
    """
    spatial_compression = 8  # VAE spatial downscale factor

    if version == "1.0":
        return {
            "patch_size": 2,
            "patch_size_t": None,
            "patch_bias": True,
            "sample_height": 480 // spatial_compression,
            "sample_width": 720 // spatial_compression,
            "sample_frames": 49,
        }
    if version == "1.5":
        return {
            "patch_size": 2,
            "patch_size_t": 2,
            "patch_bias": False,
            "sample_height": 300,
            "sample_width": 300,
            "sample_frames": 81,
        }
    raise ValueError("Unsupported version of CogVideoX.")
226
+
227
+
228
def get_args():
    """Parse CLI arguments for the CogVideoX checkpoint conversion script."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--transformer_ckpt_path", type=str, default=None, help="Path to original transformer checkpoint"
    )
    parser.add_argument("--vae_ckpt_path", type=str, default=None, help="Path to original vae checkpoint")
    parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved")
    parser.add_argument("--fp16", action="store_true", default=False, help="Whether to save the model weights in fp16")
    parser.add_argument("--bf16", action="store_true", default=False, help="Whether to save the model weights in bf16")
    parser.add_argument(
        "--push_to_hub", action="store_true", default=False, help="Whether to push to HF Hub after saving"
    )
    parser.add_argument(
        "--text_encoder_cache_dir", type=str, default=None, help="Path to text encoder cache directory"
    )
    parser.add_argument(
        "--typecast_text_encoder",
        action="store_true",
        default=False,
        help="Whether or not to apply fp16/bf16 precision to text_encoder",
    )
    # For CogVideoX-2B, num_layers is 30. For 5B, it is 42
    parser.add_argument("--num_layers", type=int, default=30, help="Number of transformer blocks")
    # For CogVideoX-2B, num_attention_heads is 30. For 5B, it is 48
    parser.add_argument("--num_attention_heads", type=int, default=30, help="Number of attention heads")
    # For CogVideoX-2B, use_rotary_positional_embeddings is False. For 5B, it is True
    parser.add_argument(
        "--use_rotary_positional_embeddings", action="store_true", default=False, help="Whether to use RoPE or not"
    )
    # For CogVideoX-2B, scaling_factor is 1.15258426. For 5B, it is 0.7
    parser.add_argument("--scaling_factor", type=float, default=1.15258426, help="Scaling factor in the VAE")
    # For CogVideoX-2B, snr_shift_scale is 3.0. For 5B, it is 1.0
    # Bug fix: help text previously duplicated the --scaling_factor description
    # ("Scaling factor in the VAE"), which was a copy-paste error.
    parser.add_argument("--snr_shift_scale", type=float, default=3.0, help="SNR shift scale for the scheduler")
    parser.add_argument(
        "--i2v",
        action="store_true",
        default=False,
        help="Whether the model to be converted is the Image-to-Video version of CogVideoX.",
    )
    parser.add_argument(
        "--version",
        choices=["1.0", "1.5"],
        default="1.0",
        help="Which version of CogVideoX to use for initializing default modeling parameters.",
    )
    return parser.parse_args()
274
+
275
+
276
if __name__ == "__main__":
    args = get_args()

    transformer = None
    vae = None

    # The two precision flags are mutually exclusive.
    if args.fp16 and args.bf16:
        raise ValueError("You cannot pass both --fp16 and --bf16 at the same time.")

    # Target dtype for the transformer weights (fp32 unless a flag says otherwise).
    dtype = torch.float16 if args.fp16 else torch.bfloat16 if args.bf16 else torch.float32

    # Each sub-model is only converted when its checkpoint path is provided.
    if args.transformer_ckpt_path is not None:
        init_kwargs = get_transformer_init_kwargs(args.version)
        transformer = convert_transformer(
            args.transformer_ckpt_path,
            args.num_layers,
            args.num_attention_heads,
            args.use_rotary_positional_embeddings,
            args.i2v,
            dtype,
            init_kwargs,
        )
    if args.vae_ckpt_path is not None:
        # Keep VAE in float32 for better quality
        vae = convert_vae(args.vae_ckpt_path, args.scaling_factor, args.version, torch.float32)

    # The text encoder is not converted — the stock T5-XXL weights are reused.
    text_encoder_id = "google/t5-v1_1-xxl"
    tokenizer = T5Tokenizer.from_pretrained(text_encoder_id, model_max_length=TOKENIZER_MAX_LENGTH)
    text_encoder = T5EncoderModel.from_pretrained(text_encoder_id, cache_dir=args.text_encoder_cache_dir)

    if args.typecast_text_encoder:
        text_encoder = text_encoder.to(dtype=dtype)

    # Apparently, the conversion does not work anymore without this :shrug:
    for param in text_encoder.parameters():
        param.data = param.data.contiguous()

    # Scheduler config matching the original CogVideoX training setup; only the
    # SNR shift scale differs between the 2B (3.0) and 5B (1.0) variants.
    scheduler = CogVideoXDDIMScheduler.from_config(
        {
            "snr_shift_scale": args.snr_shift_scale,
            "beta_end": 0.012,
            "beta_schedule": "scaled_linear",
            "beta_start": 0.00085,
            "clip_sample": False,
            "num_train_timesteps": 1000,
            "prediction_type": "v_prediction",
            "rescale_betas_zero_snr": True,
            "set_alpha_to_one": True,
            "timestep_spacing": "trailing",
        }
    )
    if args.i2v:
        pipeline_cls = CogVideoXImageToVideoPipeline
    else:
        pipeline_cls = CogVideoXPipeline

    pipe = pipeline_cls(
        tokenizer=tokenizer,
        text_encoder=text_encoder,
        vae=vae,
        transformer=transformer,
        scheduler=scheduler,
    )

    # We don't use variant here because the model must be run in fp16 (2B) or bf16 (5B). It would be weird
    # for users to specify variant when the default is not fp32 and they want to run with the correct default (which
    # is either fp16/bf16 here).

    # Sharding at 5GB is necessary for users with insufficient memory,
    # such as those using Colab and notebooks, as it can save some memory used for model loading.
    pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB", push_to_hub=args.push_to_hub)
diffusers/scripts/convert_consistency_decoder.py ADDED
@@ -0,0 +1,1128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import math
import os
import urllib
import urllib.request
import warnings
from argparse import ArgumentParser

import torch
import torch.nn as nn
import torch.nn.functional as F
from huggingface_hub.utils import insecure_hashlib
from safetensors.torch import load_file as stl
from tqdm import tqdm

from diffusers import AutoencoderKL, ConsistencyDecoderVAE, DiffusionPipeline, StableDiffusionPipeline, UNet2DModel
from diffusers.models.autoencoders.vae import Encoder
from diffusers.models.embeddings import TimestepEmbedding
from diffusers.models.unets.unet_2d_blocks import ResnetDownsampleBlock2D, ResnetUpsampleBlock2D, UNetMidBlock2D
18
+
19
+
20
+ args = ArgumentParser()
21
+ args.add_argument("--save_pretrained", required=False, default=None, type=str)
22
+ args.add_argument("--test_image", required=True, type=str)
23
+ args = args.parse_args()
24
+
25
+
26
+ def _extract_into_tensor(arr, timesteps, broadcast_shape):
27
+ # from: https://github.com/openai/guided-diffusion/blob/22e0df8183507e13a7813f8d38d51b072ca1e67c/guided_diffusion/gaussian_diffusion.py#L895 """
28
+ res = arr[timesteps].float()
29
+ dims_to_append = len(broadcast_shape) - len(res.shape)
30
+ return res[(...,) + (None,) * dims_to_append]
31
+
32
+
33
def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """Build a beta schedule from a cumulative-noise function.

    ``alpha_bar(t)`` gives the cumulative product of ``1 - beta`` at
    continuous time ``t`` in ``[0, 1]``; each beta is capped at ``max_beta``
    to avoid singularities near the end of the schedule.
    Adapted from openai/guided-diffusion (gaussian_diffusion.py#L45).
    """
    steps = num_diffusion_timesteps
    schedule = [
        min(1 - alpha_bar((i + 1) / steps) / alpha_bar(i / steps), max_beta)
        for i in range(steps)
    ]
    return torch.tensor(schedule)
41
+
42
+
43
def _download(url: str, root: str):
    """Download ``url`` into directory ``root``, verifying its SHA256 checksum.

    The expected checksum is taken from the second-to-last path component of
    the URL (OpenAI's public blob layout). An already-present file with a
    matching checksum is reused instead of re-downloaded.

    Returns:
        The local path of the verified file.

    Raises:
        RuntimeError: if the target exists but is not a regular file, or if
            the downloaded file fails checksum verification.
    """
    os.makedirs(root, exist_ok=True)
    filename = os.path.basename(url)

    expected_sha256 = url.split("/")[-2]
    download_target = os.path.join(root, filename)

    if os.path.exists(download_target) and not os.path.isfile(download_target):
        raise RuntimeError(f"{download_target} exists and is not a regular file")

    if os.path.isfile(download_target):
        # Reuse a previously downloaded checkpoint when it still verifies.
        if _sha256_of(download_target) == expected_sha256:
            return download_target
        warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")

    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
        with tqdm(
            total=int(source.info().get("Content-Length")),
            ncols=80,
            unit="iB",
            unit_scale=True,
            unit_divisor=1024,
        ) as loop:
            while True:
                buffer = source.read(8192)
                if not buffer:
                    break

                output.write(buffer)
                loop.update(len(buffer))

    if _sha256_of(download_target) != expected_sha256:
        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not match")

    return download_target


def _sha256_of(path):
    """Stream-hash *path* in 1 MiB chunks so multi-GB checkpoints are never
    read fully into memory, and the file handle is always closed."""
    digest = insecure_hashlib.sha256()
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()
79
+
80
+
81
class ConsistencyDecoder:
    """Sampler wrapper around OpenAI's distilled consistency decoder.

    Downloads the jitted decoder checkpoint on construction and precomputes
    the consistency-model preconditioning coefficients for a 1024-step
    cosine schedule with ``sigma_data = 0.5``. Calling the instance runs the
    (by default two-step) consistency sampling loop that decodes Stable
    Diffusion latents into images in ``[-1, 1]``.
    """

    def __init__(self, device="cuda:0", download_root=os.path.expanduser("~/.cache/clip")):
        self.n_distilled_steps = 64
        download_target = _download(
            "https://openaipublic.azureedge.net/diff-vae/c9cebd3132dd9c42936d803e33424145a748843c8f716c0814838bdc8a2fe7cb/decoder.pt",
            download_root,
        )
        self.ckpt = torch.jit.load(download_target).to(device)
        self.device = device
        sigma_data = 0.5
        betas = betas_for_alpha_bar(1024, lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2).to(device)
        alphas = 1.0 - betas
        alphas_cumprod = torch.cumprod(alphas, dim=0)
        self.sqrt_alphas_cumprod = torch.sqrt(alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - alphas_cumprod)
        sqrt_recip_alphas_cumprod = torch.sqrt(1.0 / alphas_cumprod)
        sigmas = torch.sqrt(1.0 / alphas_cumprod - 1)
        # Consistency-model preconditioning coefficients (c_skip / c_out / c_in),
        # as in "Consistency Models" (Song et al.), with sigma_data = 0.5.
        self.c_skip = sqrt_recip_alphas_cumprod * sigma_data**2 / (sigmas**2 + sigma_data**2)
        self.c_out = sigmas * sigma_data / (sigmas**2 + sigma_data**2) ** 0.5
        self.c_in = sqrt_recip_alphas_cumprod / (sigmas**2 + sigma_data**2) ** 0.5

    @staticmethod
    def round_timesteps(timesteps, total_timesteps, n_distilled_steps, truncate_start=True):
        """Snap arbitrary timesteps onto the distilled model's coarse grid.

        With ``truncate_start=False`` any timestep that rounded to 0 is
        pushed up one grid ``space`` so sampling never starts at t == 0.
        """
        with torch.no_grad():
            space = torch.div(total_timesteps, n_distilled_steps, rounding_mode="floor")
            rounded_timesteps = (torch.div(timesteps, space, rounding_mode="floor") + 1) * space
            if truncate_start:
                rounded_timesteps[rounded_timesteps == total_timesteps] -= space
            else:
                rounded_timesteps[rounded_timesteps == total_timesteps] -= space
                rounded_timesteps[rounded_timesteps == 0] += space
            return rounded_timesteps

    @staticmethod
    def ldm_transform_latent(z, extra_scale_factor=1):
        """Scale SD latents by 0.18215 and whiten each of the four channels
        with the decoder's per-channel statistics."""
        channel_means = [0.38862467, 0.02253063, 0.07381133, -0.0171294]
        channel_stds = [0.9654121, 1.0440036, 0.76147926, 0.77022034]

        if len(z.shape) != 4:
            raise ValueError()

        z = z * 0.18215
        channels = [z[:, i] for i in range(z.shape[1])]

        channels = [extra_scale_factor * (c - channel_means[i]) / channel_stds[i] for i, c in enumerate(channels)]
        return torch.stack(channels, dim=1)

    @torch.no_grad()
    def __call__(
        self,
        features: torch.Tensor,
        schedule=[1.0, 0.5],
        generator=None,
    ):
        """Decode latents ``features`` (B, 4, H, W) into images (B, 3, 8H, 8W)."""
        features = self.ldm_transform_latent(features)
        ts = self.round_timesteps(
            torch.arange(0, 1024),
            1024,
            self.n_distilled_steps,
            truncate_start=False,
        )
        shape = (
            features.size(0),
            3,
            8 * features.size(2),
            8 * features.size(3),
        )
        x_start = torch.zeros(shape, device=features.device, dtype=features.dtype)
        schedule_timesteps = [int((1024 - 1) * s) for s in schedule]
        for i in schedule_timesteps:
            t = ts[i].item()
            t_ = torch.tensor([t] * features.shape[0]).to(self.device)
            # An explicit generator (instead of randn_like) keeps results
            # reproducible across runs.
            noise = torch.randn(x_start.shape, dtype=x_start.dtype, generator=generator).to(device=x_start.device)
            x_start = (
                _extract_into_tensor(self.sqrt_alphas_cumprod, t_, x_start.shape) * x_start
                + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t_, x_start.shape) * noise
            )
            c_in = _extract_into_tensor(self.c_in, t_, x_start.shape)

            # NOTE: the original re-imported F and UNet2DModel on every loop
            # iteration; both are already imported at module level, so the
            # redundant per-iteration imports were removed.
            if isinstance(self.ckpt, UNet2DModel):
                input = torch.concat([c_in * x_start, F.upsample_nearest(features, scale_factor=8)], dim=1)
                model_output = self.ckpt(input, t_).sample
            else:
                model_output = self.ckpt(c_in * x_start, t_, features=features)

            B, C = x_start.shape[:2]
            # The model predicts 2*C channels (mean + variance); keep the mean.
            model_output, _ = torch.split(model_output, C, dim=1)
            pred_xstart = (
                _extract_into_tensor(self.c_out, t_, x_start.shape) * model_output
                + _extract_into_tensor(self.c_skip, t_, x_start.shape) * x_start
            ).clamp(-1, 1)
            x_start = pred_xstart
        return x_start
179
+
180
+
181
def save_image(image, name):
    """Write the first image of a [-1, 1] CHW batch tensor to *name*."""
    import numpy as np
    from PIL import Image

    arr = image[0].cpu().numpy()
    arr = ((arr + 1.0) * 127.5).clip(0, 255).astype(np.uint8)
    Image.fromarray(arr.transpose(1, 2, 0)).save(name)
190
+
191
+
192
def load_image(uri, size=None, center_crop=False):
    """Load *uri* as a float CHW batch tensor scaled to [-1, 1].

    Optionally center-crops to a square and resizes to ``size`` first.
    """
    import numpy as np
    from PIL import Image

    image = Image.open(uri)
    if center_crop:
        side = min(image.width, image.height)
        left = (image.width - side) // 2
        top = (image.height - side) // 2
        image = image.crop((left, top, left + side, top + side))
    if size is not None:
        image = image.resize(size)
    tensor = torch.tensor(np.array(image).transpose(2, 0, 1)).unsqueeze(0).float()
    return tensor / 127.5 - 1.0
211
+
212
+
213
class TimestepEmbedding_(nn.Module):
    """Embed an integer timestep and project it through a two-layer silu MLP."""

    def __init__(self, n_time=1024, n_emb=320, n_out=1280) -> None:
        super().__init__()
        # Attribute names mirror the original checkpoint's state-dict keys.
        self.emb = nn.Embedding(n_time, n_emb)
        self.f_1 = nn.Linear(n_emb, n_out)
        self.f_2 = nn.Linear(n_out, n_out)

    def forward(self, x) -> torch.Tensor:
        hidden = self.f_1(self.emb(x))
        return self.f_2(F.silu(hidden))
225
+
226
+
227
class ImageEmbedding(nn.Module):
    """3x3 convolution projecting the concatenated (noisy image + upsampled
    latent) input into the UNet's base channel width."""

    def __init__(self, in_channels=7, out_channels=320) -> None:
        super().__init__()
        self.f = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)

    def forward(self, x) -> torch.Tensor:
        embedded = self.f(x)
        return embedded
234
+
235
+
236
class ImageUnembedding(nn.Module):
    """Final GroupNorm -> silu -> 3x3 conv head producing the output channels
    (mean + variance pairs for an RGB image)."""

    def __init__(self, in_channels=320, out_channels=6) -> None:
        super().__init__()
        self.gn = nn.GroupNorm(32, in_channels)
        self.f = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)

    def forward(self, x) -> torch.Tensor:
        normalized = F.silu(self.gn(x))
        return self.f(normalized)
244
+
245
+
246
class ConvResblock(nn.Module):
    """GroupNorm residual conv block with scale/shift timestep conditioning.

    The time embedding is projected to a (scale, shift) pair applied to the
    second normalization (FiLM-style: ``(1 + scale) * norm(h) + shift``).
    """

    def __init__(self, in_features=320, out_features=320) -> None:
        super().__init__()
        self.f_t = nn.Linear(1280, out_features * 2)

        self.gn_1 = nn.GroupNorm(32, in_features)
        self.f_1 = nn.Conv2d(in_features, out_features, kernel_size=3, padding=1)

        self.gn_2 = nn.GroupNorm(32, out_features)
        self.f_2 = nn.Conv2d(out_features, out_features, kernel_size=3, padding=1)

        # 1x1 projection on the skip path only when channel counts differ.
        if in_features != out_features:
            self.f_s = nn.Conv2d(in_features, out_features, kernel_size=1, padding=0)
        else:
            self.f_s = nn.Identity()

    def forward(self, x, t):
        residual = x
        scale, shift = self.f_t(F.silu(t)).chunk(2, dim=1)
        scale = scale.unsqueeze(dim=2).unsqueeze(dim=3) + 1
        shift = shift.unsqueeze(dim=2).unsqueeze(dim=3)

        hidden = self.f_1(F.silu(self.gn_1(x)))
        hidden = self.gn_2(hidden)

        hidden = self.f_2(F.silu(hidden * scale + shift))
        return self.f_s(residual) + hidden
273
+
274
+
275
+ # Also ConvResblock
276
+ class Downsample(nn.Module):
277
+ def __init__(self, in_channels=320) -> None:
278
+ super().__init__()
279
+ self.f_t = nn.Linear(1280, in_channels * 2)
280
+
281
+ self.gn_1 = nn.GroupNorm(32, in_channels)
282
+ self.f_1 = nn.Conv2d(in_channels, in_channels, kernel_size=3, padding=1)
283
+ self.gn_2 = nn.GroupNorm(32, in_channels)
284
+
285
+ self.f_2 = nn.Conv2d(in_channels, in_channels, kernel_size=3, padding=1)
286
+
287
+ def forward(self, x, t) -> torch.Tensor:
288
+ x_skip = x
289
+
290
+ t = self.f_t(F.silu(t))
291
+ t_1, t_2 = t.chunk(2, dim=1)
292
+ t_1 = t_1.unsqueeze(2).unsqueeze(3) + 1
293
+ t_2 = t_2.unsqueeze(2).unsqueeze(3)
294
+
295
+ gn_1 = F.silu(self.gn_1(x))
296
+ avg_pool2d = F.avg_pool2d(gn_1, kernel_size=(2, 2), stride=None)
297
+
298
+ f_1 = self.f_1(avg_pool2d)
299
+ gn_2 = self.gn_2(f_1)
300
+
301
+ f_2 = self.f_2(F.silu(t_2 + (t_1 * gn_2)))
302
+
303
+ return f_2 + F.avg_pool2d(x_skip, kernel_size=(2, 2), stride=None)
304
+
305
+
306
+ # Also ConvResblock
307
+ class Upsample(nn.Module):
308
+ def __init__(self, in_channels=1024) -> None:
309
+ super().__init__()
310
+ self.f_t = nn.Linear(1280, in_channels * 2)
311
+
312
+ self.gn_1 = nn.GroupNorm(32, in_channels)
313
+ self.f_1 = nn.Conv2d(in_channels, in_channels, kernel_size=3, padding=1)
314
+ self.gn_2 = nn.GroupNorm(32, in_channels)
315
+
316
+ self.f_2 = nn.Conv2d(in_channels, in_channels, kernel_size=3, padding=1)
317
+
318
+ def forward(self, x, t) -> torch.Tensor:
319
+ x_skip = x
320
+
321
+ t = self.f_t(F.silu(t))
322
+ t_1, t_2 = t.chunk(2, dim=1)
323
+ t_1 = t_1.unsqueeze(2).unsqueeze(3) + 1
324
+ t_2 = t_2.unsqueeze(2).unsqueeze(3)
325
+
326
+ gn_1 = F.silu(self.gn_1(x))
327
+ upsample = F.upsample_nearest(gn_1, scale_factor=2)
328
+ f_1 = self.f_1(upsample)
329
+ gn_2 = self.gn_2(f_1)
330
+
331
+ f_2 = self.f_2(F.silu(t_2 + (t_1 * gn_2)))
332
+
333
+ return f_2 + F.upsample_nearest(x_skip, scale_factor=2)
334
+
335
+
336
class ConvUNetVAE(nn.Module):
    """Plain-PyTorch re-implementation of the consistency decoder UNet.

    Channel plan: 320 -> 640 -> 1024 down the encoder, mirrored back up, with
    skip connections concatenated at every resblock. The forward pass has two
    modes: the original sub-module path, and a "converted" path used after the
    conversion script has swapped the stages for diffusers blocks (the
    ``converted``, ``time_proj``, ``time_embedding`` attributes are attached
    externally by the conversion code below — not defined in __init__).
    """

    def __init__(self) -> None:
        super().__init__()
        self.embed_image = ImageEmbedding()
        self.embed_time = TimestepEmbedding_()

        # Encoder stages; the last one has no Downsample.
        down_0 = nn.ModuleList(
            [
                ConvResblock(320, 320),
                ConvResblock(320, 320),
                ConvResblock(320, 320),
                Downsample(320),
            ]
        )
        down_1 = nn.ModuleList(
            [
                ConvResblock(320, 640),
                ConvResblock(640, 640),
                ConvResblock(640, 640),
                Downsample(640),
            ]
        )
        down_2 = nn.ModuleList(
            [
                ConvResblock(640, 1024),
                ConvResblock(1024, 1024),
                ConvResblock(1024, 1024),
                Downsample(1024),
            ]
        )
        down_3 = nn.ModuleList(
            [
                ConvResblock(1024, 1024),
                ConvResblock(1024, 1024),
                ConvResblock(1024, 1024),
            ]
        )
        self.down = nn.ModuleList(
            [
                down_0,
                down_1,
                down_2,
                down_3,
            ]
        )

        self.mid = nn.ModuleList(
            [
                ConvResblock(1024, 1024),
                ConvResblock(1024, 1024),
            ]
        )

        # Decoder stages; input channels include the concatenated skip.
        up_3 = nn.ModuleList(
            [
                ConvResblock(1024 * 2, 1024),
                ConvResblock(1024 * 2, 1024),
                ConvResblock(1024 * 2, 1024),
                ConvResblock(1024 * 2, 1024),
                Upsample(1024),
            ]
        )
        up_2 = nn.ModuleList(
            [
                ConvResblock(1024 * 2, 1024),
                ConvResblock(1024 * 2, 1024),
                ConvResblock(1024 * 2, 1024),
                ConvResblock(1024 + 640, 1024),
                Upsample(1024),
            ]
        )
        up_1 = nn.ModuleList(
            [
                ConvResblock(1024 + 640, 640),
                ConvResblock(640 * 2, 640),
                ConvResblock(640 * 2, 640),
                ConvResblock(320 + 640, 640),
                Upsample(640),
            ]
        )
        up_0 = nn.ModuleList(
            [
                ConvResblock(320 + 640, 320),
                ConvResblock(320 * 2, 320),
                ConvResblock(320 * 2, 320),
                ConvResblock(320 * 2, 320),
            ]
        )
        # Stored lowest-resolution-last; forward iterates self.up reversed.
        self.up = nn.ModuleList(
            [
                up_0,
                up_1,
                up_2,
                up_3,
            ]
        )

        self.output = ImageUnembedding()

    def forward(self, x, t, features) -> torch.Tensor:
        # After conversion, `self.converted` is True and the down/mid/up
        # attributes hold diffusers blocks with a different call signature.
        converted = hasattr(self, "converted") and self.converted

        # Condition on the latent by upsampling it 8x and concatenating it
        # to the noisy image along the channel dim (3 + 4 = 7 channels).
        x = torch.cat([x, F.upsample_nearest(features, scale_factor=8)], dim=1)

        if converted:
            t = self.time_embedding(self.time_proj(t))
        else:
            t = self.embed_time(t)

        x = self.embed_image(x)

        # Collect skip activations for the decoder (the embed output first).
        skips = [x]
        for i, down in enumerate(self.down):
            if converted and i in [0, 1, 2, 3]:
                # Diffusers blocks return (hidden, skip_states) directly.
                x, skips_ = down(x, t)
                for skip in skips_:
                    skips.append(skip)
            else:
                for block in down:
                    x = block(x, t)
                    skips.append(x)
            # Debug checksum print used to compare original vs converted paths.
            print(x.float().abs().sum())

        if converted:
            x = self.mid(x, t)
        else:
            for i in range(2):
                x = self.mid[i](x, t)
        print(x.float().abs().sum())

        for i, up in enumerate(self.up[::-1]):
            if converted and i in [0, 1, 2, 3]:
                # Diffusers up-blocks take the four skips as one tuple,
                # ordered shallowest-first.
                skip_4 = skips.pop()
                skip_3 = skips.pop()
                skip_2 = skips.pop()
                skip_1 = skips.pop()
                skips_ = (skip_1, skip_2, skip_3, skip_4)
                x = up(x, skips_, t)
            else:
                for block in up:
                    if isinstance(block, ConvResblock):
                        x = torch.concat([x, skips.pop()], dim=1)
                    x = block(x, t)

        return self.output(x)
481
+
482
+
483
def rename_state_dict_key(k):
    """Map a raw consistency-decoder checkpoint key onto this module's names.

    Substitution order matters: block-index renames run first, then the
    short parameter suffixes (w / b / g) are expanded to weight / bias.
    """
    k = k.replace("blocks.", "")
    for i in range(5):
        k = k.replace(f"down_{i}_", f"down.{i}.")
        k = k.replace(f"conv_{i}.", f"{i}.")
        k = k.replace(f"up_{i}_", f"up.{i}.")
        k = k.replace(f"mid_{i}", f"mid.{i}")
    substitutions = [
        ("upsamp.", "4."),
        ("downsamp.", "3."),
        ("f_t.w", "f_t.weight"),
        ("f_t.b", "f_t.bias"),
        ("f_1.w", "f_1.weight"),
        ("f_1.b", "f_1.bias"),
        ("f_2.w", "f_2.weight"),
        ("f_2.b", "f_2.bias"),
        ("f_s.w", "f_s.weight"),
        ("f_s.b", "f_s.bias"),
        ("f.w", "f.weight"),
        ("f.b", "f.bias"),
        ("gn_1.g", "gn_1.weight"),
        ("gn_1.b", "gn_1.bias"),
        ("gn_2.g", "gn_2.weight"),
        ("gn_2.b", "gn_2.bias"),
        ("gn.g", "gn.weight"),
        ("gn.b", "gn.bias"),
    ]
    for old, new in substitutions:
        k = k.replace(old, new)
    return k
501
+
502
+
503
def rename_state_dict(sd, embedding):
    """Rename every checkpoint key and splice in the timestep-embedding table
    (stored in a separate safetensors file)."""
    renamed = {rename_state_dict_key(key): value for key, value in sd.items()}
    renamed["embed_time.emb.weight"] = embedding["weight"]
    return renamed
507
+
508
+
509
+ # encode with stable diffusion vae
510
+ pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
511
+ pipe.vae.cuda()
512
+
513
+ # construct original decoder with jitted model
514
+ decoder_consistency = ConsistencyDecoder(device="cuda:0")
515
+
516
+ # construct UNet code, overwrite the decoder with conv_unet_vae
517
+ model = ConvUNetVAE()
518
+ model.load_state_dict(
519
+ rename_state_dict(
520
+ stl("consistency_decoder.safetensors"),
521
+ stl("embedding.safetensors"),
522
+ )
523
+ )
524
+ model = model.cuda()
525
+
526
+ decoder_consistency.ckpt = model
527
+
528
+ image = load_image(args.test_image, size=(256, 256), center_crop=True)
529
+ latent = pipe.vae.encode(image.half().cuda()).latent_dist.sample()
530
+
531
+ # decode with gan
532
+ sample_gan = pipe.vae.decode(latent).sample.detach()
533
+ save_image(sample_gan, "gan.png")
534
+
535
+ # decode with conv_unet_vae
536
+ sample_consistency_orig = decoder_consistency(latent, generator=torch.Generator("cpu").manual_seed(0))
537
+ save_image(sample_consistency_orig, "con_orig.png")
538
+
539
+
540
+ ########### conversion
541
+
542
+ print("CONVERSION")
543
+
544
+ print("DOWN BLOCK ONE")
545
+
546
+ block_one_sd_orig = model.down[0].state_dict()
547
+ block_one_sd_new = {}
548
+
549
+ for i in range(3):
550
+ block_one_sd_new[f"resnets.{i}.norm1.weight"] = block_one_sd_orig.pop(f"{i}.gn_1.weight")
551
+ block_one_sd_new[f"resnets.{i}.norm1.bias"] = block_one_sd_orig.pop(f"{i}.gn_1.bias")
552
+ block_one_sd_new[f"resnets.{i}.conv1.weight"] = block_one_sd_orig.pop(f"{i}.f_1.weight")
553
+ block_one_sd_new[f"resnets.{i}.conv1.bias"] = block_one_sd_orig.pop(f"{i}.f_1.bias")
554
+ block_one_sd_new[f"resnets.{i}.time_emb_proj.weight"] = block_one_sd_orig.pop(f"{i}.f_t.weight")
555
+ block_one_sd_new[f"resnets.{i}.time_emb_proj.bias"] = block_one_sd_orig.pop(f"{i}.f_t.bias")
556
+ block_one_sd_new[f"resnets.{i}.norm2.weight"] = block_one_sd_orig.pop(f"{i}.gn_2.weight")
557
+ block_one_sd_new[f"resnets.{i}.norm2.bias"] = block_one_sd_orig.pop(f"{i}.gn_2.bias")
558
+ block_one_sd_new[f"resnets.{i}.conv2.weight"] = block_one_sd_orig.pop(f"{i}.f_2.weight")
559
+ block_one_sd_new[f"resnets.{i}.conv2.bias"] = block_one_sd_orig.pop(f"{i}.f_2.bias")
560
+
561
+ block_one_sd_new["downsamplers.0.norm1.weight"] = block_one_sd_orig.pop("3.gn_1.weight")
562
+ block_one_sd_new["downsamplers.0.norm1.bias"] = block_one_sd_orig.pop("3.gn_1.bias")
563
+ block_one_sd_new["downsamplers.0.conv1.weight"] = block_one_sd_orig.pop("3.f_1.weight")
564
+ block_one_sd_new["downsamplers.0.conv1.bias"] = block_one_sd_orig.pop("3.f_1.bias")
565
+ block_one_sd_new["downsamplers.0.time_emb_proj.weight"] = block_one_sd_orig.pop("3.f_t.weight")
566
+ block_one_sd_new["downsamplers.0.time_emb_proj.bias"] = block_one_sd_orig.pop("3.f_t.bias")
567
+ block_one_sd_new["downsamplers.0.norm2.weight"] = block_one_sd_orig.pop("3.gn_2.weight")
568
+ block_one_sd_new["downsamplers.0.norm2.bias"] = block_one_sd_orig.pop("3.gn_2.bias")
569
+ block_one_sd_new["downsamplers.0.conv2.weight"] = block_one_sd_orig.pop("3.f_2.weight")
570
+ block_one_sd_new["downsamplers.0.conv2.bias"] = block_one_sd_orig.pop("3.f_2.bias")
571
+
572
+ assert len(block_one_sd_orig) == 0
573
+
574
+ block_one = ResnetDownsampleBlock2D(
575
+ in_channels=320,
576
+ out_channels=320,
577
+ temb_channels=1280,
578
+ num_layers=3,
579
+ add_downsample=True,
580
+ resnet_time_scale_shift="scale_shift",
581
+ resnet_eps=1e-5,
582
+ )
583
+
584
+ block_one.load_state_dict(block_one_sd_new)
585
+
586
+ print("DOWN BLOCK TWO")
587
+
588
+ block_two_sd_orig = model.down[1].state_dict()
589
+ block_two_sd_new = {}
590
+
591
+ for i in range(3):
592
+ block_two_sd_new[f"resnets.{i}.norm1.weight"] = block_two_sd_orig.pop(f"{i}.gn_1.weight")
593
+ block_two_sd_new[f"resnets.{i}.norm1.bias"] = block_two_sd_orig.pop(f"{i}.gn_1.bias")
594
+ block_two_sd_new[f"resnets.{i}.conv1.weight"] = block_two_sd_orig.pop(f"{i}.f_1.weight")
595
+ block_two_sd_new[f"resnets.{i}.conv1.bias"] = block_two_sd_orig.pop(f"{i}.f_1.bias")
596
+ block_two_sd_new[f"resnets.{i}.time_emb_proj.weight"] = block_two_sd_orig.pop(f"{i}.f_t.weight")
597
+ block_two_sd_new[f"resnets.{i}.time_emb_proj.bias"] = block_two_sd_orig.pop(f"{i}.f_t.bias")
598
+ block_two_sd_new[f"resnets.{i}.norm2.weight"] = block_two_sd_orig.pop(f"{i}.gn_2.weight")
599
+ block_two_sd_new[f"resnets.{i}.norm2.bias"] = block_two_sd_orig.pop(f"{i}.gn_2.bias")
600
+ block_two_sd_new[f"resnets.{i}.conv2.weight"] = block_two_sd_orig.pop(f"{i}.f_2.weight")
601
+ block_two_sd_new[f"resnets.{i}.conv2.bias"] = block_two_sd_orig.pop(f"{i}.f_2.bias")
602
+
603
+ if i == 0:
604
+ block_two_sd_new[f"resnets.{i}.conv_shortcut.weight"] = block_two_sd_orig.pop(f"{i}.f_s.weight")
605
+ block_two_sd_new[f"resnets.{i}.conv_shortcut.bias"] = block_two_sd_orig.pop(f"{i}.f_s.bias")
606
+
607
+ block_two_sd_new["downsamplers.0.norm1.weight"] = block_two_sd_orig.pop("3.gn_1.weight")
608
+ block_two_sd_new["downsamplers.0.norm1.bias"] = block_two_sd_orig.pop("3.gn_1.bias")
609
+ block_two_sd_new["downsamplers.0.conv1.weight"] = block_two_sd_orig.pop("3.f_1.weight")
610
+ block_two_sd_new["downsamplers.0.conv1.bias"] = block_two_sd_orig.pop("3.f_1.bias")
611
+ block_two_sd_new["downsamplers.0.time_emb_proj.weight"] = block_two_sd_orig.pop("3.f_t.weight")
612
+ block_two_sd_new["downsamplers.0.time_emb_proj.bias"] = block_two_sd_orig.pop("3.f_t.bias")
613
+ block_two_sd_new["downsamplers.0.norm2.weight"] = block_two_sd_orig.pop("3.gn_2.weight")
614
+ block_two_sd_new["downsamplers.0.norm2.bias"] = block_two_sd_orig.pop("3.gn_2.bias")
615
+ block_two_sd_new["downsamplers.0.conv2.weight"] = block_two_sd_orig.pop("3.f_2.weight")
616
+ block_two_sd_new["downsamplers.0.conv2.bias"] = block_two_sd_orig.pop("3.f_2.bias")
617
+
618
+ assert len(block_two_sd_orig) == 0
619
+
620
+ block_two = ResnetDownsampleBlock2D(
621
+ in_channels=320,
622
+ out_channels=640,
623
+ temb_channels=1280,
624
+ num_layers=3,
625
+ add_downsample=True,
626
+ resnet_time_scale_shift="scale_shift",
627
+ resnet_eps=1e-5,
628
+ )
629
+
630
+ block_two.load_state_dict(block_two_sd_new)
631
+
632
+ print("DOWN BLOCK THREE")
633
+
634
+ block_three_sd_orig = model.down[2].state_dict()
635
+ block_three_sd_new = {}
636
+
637
+ for i in range(3):
638
+ block_three_sd_new[f"resnets.{i}.norm1.weight"] = block_three_sd_orig.pop(f"{i}.gn_1.weight")
639
+ block_three_sd_new[f"resnets.{i}.norm1.bias"] = block_three_sd_orig.pop(f"{i}.gn_1.bias")
640
+ block_three_sd_new[f"resnets.{i}.conv1.weight"] = block_three_sd_orig.pop(f"{i}.f_1.weight")
641
+ block_three_sd_new[f"resnets.{i}.conv1.bias"] = block_three_sd_orig.pop(f"{i}.f_1.bias")
642
+ block_three_sd_new[f"resnets.{i}.time_emb_proj.weight"] = block_three_sd_orig.pop(f"{i}.f_t.weight")
643
+ block_three_sd_new[f"resnets.{i}.time_emb_proj.bias"] = block_three_sd_orig.pop(f"{i}.f_t.bias")
644
+ block_three_sd_new[f"resnets.{i}.norm2.weight"] = block_three_sd_orig.pop(f"{i}.gn_2.weight")
645
+ block_three_sd_new[f"resnets.{i}.norm2.bias"] = block_three_sd_orig.pop(f"{i}.gn_2.bias")
646
+ block_three_sd_new[f"resnets.{i}.conv2.weight"] = block_three_sd_orig.pop(f"{i}.f_2.weight")
647
+ block_three_sd_new[f"resnets.{i}.conv2.bias"] = block_three_sd_orig.pop(f"{i}.f_2.bias")
648
+
649
+ if i == 0:
650
+ block_three_sd_new[f"resnets.{i}.conv_shortcut.weight"] = block_three_sd_orig.pop(f"{i}.f_s.weight")
651
+ block_three_sd_new[f"resnets.{i}.conv_shortcut.bias"] = block_three_sd_orig.pop(f"{i}.f_s.bias")
652
+
653
+ block_three_sd_new["downsamplers.0.norm1.weight"] = block_three_sd_orig.pop("3.gn_1.weight")
654
+ block_three_sd_new["downsamplers.0.norm1.bias"] = block_three_sd_orig.pop("3.gn_1.bias")
655
+ block_three_sd_new["downsamplers.0.conv1.weight"] = block_three_sd_orig.pop("3.f_1.weight")
656
+ block_three_sd_new["downsamplers.0.conv1.bias"] = block_three_sd_orig.pop("3.f_1.bias")
657
+ block_three_sd_new["downsamplers.0.time_emb_proj.weight"] = block_three_sd_orig.pop("3.f_t.weight")
658
+ block_three_sd_new["downsamplers.0.time_emb_proj.bias"] = block_three_sd_orig.pop("3.f_t.bias")
659
+ block_three_sd_new["downsamplers.0.norm2.weight"] = block_three_sd_orig.pop("3.gn_2.weight")
660
+ block_three_sd_new["downsamplers.0.norm2.bias"] = block_three_sd_orig.pop("3.gn_2.bias")
661
+ block_three_sd_new["downsamplers.0.conv2.weight"] = block_three_sd_orig.pop("3.f_2.weight")
662
+ block_three_sd_new["downsamplers.0.conv2.bias"] = block_three_sd_orig.pop("3.f_2.bias")
663
+
664
+ assert len(block_three_sd_orig) == 0
665
+
666
+ block_three = ResnetDownsampleBlock2D(
667
+ in_channels=640,
668
+ out_channels=1024,
669
+ temb_channels=1280,
670
+ num_layers=3,
671
+ add_downsample=True,
672
+ resnet_time_scale_shift="scale_shift",
673
+ resnet_eps=1e-5,
674
+ )
675
+
676
+ block_three.load_state_dict(block_three_sd_new)
677
+
678
+ print("DOWN BLOCK FOUR")
679
+
680
+ block_four_sd_orig = model.down[3].state_dict()
681
+ block_four_sd_new = {}
682
+
683
+ for i in range(3):
684
+ block_four_sd_new[f"resnets.{i}.norm1.weight"] = block_four_sd_orig.pop(f"{i}.gn_1.weight")
685
+ block_four_sd_new[f"resnets.{i}.norm1.bias"] = block_four_sd_orig.pop(f"{i}.gn_1.bias")
686
+ block_four_sd_new[f"resnets.{i}.conv1.weight"] = block_four_sd_orig.pop(f"{i}.f_1.weight")
687
+ block_four_sd_new[f"resnets.{i}.conv1.bias"] = block_four_sd_orig.pop(f"{i}.f_1.bias")
688
+ block_four_sd_new[f"resnets.{i}.time_emb_proj.weight"] = block_four_sd_orig.pop(f"{i}.f_t.weight")
689
+ block_four_sd_new[f"resnets.{i}.time_emb_proj.bias"] = block_four_sd_orig.pop(f"{i}.f_t.bias")
690
+ block_four_sd_new[f"resnets.{i}.norm2.weight"] = block_four_sd_orig.pop(f"{i}.gn_2.weight")
691
+ block_four_sd_new[f"resnets.{i}.norm2.bias"] = block_four_sd_orig.pop(f"{i}.gn_2.bias")
692
+ block_four_sd_new[f"resnets.{i}.conv2.weight"] = block_four_sd_orig.pop(f"{i}.f_2.weight")
693
+ block_four_sd_new[f"resnets.{i}.conv2.bias"] = block_four_sd_orig.pop(f"{i}.f_2.bias")
694
+
695
+ assert len(block_four_sd_orig) == 0
696
+
697
+ block_four = ResnetDownsampleBlock2D(
698
+ in_channels=1024,
699
+ out_channels=1024,
700
+ temb_channels=1280,
701
+ num_layers=3,
702
+ add_downsample=False,
703
+ resnet_time_scale_shift="scale_shift",
704
+ resnet_eps=1e-5,
705
+ )
706
+
707
+ block_four.load_state_dict(block_four_sd_new)
708
+
709
+
710
+ print("MID BLOCK 1")
711
+
712
+ mid_block_one_sd_orig = model.mid.state_dict()
713
+ mid_block_one_sd_new = {}
714
+
715
+ for i in range(2):
716
+ mid_block_one_sd_new[f"resnets.{i}.norm1.weight"] = mid_block_one_sd_orig.pop(f"{i}.gn_1.weight")
717
+ mid_block_one_sd_new[f"resnets.{i}.norm1.bias"] = mid_block_one_sd_orig.pop(f"{i}.gn_1.bias")
718
+ mid_block_one_sd_new[f"resnets.{i}.conv1.weight"] = mid_block_one_sd_orig.pop(f"{i}.f_1.weight")
719
+ mid_block_one_sd_new[f"resnets.{i}.conv1.bias"] = mid_block_one_sd_orig.pop(f"{i}.f_1.bias")
720
+ mid_block_one_sd_new[f"resnets.{i}.time_emb_proj.weight"] = mid_block_one_sd_orig.pop(f"{i}.f_t.weight")
721
+ mid_block_one_sd_new[f"resnets.{i}.time_emb_proj.bias"] = mid_block_one_sd_orig.pop(f"{i}.f_t.bias")
722
+ mid_block_one_sd_new[f"resnets.{i}.norm2.weight"] = mid_block_one_sd_orig.pop(f"{i}.gn_2.weight")
723
+ mid_block_one_sd_new[f"resnets.{i}.norm2.bias"] = mid_block_one_sd_orig.pop(f"{i}.gn_2.bias")
724
+ mid_block_one_sd_new[f"resnets.{i}.conv2.weight"] = mid_block_one_sd_orig.pop(f"{i}.f_2.weight")
725
+ mid_block_one_sd_new[f"resnets.{i}.conv2.bias"] = mid_block_one_sd_orig.pop(f"{i}.f_2.bias")
726
+
727
+ assert len(mid_block_one_sd_orig) == 0
728
+
729
+ mid_block_one = UNetMidBlock2D(
730
+ in_channels=1024,
731
+ temb_channels=1280,
732
+ num_layers=1,
733
+ resnet_time_scale_shift="scale_shift",
734
+ resnet_eps=1e-5,
735
+ add_attention=False,
736
+ )
737
+
738
+ mid_block_one.load_state_dict(mid_block_one_sd_new)
739
+
740
+ print("UP BLOCK ONE")
741
+
742
+ up_block_one_sd_orig = model.up[-1].state_dict()
743
+ up_block_one_sd_new = {}
744
+
745
+ for i in range(4):
746
+ up_block_one_sd_new[f"resnets.{i}.norm1.weight"] = up_block_one_sd_orig.pop(f"{i}.gn_1.weight")
747
+ up_block_one_sd_new[f"resnets.{i}.norm1.bias"] = up_block_one_sd_orig.pop(f"{i}.gn_1.bias")
748
+ up_block_one_sd_new[f"resnets.{i}.conv1.weight"] = up_block_one_sd_orig.pop(f"{i}.f_1.weight")
749
+ up_block_one_sd_new[f"resnets.{i}.conv1.bias"] = up_block_one_sd_orig.pop(f"{i}.f_1.bias")
750
+ up_block_one_sd_new[f"resnets.{i}.time_emb_proj.weight"] = up_block_one_sd_orig.pop(f"{i}.f_t.weight")
751
+ up_block_one_sd_new[f"resnets.{i}.time_emb_proj.bias"] = up_block_one_sd_orig.pop(f"{i}.f_t.bias")
752
+ up_block_one_sd_new[f"resnets.{i}.norm2.weight"] = up_block_one_sd_orig.pop(f"{i}.gn_2.weight")
753
+ up_block_one_sd_new[f"resnets.{i}.norm2.bias"] = up_block_one_sd_orig.pop(f"{i}.gn_2.bias")
754
+ up_block_one_sd_new[f"resnets.{i}.conv2.weight"] = up_block_one_sd_orig.pop(f"{i}.f_2.weight")
755
+ up_block_one_sd_new[f"resnets.{i}.conv2.bias"] = up_block_one_sd_orig.pop(f"{i}.f_2.bias")
756
+ up_block_one_sd_new[f"resnets.{i}.conv_shortcut.weight"] = up_block_one_sd_orig.pop(f"{i}.f_s.weight")
757
+ up_block_one_sd_new[f"resnets.{i}.conv_shortcut.bias"] = up_block_one_sd_orig.pop(f"{i}.f_s.bias")
758
+
759
+ up_block_one_sd_new["upsamplers.0.norm1.weight"] = up_block_one_sd_orig.pop("4.gn_1.weight")
760
+ up_block_one_sd_new["upsamplers.0.norm1.bias"] = up_block_one_sd_orig.pop("4.gn_1.bias")
761
+ up_block_one_sd_new["upsamplers.0.conv1.weight"] = up_block_one_sd_orig.pop("4.f_1.weight")
762
+ up_block_one_sd_new["upsamplers.0.conv1.bias"] = up_block_one_sd_orig.pop("4.f_1.bias")
763
+ up_block_one_sd_new["upsamplers.0.time_emb_proj.weight"] = up_block_one_sd_orig.pop("4.f_t.weight")
764
+ up_block_one_sd_new["upsamplers.0.time_emb_proj.bias"] = up_block_one_sd_orig.pop("4.f_t.bias")
765
+ up_block_one_sd_new["upsamplers.0.norm2.weight"] = up_block_one_sd_orig.pop("4.gn_2.weight")
766
+ up_block_one_sd_new["upsamplers.0.norm2.bias"] = up_block_one_sd_orig.pop("4.gn_2.bias")
767
+ up_block_one_sd_new["upsamplers.0.conv2.weight"] = up_block_one_sd_orig.pop("4.f_2.weight")
768
+ up_block_one_sd_new["upsamplers.0.conv2.bias"] = up_block_one_sd_orig.pop("4.f_2.bias")
769
+
770
+ assert len(up_block_one_sd_orig) == 0
771
+
772
+ up_block_one = ResnetUpsampleBlock2D(
773
+ in_channels=1024,
774
+ prev_output_channel=1024,
775
+ out_channels=1024,
776
+ temb_channels=1280,
777
+ num_layers=4,
778
+ add_upsample=True,
779
+ resnet_time_scale_shift="scale_shift",
780
+ resnet_eps=1e-5,
781
+ )
782
+
783
+ up_block_one.load_state_dict(up_block_one_sd_new)
784
+
785
+ print("UP BLOCK TWO")
786
+
787
+ up_block_two_sd_orig = model.up[-2].state_dict()
788
+ up_block_two_sd_new = {}
789
+
790
+ for i in range(4):
791
+ up_block_two_sd_new[f"resnets.{i}.norm1.weight"] = up_block_two_sd_orig.pop(f"{i}.gn_1.weight")
792
+ up_block_two_sd_new[f"resnets.{i}.norm1.bias"] = up_block_two_sd_orig.pop(f"{i}.gn_1.bias")
793
+ up_block_two_sd_new[f"resnets.{i}.conv1.weight"] = up_block_two_sd_orig.pop(f"{i}.f_1.weight")
794
+ up_block_two_sd_new[f"resnets.{i}.conv1.bias"] = up_block_two_sd_orig.pop(f"{i}.f_1.bias")
795
+ up_block_two_sd_new[f"resnets.{i}.time_emb_proj.weight"] = up_block_two_sd_orig.pop(f"{i}.f_t.weight")
796
+ up_block_two_sd_new[f"resnets.{i}.time_emb_proj.bias"] = up_block_two_sd_orig.pop(f"{i}.f_t.bias")
797
+ up_block_two_sd_new[f"resnets.{i}.norm2.weight"] = up_block_two_sd_orig.pop(f"{i}.gn_2.weight")
798
+ up_block_two_sd_new[f"resnets.{i}.norm2.bias"] = up_block_two_sd_orig.pop(f"{i}.gn_2.bias")
799
+ up_block_two_sd_new[f"resnets.{i}.conv2.weight"] = up_block_two_sd_orig.pop(f"{i}.f_2.weight")
800
+ up_block_two_sd_new[f"resnets.{i}.conv2.bias"] = up_block_two_sd_orig.pop(f"{i}.f_2.bias")
801
+ up_block_two_sd_new[f"resnets.{i}.conv_shortcut.weight"] = up_block_two_sd_orig.pop(f"{i}.f_s.weight")
802
+ up_block_two_sd_new[f"resnets.{i}.conv_shortcut.bias"] = up_block_two_sd_orig.pop(f"{i}.f_s.bias")
803
+
804
+ up_block_two_sd_new["upsamplers.0.norm1.weight"] = up_block_two_sd_orig.pop("4.gn_1.weight")
805
+ up_block_two_sd_new["upsamplers.0.norm1.bias"] = up_block_two_sd_orig.pop("4.gn_1.bias")
806
+ up_block_two_sd_new["upsamplers.0.conv1.weight"] = up_block_two_sd_orig.pop("4.f_1.weight")
807
+ up_block_two_sd_new["upsamplers.0.conv1.bias"] = up_block_two_sd_orig.pop("4.f_1.bias")
808
+ up_block_two_sd_new["upsamplers.0.time_emb_proj.weight"] = up_block_two_sd_orig.pop("4.f_t.weight")
809
+ up_block_two_sd_new["upsamplers.0.time_emb_proj.bias"] = up_block_two_sd_orig.pop("4.f_t.bias")
810
+ up_block_two_sd_new["upsamplers.0.norm2.weight"] = up_block_two_sd_orig.pop("4.gn_2.weight")
811
+ up_block_two_sd_new["upsamplers.0.norm2.bias"] = up_block_two_sd_orig.pop("4.gn_2.bias")
812
+ up_block_two_sd_new["upsamplers.0.conv2.weight"] = up_block_two_sd_orig.pop("4.f_2.weight")
813
+ up_block_two_sd_new["upsamplers.0.conv2.bias"] = up_block_two_sd_orig.pop("4.f_2.bias")
814
+
815
+ assert len(up_block_two_sd_orig) == 0
816
+
817
+ up_block_two = ResnetUpsampleBlock2D(
818
+ in_channels=640,
819
+ prev_output_channel=1024,
820
+ out_channels=1024,
821
+ temb_channels=1280,
822
+ num_layers=4,
823
+ add_upsample=True,
824
+ resnet_time_scale_shift="scale_shift",
825
+ resnet_eps=1e-5,
826
+ )
827
+
828
+ up_block_two.load_state_dict(up_block_two_sd_new)
829
+
830
+ print("UP BLOCK THREE")
831
+
832
+ up_block_three_sd_orig = model.up[-3].state_dict()
833
+ up_block_three_sd_new = {}
834
+
835
+ for i in range(4):
836
+ up_block_three_sd_new[f"resnets.{i}.norm1.weight"] = up_block_three_sd_orig.pop(f"{i}.gn_1.weight")
837
+ up_block_three_sd_new[f"resnets.{i}.norm1.bias"] = up_block_three_sd_orig.pop(f"{i}.gn_1.bias")
838
+ up_block_three_sd_new[f"resnets.{i}.conv1.weight"] = up_block_three_sd_orig.pop(f"{i}.f_1.weight")
839
+ up_block_three_sd_new[f"resnets.{i}.conv1.bias"] = up_block_three_sd_orig.pop(f"{i}.f_1.bias")
840
+ up_block_three_sd_new[f"resnets.{i}.time_emb_proj.weight"] = up_block_three_sd_orig.pop(f"{i}.f_t.weight")
841
+ up_block_three_sd_new[f"resnets.{i}.time_emb_proj.bias"] = up_block_three_sd_orig.pop(f"{i}.f_t.bias")
842
+ up_block_three_sd_new[f"resnets.{i}.norm2.weight"] = up_block_three_sd_orig.pop(f"{i}.gn_2.weight")
843
+ up_block_three_sd_new[f"resnets.{i}.norm2.bias"] = up_block_three_sd_orig.pop(f"{i}.gn_2.bias")
844
+ up_block_three_sd_new[f"resnets.{i}.conv2.weight"] = up_block_three_sd_orig.pop(f"{i}.f_2.weight")
845
+ up_block_three_sd_new[f"resnets.{i}.conv2.bias"] = up_block_three_sd_orig.pop(f"{i}.f_2.bias")
846
+ up_block_three_sd_new[f"resnets.{i}.conv_shortcut.weight"] = up_block_three_sd_orig.pop(f"{i}.f_s.weight")
847
+ up_block_three_sd_new[f"resnets.{i}.conv_shortcut.bias"] = up_block_three_sd_orig.pop(f"{i}.f_s.bias")
848
+
849
+ up_block_three_sd_new["upsamplers.0.norm1.weight"] = up_block_three_sd_orig.pop("4.gn_1.weight")
850
+ up_block_three_sd_new["upsamplers.0.norm1.bias"] = up_block_three_sd_orig.pop("4.gn_1.bias")
851
+ up_block_three_sd_new["upsamplers.0.conv1.weight"] = up_block_three_sd_orig.pop("4.f_1.weight")
852
+ up_block_three_sd_new["upsamplers.0.conv1.bias"] = up_block_three_sd_orig.pop("4.f_1.bias")
853
+ up_block_three_sd_new["upsamplers.0.time_emb_proj.weight"] = up_block_three_sd_orig.pop("4.f_t.weight")
854
+ up_block_three_sd_new["upsamplers.0.time_emb_proj.bias"] = up_block_three_sd_orig.pop("4.f_t.bias")
855
+ up_block_three_sd_new["upsamplers.0.norm2.weight"] = up_block_three_sd_orig.pop("4.gn_2.weight")
856
+ up_block_three_sd_new["upsamplers.0.norm2.bias"] = up_block_three_sd_orig.pop("4.gn_2.bias")
857
+ up_block_three_sd_new["upsamplers.0.conv2.weight"] = up_block_three_sd_orig.pop("4.f_2.weight")
858
+ up_block_three_sd_new["upsamplers.0.conv2.bias"] = up_block_three_sd_orig.pop("4.f_2.bias")
859
+
860
+ assert len(up_block_three_sd_orig) == 0
861
+
862
+ up_block_three = ResnetUpsampleBlock2D(
863
+ in_channels=320,
864
+ prev_output_channel=1024,
865
+ out_channels=640,
866
+ temb_channels=1280,
867
+ num_layers=4,
868
+ add_upsample=True,
869
+ resnet_time_scale_shift="scale_shift",
870
+ resnet_eps=1e-5,
871
+ )
872
+
873
+ up_block_three.load_state_dict(up_block_three_sd_new)
874
+
875
+ print("UP BLOCK FOUR")
876
+
877
+ up_block_four_sd_orig = model.up[-4].state_dict()
878
+ up_block_four_sd_new = {}
879
+
880
+ for i in range(4):
881
+ up_block_four_sd_new[f"resnets.{i}.norm1.weight"] = up_block_four_sd_orig.pop(f"{i}.gn_1.weight")
882
+ up_block_four_sd_new[f"resnets.{i}.norm1.bias"] = up_block_four_sd_orig.pop(f"{i}.gn_1.bias")
883
+ up_block_four_sd_new[f"resnets.{i}.conv1.weight"] = up_block_four_sd_orig.pop(f"{i}.f_1.weight")
884
+ up_block_four_sd_new[f"resnets.{i}.conv1.bias"] = up_block_four_sd_orig.pop(f"{i}.f_1.bias")
885
+ up_block_four_sd_new[f"resnets.{i}.time_emb_proj.weight"] = up_block_four_sd_orig.pop(f"{i}.f_t.weight")
886
+ up_block_four_sd_new[f"resnets.{i}.time_emb_proj.bias"] = up_block_four_sd_orig.pop(f"{i}.f_t.bias")
887
+ up_block_four_sd_new[f"resnets.{i}.norm2.weight"] = up_block_four_sd_orig.pop(f"{i}.gn_2.weight")
888
+ up_block_four_sd_new[f"resnets.{i}.norm2.bias"] = up_block_four_sd_orig.pop(f"{i}.gn_2.bias")
889
+ up_block_four_sd_new[f"resnets.{i}.conv2.weight"] = up_block_four_sd_orig.pop(f"{i}.f_2.weight")
890
+ up_block_four_sd_new[f"resnets.{i}.conv2.bias"] = up_block_four_sd_orig.pop(f"{i}.f_2.bias")
891
+ up_block_four_sd_new[f"resnets.{i}.conv_shortcut.weight"] = up_block_four_sd_orig.pop(f"{i}.f_s.weight")
892
+ up_block_four_sd_new[f"resnets.{i}.conv_shortcut.bias"] = up_block_four_sd_orig.pop(f"{i}.f_s.bias")
893
+
894
+ assert len(up_block_four_sd_orig) == 0
895
+
896
+ up_block_four = ResnetUpsampleBlock2D(
897
+ in_channels=320,
898
+ prev_output_channel=640,
899
+ out_channels=320,
900
+ temb_channels=1280,
901
+ num_layers=4,
902
+ add_upsample=False,
903
+ resnet_time_scale_shift="scale_shift",
904
+ resnet_eps=1e-5,
905
+ )
906
+
907
+ up_block_four.load_state_dict(up_block_four_sd_new)
908
+
909
+ print("initial projection (conv_in)")
910
+
911
+ conv_in_sd_orig = model.embed_image.state_dict()
912
+ conv_in_sd_new = {}
913
+
914
+ conv_in_sd_new["weight"] = conv_in_sd_orig.pop("f.weight")
915
+ conv_in_sd_new["bias"] = conv_in_sd_orig.pop("f.bias")
916
+
917
+ assert len(conv_in_sd_orig) == 0
918
+
919
+ block_out_channels = [320, 640, 1024, 1024]
920
+
921
+ in_channels = 7
922
+ conv_in_kernel = 3
923
+ conv_in_padding = (conv_in_kernel - 1) // 2
924
+ conv_in = nn.Conv2d(in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding)
925
+
926
+ conv_in.load_state_dict(conv_in_sd_new)
927
+
928
+ print("out projection (conv_out) (conv_norm_out)")
929
+ out_channels = 6
930
+ norm_num_groups = 32
931
+ norm_eps = 1e-5
932
+ act_fn = "silu"
933
+ conv_out_kernel = 3
934
+ conv_out_padding = (conv_out_kernel - 1) // 2
935
+ conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps)
936
+ # uses torch.functional in orig
937
+ # conv_act = get_activation(act_fn)
938
+ conv_out = nn.Conv2d(block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding)
939
+
940
+ conv_norm_out.load_state_dict(model.output.gn.state_dict())
941
+ conv_out.load_state_dict(model.output.f.state_dict())
942
+
943
+ print("timestep projection (time_proj) (time_embedding)")
944
+
945
+ f1_sd = model.embed_time.f_1.state_dict()
946
+ f2_sd = model.embed_time.f_2.state_dict()
947
+
948
+ time_embedding_sd = {
949
+ "linear_1.weight": f1_sd.pop("weight"),
950
+ "linear_1.bias": f1_sd.pop("bias"),
951
+ "linear_2.weight": f2_sd.pop("weight"),
952
+ "linear_2.bias": f2_sd.pop("bias"),
953
+ }
954
+
955
+ assert len(f1_sd) == 0
956
+ assert len(f2_sd) == 0
957
+
958
+ time_embedding_type = "learned"
959
+ num_train_timesteps = 1024
960
+ time_embedding_dim = 1280
961
+
962
+ time_proj = nn.Embedding(num_train_timesteps, block_out_channels[0])
963
+ timestep_input_dim = block_out_channels[0]
964
+
965
+ time_embedding = TimestepEmbedding(timestep_input_dim, time_embedding_dim)
966
+
967
+ time_proj.load_state_dict(model.embed_time.emb.state_dict())
968
+ time_embedding.load_state_dict(time_embedding_sd)
969
+
970
+ print("CONVERT")
971
+
972
+ time_embedding.to("cuda")
973
+ time_proj.to("cuda")
974
+ conv_in.to("cuda")
975
+
976
+ block_one.to("cuda")
977
+ block_two.to("cuda")
978
+ block_three.to("cuda")
979
+ block_four.to("cuda")
980
+
981
+ mid_block_one.to("cuda")
982
+
983
+ up_block_one.to("cuda")
984
+ up_block_two.to("cuda")
985
+ up_block_three.to("cuda")
986
+ up_block_four.to("cuda")
987
+
988
+ conv_norm_out.to("cuda")
989
+ conv_out.to("cuda")
990
+
991
+ model.time_proj = time_proj
992
+ model.time_embedding = time_embedding
993
+ model.embed_image = conv_in
994
+
995
+ model.down[0] = block_one
996
+ model.down[1] = block_two
997
+ model.down[2] = block_three
998
+ model.down[3] = block_four
999
+
1000
+ model.mid = mid_block_one
1001
+
1002
+ model.up[-1] = up_block_one
1003
+ model.up[-2] = up_block_two
1004
+ model.up[-3] = up_block_three
1005
+ model.up[-4] = up_block_four
1006
+
1007
+ model.output.gn = conv_norm_out
1008
+ model.output.f = conv_out
1009
+
1010
+ model.converted = True
1011
+
1012
+ sample_consistency_new = decoder_consistency(latent, generator=torch.Generator("cpu").manual_seed(0))
1013
+ save_image(sample_consistency_new, "con_new.png")
1014
+
1015
+ assert (sample_consistency_orig == sample_consistency_new).all()
1016
+
1017
+ print("making unet")
1018
+
1019
+ unet = UNet2DModel(
1020
+ in_channels=in_channels,
1021
+ out_channels=out_channels,
1022
+ down_block_types=(
1023
+ "ResnetDownsampleBlock2D",
1024
+ "ResnetDownsampleBlock2D",
1025
+ "ResnetDownsampleBlock2D",
1026
+ "ResnetDownsampleBlock2D",
1027
+ ),
1028
+ up_block_types=(
1029
+ "ResnetUpsampleBlock2D",
1030
+ "ResnetUpsampleBlock2D",
1031
+ "ResnetUpsampleBlock2D",
1032
+ "ResnetUpsampleBlock2D",
1033
+ ),
1034
+ block_out_channels=block_out_channels,
1035
+ layers_per_block=3,
1036
+ norm_num_groups=norm_num_groups,
1037
+ norm_eps=norm_eps,
1038
+ resnet_time_scale_shift="scale_shift",
1039
+ time_embedding_type="learned",
1040
+ num_train_timesteps=num_train_timesteps,
1041
+ add_attention=False,
1042
+ )
1043
+
1044
unet_state_dict = {}


def add_state_dict(prefix, mod):
    """Flatten ``mod``'s parameters into ``unet_state_dict`` under ``prefix``."""
    for param_name, tensor in mod.state_dict().items():
        unet_state_dict[f"{prefix}.{param_name}"] = tensor
1050
+
1051
+
1052
+ add_state_dict("conv_in", conv_in)
1053
+ add_state_dict("time_proj", time_proj)
1054
+ add_state_dict("time_embedding", time_embedding)
1055
+ add_state_dict("down_blocks.0", block_one)
1056
+ add_state_dict("down_blocks.1", block_two)
1057
+ add_state_dict("down_blocks.2", block_three)
1058
+ add_state_dict("down_blocks.3", block_four)
1059
+ add_state_dict("mid_block", mid_block_one)
1060
+ add_state_dict("up_blocks.0", up_block_one)
1061
+ add_state_dict("up_blocks.1", up_block_two)
1062
+ add_state_dict("up_blocks.2", up_block_three)
1063
+ add_state_dict("up_blocks.3", up_block_four)
1064
+ add_state_dict("conv_norm_out", conv_norm_out)
1065
+ add_state_dict("conv_out", conv_out)
1066
+
1067
+ unet.load_state_dict(unet_state_dict)
1068
+
1069
+ print("running with diffusers unet")
1070
+
1071
+ unet.to("cuda")
1072
+
1073
+ decoder_consistency.ckpt = unet
1074
+
1075
+ sample_consistency_new_2 = decoder_consistency(latent, generator=torch.Generator("cpu").manual_seed(0))
1076
+ save_image(sample_consistency_new_2, "con_new_2.png")
1077
+
1078
+ assert (sample_consistency_orig == sample_consistency_new_2).all()
1079
+
1080
+ print("running with diffusers model")
1081
+
1082
+ Encoder.old_constructor = Encoder.__init__
1083
+
1084
+
1085
def new_constructor(self, **kwargs):
    """Patched ``Encoder.__init__`` that additionally records its kwargs."""
    self.old_constructor(**kwargs)
    # Stash the construction kwargs so the ConsistencyDecoderVAE built later
    # in this script can reuse the exact encoder configuration.
    self.constructor_arguments = kwargs
1088
+
1089
+
1090
+ Encoder.__init__ = new_constructor
1091
+
1092
+
1093
+ vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")
1094
+ consistency_vae = ConsistencyDecoderVAE(
1095
+ encoder_args=vae.encoder.constructor_arguments,
1096
+ decoder_args=unet.config,
1097
+ scaling_factor=vae.config.scaling_factor,
1098
+ block_out_channels=vae.config.block_out_channels,
1099
+ latent_channels=vae.config.latent_channels,
1100
+ )
1101
+ consistency_vae.encoder.load_state_dict(vae.encoder.state_dict())
1102
+ consistency_vae.quant_conv.load_state_dict(vae.quant_conv.state_dict())
1103
+ consistency_vae.decoder_unet.load_state_dict(unet.state_dict())
1104
+
1105
+ consistency_vae.to(dtype=torch.float16, device="cuda")
1106
+
1107
+ sample_consistency_new_3 = consistency_vae.decode(
1108
+ 0.18215 * latent, generator=torch.Generator("cpu").manual_seed(0)
1109
+ ).sample
1110
+
1111
+ print("max difference")
1112
+ print((sample_consistency_orig - sample_consistency_new_3).abs().max())
1113
+ print("total difference")
1114
+ print((sample_consistency_orig - sample_consistency_new_3).abs().sum())
1115
+ # assert (sample_consistency_orig == sample_consistency_new_3).all()
1116
+
1117
+ print("running with diffusers pipeline")
1118
+
1119
+ pipe = DiffusionPipeline.from_pretrained(
1120
+ "runwayml/stable-diffusion-v1-5", vae=consistency_vae, torch_dtype=torch.float16
1121
+ )
1122
+ pipe.to("cuda")
1123
+
1124
+ pipe("horse", generator=torch.Generator("cpu").manual_seed(0)).images[0].save("horse.png")
1125
+
1126
+
1127
+ if args.save_pretrained is not None:
1128
+ consistency_vae.save_pretrained(args.save_pretrained)
diffusers/scripts/convert_dance_diffusion_to_diffusers.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import math
4
+ import os
5
+ from copy import deepcopy
6
+
7
+ import requests
8
+ import torch
9
+ from audio_diffusion.models import DiffusionAttnUnet1D
10
+ from diffusion import sampling
11
+ from torch import nn
12
+
13
+ from diffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel
14
+ from diffusers.utils.constants import DIFFUSERS_REQUEST_TIMEOUT
15
+
16
+
17
# Official Dance Diffusion checkpoints: download URL plus the audio sample
# rate (Hz) and sample length (frames) each model was trained with.
MODELS_MAP = {
    "gwf-440k": {
        "url": "https://model-server.zqevans2.workers.dev/gwf-440k.ckpt",
        "sample_rate": 48000,
        "sample_size": 65536,
    },
    "jmann-small-190k": {
        "url": "https://model-server.zqevans2.workers.dev/jmann-small-190k.ckpt",
        "sample_rate": 48000,
        "sample_size": 65536,
    },
    "jmann-large-580k": {
        "url": "https://model-server.zqevans2.workers.dev/jmann-large-580k.ckpt",
        "sample_rate": 48000,
        "sample_size": 131072,
    },
    "maestro-uncond-150k": {
        "url": "https://model-server.zqevans2.workers.dev/maestro-uncond-150k.ckpt",
        "sample_rate": 16000,
        "sample_size": 65536,
    },
    "unlocked-uncond-250k": {
        "url": "https://model-server.zqevans2.workers.dev/unlocked-uncond-250k.ckpt",
        "sample_rate": 16000,
        "sample_size": 65536,
    },
    "honk-140k": {
        "url": "https://model-server.zqevans2.workers.dev/honk-140k.ckpt",
        "sample_rate": 16000,
        "sample_size": 65536,
    },
}
49
+
50
+
51
def alpha_sigma_to_t(alpha, sigma):
    """Return the continuous timestep for clean-signal scale ``alpha`` and noise scale ``sigma``.

    The timestep is the angle of the point (alpha, sigma) on the unit
    quarter-circle, normalized so that (1, 0) -> 0 and (0, 1) -> 1.
    """
    angle = torch.atan2(sigma, alpha)
    return angle / math.pi * 2


def get_crash_schedule(t):
    """Warp uniform timesteps ``t`` onto the 'crash' noise schedule."""
    noise_level = torch.sin(t * math.pi / 2) ** 2
    signal_level = (1 - noise_level**2) ** 0.5
    return alpha_sigma_to_t(signal_level, noise_level)
61
+
62
+
63
class Object:
    """Bare attribute container used as an ad-hoc config namespace."""
65
+
66
+
67
class DiffusionUncond(nn.Module):
    """Shell matching the original audio-diffusion training module layout.

    Only used so the checkpoint's ``state_dict`` keys (``diffusion.*`` /
    ``diffusion_ema.*``) line up when loading; the EMA copy is what gets
    converted downstream.
    """

    def __init__(self, global_args):
        super().__init__()

        self.diffusion = DiffusionAttnUnet1D(global_args, n_attn_layers=4)
        self.diffusion_ema = deepcopy(self.diffusion)
        # Mirrors the original training code's sampler state; unused during conversion.
        self.rng = torch.quasirandom.SobolEngine(1, scramble=True)
74
+
75
+
76
def download(model_name):
    """Download an official checkpoint to ``./{model_name}.ckpt`` and return the path.

    Raises:
        KeyError: if ``model_name`` is not an official name in ``MODELS_MAP``.
        requests.HTTPError: if the server responds with an error status.
    """
    url = MODELS_MAP[model_name]["url"]
    local_filename = f"./{model_name}.ckpt"

    # Stream in chunks so large checkpoints are not buffered in memory; the
    # `with` block guarantees the connection is released.
    with requests.get(url, stream=True, timeout=DIFFUSERS_REQUEST_TIMEOUT) as r:
        # Fail fast instead of silently writing an HTML error page into the .ckpt.
        r.raise_for_status()
        with open(local_filename, "wb") as fp:
            for chunk in r.iter_content(chunk_size=8192):
                fp.write(chunk)

    return local_filename
86
+
87
+
88
# The original model stores each U-Net level as a flat sequential container;
# these tables map a child's positional index (as a string) to the diffusers
# sub-module name at the same position.

# Down-path children 1-6: alternating resnet / attention pairs.
DOWN_NUM_TO_LAYER = {
    "1": "resnets.0",
    "2": "attentions.0",
    "3": "resnets.1",
    "4": "attentions.1",
    "5": "resnets.2",
    "6": "attentions.2",
}
# Up-path children 8-13 (index 7 is the nested deeper level, which rename()
# strips via the "main.7." prefix).
UP_NUM_TO_LAYER = {
    "8": "resnets.0",
    "9": "attentions.0",
    "10": "resnets.1",
    "11": "attentions.1",
    "12": "resnets.2",
    "13": "attentions.2",
}
# Innermost (mid) block: six resnet/attention pairs; index 7 is absent here too.
MID_NUM_TO_LAYER = {
    "1": "resnets.0",
    "2": "attentions.0",
    "3": "resnets.1",
    "4": "attentions.1",
    "5": "resnets.2",
    "6": "attentions.2",
    "8": "resnets.3",
    "9": "attentions.3",
    "10": "resnets.4",
    "11": "attentions.4",
    "12": "resnets.5",
    "13": "attentions.5",
}
# Outermost level (depth 0): indices 0-2 map to down_blocks.0 resnets and
# 4-6 to the last up block's resnets (see the prefix selection in rename()).
DEPTH_0_TO_LAYER = {
    "0": "resnets.0",
    "1": "resnets.1",
    "2": "resnets.2",
    "4": "resnets.0",
    "5": "resnets.1",
    "6": "resnets.2",
}
126
+
127
# ResConvBlock child name -> diffusers parameter name.
RES_CONV_MAP = {
    "skip": "conv_skip",
    "main.0": "conv_1",
    "main.1": "group_norm_1",
    "main.3": "conv_2",
    "main.4": "group_norm_2",
}

# Attention child name -> diffusers name(s); a list marks a fused tensor that
# must be split across several target parameters.
ATTN_MAP = {
    "norm": "group_norm",
    "qkv_proj": ["query", "key", "value"],
    "out_proj": ["proj_attn"],
}


def convert_resconv_naming(name):
    """Translate a ResConvBlock parameter name into its diffusers equivalent."""
    if name.startswith("skip"):
        return name.replace("skip", RES_CONV_MAP["skip"])

    # Anything else must look like "main.<digit>...."
    if not name.startswith("main."):
        raise ValueError(f"ResConvBlock error with {name}")

    head = name[:6]
    return name.replace(head, RES_CONV_MAP[head])


def convert_attn_naming(name):
    """Translate an attention parameter name; fused qkv yields a list of names."""
    for old, new in ATTN_MAP.items():
        if not name.startswith(old):
            continue
        if isinstance(new, list):
            return [name.replace(old, replacement) for replacement in new]
        return name.replace(old, new)
    raise ValueError(f"Attn error with {name}")
160
+
161
+
162
def rename(input_string, max_depth=13):
    """Map an original audio-diffusion state_dict key to its diffusers key.

    ``max_depth`` is the nesting depth at which the innermost ("mid") block
    sits; the original model descends one level per ``main.7.`` prefix.
    Returns a string, or a list of strings for fused qkv tensors.
    """
    string = input_string

    if string.split(".")[0] == "timestep_embed":
        return string.replace("timestep_embed", "time_proj")

    depth = 0
    if string.startswith("net.3."):
        # "net.3" is already one level inside the outer wrapper.
        depth += 1
        string = string[6:]
    elif string.startswith("net."):
        string = string[4:]

    # Each "main.7." prefix descends one U-Net level.
    while string.startswith("main.7."):
        depth += 1
        string = string[7:]

    if string.startswith("main."):
        string = string[5:]

    # mid block
    # What remains starts with the child's positional index (1 or 2 digits).
    if string[:2].isdigit():
        layer_num = string[:2]
        string_left = string[2:]
    else:
        layer_num = string[0]
        string_left = string[1:]

    if depth == max_depth:
        new_layer = MID_NUM_TO_LAYER[layer_num]
        prefix = "mid_block"
    elif depth > 0 and int(layer_num) < 7:
        new_layer = DOWN_NUM_TO_LAYER[layer_num]
        prefix = f"down_blocks.{depth}"
    elif depth > 0 and int(layer_num) > 7:
        new_layer = UP_NUM_TO_LAYER[layer_num]
        prefix = f"up_blocks.{max_depth - depth - 1}"
    elif depth == 0:
        new_layer = DEPTH_0_TO_LAYER[layer_num]
        prefix = f"up_blocks.{max_depth - 1}" if int(layer_num) > 3 else "down_blocks.0"
    # NOTE(review): depth > 0 with layer_num == 7 falls through all branches and
    # would raise UnboundLocalError on new_layer below; presumably such keys
    # never occur in real checkpoints — confirm.

    if not string_left.startswith("."):
        raise ValueError(f"Naming error with {input_string} and string_left: {string_left}.")

    string_left = string_left[1:]

    if "resnets" in new_layer:
        string_left = convert_resconv_naming(string_left)
    elif "attentions" in new_layer:
        new_string_left = convert_attn_naming(string_left)
        string_left = new_string_left

    # Fused qkv weights expand into a list of target keys.
    if not isinstance(string_left, list):
        new_string = prefix + "." + new_layer + "." + string_left
    else:
        new_string = [prefix + "." + new_layer + "." + s for s in string_left]
    return new_string
219
+
220
+
221
def rename_orig_weights(state_dict):
    """Return a new state_dict with every key renamed to the diffusers layout."""
    converted = {}
    for old_key, tensor in state_dict.items():
        # Up-/downsample "kernel" entries carry no trainable weights; drop them.
        if old_key.endswith("kernel"):
            continue

        target = rename(old_key)

        if isinstance(target, list):
            # Fused qkv conv weight: split it across the q/k/v linear layers.
            converted = transform_conv_attns(converted, target, tensor)
        else:
            converted[target] = tensor

    return converted
237
+
238
+
239
+ def transform_conv_attns(new_state_dict, new_k, v):
240
+ if len(new_k) == 1:
241
+ if len(v.shape) == 3:
242
+ # weight
243
+ new_state_dict[new_k[0]] = v[:, :, 0]
244
+ else:
245
+ # bias
246
+ new_state_dict[new_k[0]] = v
247
+ else:
248
+ # qkv matrices
249
+ trippled_shape = v.shape[0]
250
+ single_shape = trippled_shape // 3
251
+ for i in range(3):
252
+ if len(v.shape) == 3:
253
+ new_state_dict[new_k[i]] = v[i * single_shape : (i + 1) * single_shape, :, 0]
254
+ else:
255
+ new_state_dict[new_k[i]] = v[i * single_shape : (i + 1) * single_shape]
256
+ return new_state_dict
257
+
258
+
259
def main(args):
    """Convert a Dance Diffusion checkpoint to diffusers and verify outputs match."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Accept either a local .ckpt path or one of the official names in MODELS_MAP.
    model_name = args.model_path.split("/")[-1].split(".")[0]
    if not os.path.isfile(args.model_path):
        assert model_name == args.model_path, (
            f"Make sure to provide one of the official model names {MODELS_MAP.keys()}"
        )
        args.model_path = download(model_name)

    sample_rate = MODELS_MAP[model_name]["sample_rate"]
    sample_size = MODELS_MAP[model_name]["sample_size"]

    config = Object()
    config.sample_size = sample_size
    config.sample_rate = sample_rate
    config.latent_dim = 0

    diffusers_model = UNet1DModel(sample_size=sample_size, sample_rate=sample_rate)
    diffusers_state_dict = diffusers_model.state_dict()

    # Load the original module and convert its EMA weights.
    orig_model = DiffusionUncond(config)
    orig_model.load_state_dict(torch.load(args.model_path, map_location=device)["state_dict"])
    orig_model = orig_model.diffusion_ema.eval()
    orig_model_state_dict = orig_model.state_dict()
    renamed_state_dict = rename_orig_weights(orig_model_state_dict)

    # Every renamed key must exist in the diffusers model; the only diffusers
    # keys allowed to be missing from the rename are the weight-free "kernel"
    # buffers.
    renamed_minus_diffusers = set(renamed_state_dict.keys()) - set(diffusers_state_dict.keys())
    diffusers_minus_renamed = set(diffusers_state_dict.keys()) - set(renamed_state_dict.keys())

    assert len(renamed_minus_diffusers) == 0, f"Problem with {renamed_minus_diffusers}"
    assert all(k.endswith("kernel") for k in list(diffusers_minus_renamed)), f"Problem with {diffusers_minus_renamed}"

    for key, value in renamed_state_dict.items():
        assert diffusers_state_dict[key].squeeze().shape == value.squeeze().shape, (
            f"Shape for {key} doesn't match. Diffusers: {diffusers_state_dict[key].shape} vs. {value.shape}"
        )
        if key == "time_proj.weight":
            # time_proj is linear in diffusers; drop the trailing conv kernel dim.
            value = value.squeeze()

        diffusers_state_dict[key] = value

    diffusers_model.load_state_dict(diffusers_state_dict)

    steps = 100
    seed = 33

    diffusers_scheduler = IPNDMScheduler(num_train_timesteps=steps)

    generator = torch.manual_seed(seed)
    noise = torch.randn([1, 2, config.sample_size], generator=generator).to(device)

    t = torch.linspace(1, 0, steps + 1, device=device)[:-1]
    step_list = get_crash_schedule(t)

    pipe = DanceDiffusionPipeline(unet=diffusers_model, scheduler=diffusers_scheduler)

    # Re-seed so both samplers start from the same noise.
    generator = torch.manual_seed(33)
    audio = pipe(num_inference_steps=steps, generator=generator).audios

    # NOTE(review): `noise` is moved to `device` but `orig_model` is not; on a
    # CUDA machine this looks like a device mismatch — confirm.
    generated = sampling.iplms_sample(orig_model, noise, step_list, {})
    generated = generated.clamp(-1, 1)

    diff_sum = (generated - audio).abs().sum()
    diff_max = (generated - audio).abs().max()

    if args.save:
        pipe.save_pretrained(args.checkpoint_path)

    print("Diff sum", diff_sum)
    print("Diff max", diff_max)

    # Guard the conversion: outputs must agree elementwise to ~1e-3.
    assert diff_max < 1e-3, f"Diff max: {diff_max} is too much :-/"

    print(f"Conversion for {model_name} successful!")
334
+
335
+
336
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model to convert.")
    parser.add_argument(
        "--save",
        default=True,
        # `type=bool` is an argparse pitfall: bool("False") is True, so any
        # non-empty value (including "False") enabled saving. Parse the
        # string explicitly so `--save False` actually disables saving.
        type=lambda s: str(s).lower() not in ("false", "0", "no"),
        required=False,
        help="Whether to save the converted model or not.",
    )
    parser.add_argument("--checkpoint_path", default=None, type=str, required=True, help="Path to the output model.")
    args = parser.parse_args()

    main(args)
diffusers/scripts/convert_dcae_to_diffusers.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from typing import Any, Dict
3
+
4
+ import torch
5
+ from huggingface_hub import hf_hub_download
6
+ from safetensors.torch import load_file
7
+
8
+ from diffusers import AutoencoderDC
9
+
10
+
11
def remap_qkv_(key: str, state_dict: Dict[str, Any]):
    """Split a fused qkv conv weight into to_q/to_k/to_v linear weights (in place)."""
    parent_module, _, _ = key.rpartition(".qkv.conv.weight")
    fused = state_dict.pop(key)
    # Equal thirds along dim 0; squeeze drops the 1x1 conv kernel dims.
    for proj_name, chunk in zip(("to_q", "to_k", "to_v"), torch.chunk(fused, 3, dim=0)):
        state_dict[f"{parent_module}.{proj_name}.weight"] = chunk.squeeze()
18
+
19
+
20
def remap_proj_conv_(key: str, state_dict: Dict[str, Any]):
    """Convert an output-projection 1x1 conv weight to linear layout (in place)."""
    parent_module = key.rpartition(".proj.conv.weight")[0]
    state_dict[f"{parent_module}.to_out.weight"] = state_dict.pop(key).squeeze()
23
+
24
+
25
# Substring renames applied to every checkpoint key (first pass in convert_ae).
AE_KEYS_RENAME_DICT = {
    # common
    "main.": "",
    "op_list.": "",
    "context_module": "attn",
    "local_module": "conv_out",
    # NOTE: The below two lines work because scales in the available configs only have a tuple length of 1
    # If there were more scales, there would be more layers, so a loop would be better to handle this
    "aggreg.0.0": "to_qkv_multiscale.0.proj_in",
    "aggreg.0.1": "to_qkv_multiscale.0.proj_out",
    "depth_conv.conv": "conv_depth",
    "inverted_conv.conv": "conv_inverted",
    "point_conv.conv": "conv_point",
    "point_conv.norm": "norm",
    "conv.conv.": "conv.",
    "conv1.conv": "conv1",
    "conv2.conv": "conv2",
    "conv2.norm": "norm",
    "proj.norm": "norm_out",
    # encoder
    "encoder.project_in.conv": "encoder.conv_in",
    "encoder.project_out.0.conv": "encoder.conv_out",
    "encoder.stages": "encoder.down_blocks",
    # decoder
    "decoder.project_in.conv": "decoder.conv_in",
    "decoder.project_out.0": "decoder.norm_out",
    "decoder.project_out.2.conv": "decoder.conv_out",
    "decoder.stages": "decoder.up_blocks",
}

# Per-variant key overrides. NOTE(review): the three dicts below are identical
# and are not referenced anywhere in the visible part of this script —
# presumably consumed by per-config branches of get_ae_config; confirm.
AE_F32C32_KEYS = {
    # encoder
    "encoder.project_in.conv": "encoder.conv_in.conv",
    # decoder
    "decoder.project_out.2.conv": "decoder.conv_out.conv",
}

AE_F64C128_KEYS = {
    # encoder
    "encoder.project_in.conv": "encoder.conv_in.conv",
    # decoder
    "decoder.project_out.2.conv": "decoder.conv_out.conv",
}

AE_F128C512_KEYS = {
    # encoder
    "encoder.project_in.conv": "encoder.conv_in.conv",
    # decoder
    "decoder.project_out.2.conv": "decoder.conv_out.conv",
}

# Keys matched by substring and rewritten structurally, in place
# (second pass in convert_ae).
AE_SPECIAL_KEYS_REMAP = {
    "qkv.conv.weight": remap_qkv_,
    "proj.conv.weight": remap_proj_conv_,
}
80
+
81
+
82
def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
    """Unwrap common checkpoint containers (``model`` / ``module`` / ``state_dict``).

    Fix: the original tested membership on the *outer* ``saved_dict`` while
    indexing the already-unwrapped ``state_dict``, so nested containers such
    as ``{"model": {"state_dict": ...}}`` were never fully unwrapped. Flat
    checkpoints behave exactly as before.
    """
    state_dict = saved_dict
    if "model" in state_dict.keys():
        state_dict = state_dict["model"]
    if "module" in state_dict.keys():
        state_dict = state_dict["module"]
    if "state_dict" in state_dict.keys():
        state_dict = state_dict["state_dict"]
    return state_dict
91
+
92
+
93
def update_state_dict_(state_dict: Dict[str, Any], old_key: str, new_key: str) -> None:
    """Rename ``old_key`` to ``new_key`` in ``state_dict``, mutating it in place.

    FIX: the return annotation previously claimed ``Dict[str, Any]`` although the
    function returns nothing — the trailing underscore marks it as in-place.
    """
    state_dict[new_key] = state_dict.pop(old_key)
95
+
96
+
97
def convert_ae(config_name: str, dtype: torch.dtype):
    """Download the original DC-AE checkpoint, remap its keys, and load it into a
    diffusers `AutoencoderDC` instance cast to ``dtype``."""
    config = get_ae_config(config_name)
    ckpt_path = hf_hub_download(f"mit-han-lab/{config_name}", "model.safetensors")
    original_state_dict = get_state_dict(load_file(ckpt_path))

    ae = AutoencoderDC(**config).to(dtype=dtype)

    # First pass: substring-based renaming of every key (in rename-map order).
    for old_key in list(original_state_dict.keys()):
        renamed = old_key
        for src, dst in AE_KEYS_RENAME_DICT.items():
            renamed = renamed.replace(src, dst)
        update_state_dict_(original_state_dict, old_key, renamed)

    # Second pass: special handlers mutate matching tensors in place.
    for key in list(original_state_dict.keys()):
        for pattern, handler_fn_inplace in AE_SPECIAL_KEYS_REMAP.items():
            if pattern in key:
                handler_fn_inplace(key, original_state_dict)

    ae.load_state_dict(original_state_dict, strict=True)
    return ae
119
+
120
+
121
def get_ae_config(name: str):
    """Return the `AutoencoderDC` constructor kwargs for a named DC-AE checkpoint.

    NOTE(review): for the non-SANA variants this also mutates the module-level
    `AE_KEYS_RENAME_DICT` in place with variant-specific key overrides, so the
    rename map used by `convert_ae` depends on which config was requested last.
    Raises ValueError for unknown names.
    """
    if name in ["dc-ae-f32c32-sana-1.0"]:
        config = {
            "latent_channels": 32,
            "encoder_block_types": (
                "ResBlock",
                "ResBlock",
                "ResBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
            ),
            "decoder_block_types": (
                "ResBlock",
                "ResBlock",
                "ResBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
            ),
            "encoder_block_out_channels": (128, 256, 512, 512, 1024, 1024),
            "decoder_block_out_channels": (128, 256, 512, 512, 1024, 1024),
            "encoder_qkv_multiscales": ((), (), (), (5,), (5,), (5,)),
            "decoder_qkv_multiscales": ((), (), (), (5,), (5,), (5,)),
            "encoder_layers_per_block": (2, 2, 2, 3, 3, 3),
            "decoder_layers_per_block": [3, 3, 3, 3, 3, 3],
            "downsample_block_type": "conv",
            "upsample_block_type": "interpolate",
            "decoder_norm_types": "rms_norm",
            "decoder_act_fns": "silu",
            "scaling_factor": 0.41407,
        }
    elif name in ["dc-ae-f32c32-in-1.0", "dc-ae-f32c32-mix-1.0"]:
        # These variants keep an extra `.conv` level on in/out projections.
        AE_KEYS_RENAME_DICT.update(AE_F32C32_KEYS)
        config = {
            "latent_channels": 32,
            "encoder_block_types": [
                "ResBlock",
                "ResBlock",
                "ResBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
            ],
            "decoder_block_types": [
                "ResBlock",
                "ResBlock",
                "ResBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
            ],
            "encoder_block_out_channels": [128, 256, 512, 512, 1024, 1024],
            "decoder_block_out_channels": [128, 256, 512, 512, 1024, 1024],
            "encoder_layers_per_block": [0, 4, 8, 2, 2, 2],
            "decoder_layers_per_block": [0, 5, 10, 2, 2, 2],
            "encoder_qkv_multiscales": ((), (), (), (), (), ()),
            "decoder_qkv_multiscales": ((), (), (), (), (), ()),
            "decoder_norm_types": ["batch_norm", "batch_norm", "batch_norm", "rms_norm", "rms_norm", "rms_norm"],
            "decoder_act_fns": ["relu", "relu", "relu", "silu", "silu", "silu"],
        }
        if name == "dc-ae-f32c32-in-1.0":
            config["scaling_factor"] = 0.3189
        elif name == "dc-ae-f32c32-mix-1.0":
            config["scaling_factor"] = 0.4552
    elif name in ["dc-ae-f64c128-in-1.0", "dc-ae-f64c128-mix-1.0"]:
        AE_KEYS_RENAME_DICT.update(AE_F64C128_KEYS)
        config = {
            "latent_channels": 128,
            "encoder_block_types": [
                "ResBlock",
                "ResBlock",
                "ResBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
            ],
            "decoder_block_types": [
                "ResBlock",
                "ResBlock",
                "ResBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
            ],
            "encoder_block_out_channels": [128, 256, 512, 512, 1024, 1024, 2048],
            "decoder_block_out_channels": [128, 256, 512, 512, 1024, 1024, 2048],
            "encoder_layers_per_block": [0, 4, 8, 2, 2, 2, 2],
            "decoder_layers_per_block": [0, 5, 10, 2, 2, 2, 2],
            "encoder_qkv_multiscales": ((), (), (), (), (), (), ()),
            "decoder_qkv_multiscales": ((), (), (), (), (), (), ()),
            "decoder_norm_types": [
                "batch_norm",
                "batch_norm",
                "batch_norm",
                "rms_norm",
                "rms_norm",
                "rms_norm",
                "rms_norm",
            ],
            "decoder_act_fns": ["relu", "relu", "relu", "silu", "silu", "silu", "silu"],
        }
        if name == "dc-ae-f64c128-in-1.0":
            config["scaling_factor"] = 0.2889
        elif name == "dc-ae-f64c128-mix-1.0":
            config["scaling_factor"] = 0.4538
    elif name in ["dc-ae-f128c512-in-1.0", "dc-ae-f128c512-mix-1.0"]:
        AE_KEYS_RENAME_DICT.update(AE_F128C512_KEYS)
        config = {
            "latent_channels": 512,
            "encoder_block_types": [
                "ResBlock",
                "ResBlock",
                "ResBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
            ],
            "decoder_block_types": [
                "ResBlock",
                "ResBlock",
                "ResBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
            ],
            "encoder_block_out_channels": [128, 256, 512, 512, 1024, 1024, 2048, 2048],
            "decoder_block_out_channels": [128, 256, 512, 512, 1024, 1024, 2048, 2048],
            "encoder_layers_per_block": [0, 4, 8, 2, 2, 2, 2, 2],
            "decoder_layers_per_block": [0, 5, 10, 2, 2, 2, 2, 2],
            "encoder_qkv_multiscales": ((), (), (), (), (), (), (), ()),
            "decoder_qkv_multiscales": ((), (), (), (), (), (), (), ()),
            "decoder_norm_types": [
                "batch_norm",
                "batch_norm",
                "batch_norm",
                "rms_norm",
                "rms_norm",
                "rms_norm",
                "rms_norm",
                "rms_norm",
            ],
            "decoder_act_fns": ["relu", "relu", "relu", "silu", "silu", "silu", "silu", "silu"],
        }
        if name == "dc-ae-f128c512-in-1.0":
            config["scaling_factor"] = 0.4883
        elif name == "dc-ae-f128c512-mix-1.0":
            config["scaling_factor"] = 0.3620
    else:
        raise ValueError("Invalid config name provided.")

    return config
279
+
280
+
281
def get_args():
    """Parse the command-line arguments for the DC-AE conversion script."""
    arg_parser = argparse.ArgumentParser()
    supported_configs = [
        "dc-ae-f32c32-sana-1.0",
        "dc-ae-f32c32-in-1.0",
        "dc-ae-f32c32-mix-1.0",
        "dc-ae-f64c128-in-1.0",
        "dc-ae-f64c128-mix-1.0",
        "dc-ae-f128c512-in-1.0",
        "dc-ae-f128c512-mix-1.0",
    ]
    arg_parser.add_argument(
        "--config_name",
        type=str,
        default="dc-ae-f32c32-sana-1.0",
        choices=supported_configs,
        help="The DCAE checkpoint to convert",
    )
    arg_parser.add_argument(
        "--output_path", type=str, required=True, help="Path where converted model should be saved"
    )
    arg_parser.add_argument("--dtype", default="fp32", help="Torch dtype to save the model in.")
    return arg_parser.parse_args()
301
+
302
+
303
# Maps the `--dtype` CLI string to the torch dtype used for the converted weights.
DTYPE_MAPPING = {
    "fp32": torch.float32,
    "fp16": torch.float16,
    "bf16": torch.bfloat16,
}

# Maps the `--dtype` CLI string to the diffusers weight-variant suffix
# (fp32 is the default, unsuffixed variant, hence None).
VARIANT_MAPPING = {
    "fp32": None,
    "fp16": "fp16",
    "bf16": "bf16",
}


if __name__ == "__main__":
    args = get_args()

    dtype = DTYPE_MAPPING[args.dtype]
    variant = VARIANT_MAPPING[args.dtype]

    # Convert the original checkpoint and save it in diffusers (safetensors) format.
    ae = convert_ae(args.config_name, dtype)
    ae.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB", variant=variant)
diffusers/scripts/convert_diffusers_sdxl_lora_to_webui.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Script for converting a Hugging Face Diffusers trained SDXL LoRAs to Kohya format
2
+ # This means that you can input your diffusers-trained LoRAs and
3
+ # Get the output to work with WebUIs such as AUTOMATIC1111, ComfyUI, SD.Next and others.
4
+
5
+ # To get started you can find some cool `diffusers` trained LoRAs such as this cute Corgy
6
+ # https://huggingface.co/ignasbud/corgy_dog_LoRA/, download its `pytorch_lora_weights.safetensors` file
7
+ # and run the script:
8
+ # python convert_diffusers_sdxl_lora_to_webui.py --input_lora pytorch_lora_weights.safetensors --output_lora corgy.safetensors
9
+ # now you can use corgy.safetensors in your WebUI of choice!
10
+
11
+ # To train your own, here are some diffusers training scripts and utils that you can use and then convert:
12
+ # LoRA Ease - no code SDXL Dreambooth LoRA trainer: https://huggingface.co/spaces/multimodalart/lora-ease
13
+ # Dreambooth Advanced Training Script - state of the art techniques such as pivotal tuning and prodigy optimizer:
14
+ # - Script: https://github.com/huggingface/diffusers/blob/main/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
15
+ # - Colab (only on Pro): https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_Dreambooth_LoRA_advanced_example.ipynb
16
+ # Canonical diffusers training scripts:
17
+ # - Script: https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_lora_sdxl.py
18
+ # - Colab (runs on free tier): https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_DreamBooth_LoRA_.ipynb
19
+
20
+ import argparse
21
+ import os
22
+
23
+ from safetensors.torch import load_file, save_file
24
+
25
+ from diffusers.utils import convert_all_state_dict_to_peft, convert_state_dict_to_kohya
26
+
27
+
28
def convert_and_save(input_lora, output_lora=None):
    """Convert a diffusers-format SDXL LoRA safetensors file to Kohya/WebUI format.

    When ``output_lora`` is None, the result is written next to the input with a
    ``_webui`` suffix.
    """
    if output_lora is None:
        stem = os.path.splitext(input_lora)[0]
        output_lora = f"{stem}_webui.safetensors"

    # diffusers -> PEFT -> Kohya, then write the converted weights out.
    state_dict = load_file(input_lora)
    state_dict = convert_all_state_dict_to_peft(state_dict)
    state_dict = convert_state_dict_to_kohya(state_dict)
    save_file(state_dict, output_lora)
37
+
38
+
39
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert LoRA model to PEFT and then to Kohya format.")
    parser.add_argument(
        "--input_lora",
        type=str,
        required=True,
        help="Path to the input LoRA model file in the diffusers format.",
    )
    parser.add_argument(
        "--output_lora",
        type=str,
        required=False,
        help="Path for the converted LoRA (safetensors format for AUTOMATIC1111, ComfyUI, etc.). Optional, defaults to input name with a _webui suffix.",
    )

    args = parser.parse_args()

    # `output_lora` may be None; convert_and_save then derives a `_webui` name.
    convert_and_save(args.input_lora, args.output_lora)
diffusers/scripts/convert_flux_xlabs_ipadapter_to_diffusers.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
from contextlib import nullcontext

import safetensors.torch
from accelerate import init_empty_weights
from huggingface_hub import hf_hub_download

from diffusers.utils.import_utils import is_accelerate_available, is_transformers_available


# The CLIP image encoder is only exported when transformers is installed.
if is_transformers_available():
    from transformers import CLIPVisionModelWithProjection

    vision = True
else:
    vision = False

"""
python scripts/convert_flux_xlabs_ipadapter_to_diffusers.py \
--original_state_dict_repo_id "XLabs-AI/flux-ip-adapter" \
--filename "flux-ip-adapter.safetensors" \
--output_path "flux-ip-adapter-hf/"
"""


# FIX: `is_accelerate_available` is a function; the previous code tested the
# function object itself (always truthy), so `nullcontext` could never be
# selected. Call it to actually check availability.
CTX = init_empty_weights if is_accelerate_available() else nullcontext

parser = argparse.ArgumentParser()
parser.add_argument("--original_state_dict_repo_id", default=None, type=str)
parser.add_argument("--filename", default="flux.safetensors", type=str)
parser.add_argument("--checkpoint_path", default=None, type=str)
parser.add_argument("--output_path", type=str)
parser.add_argument("--vision_pretrained_or_path", default="openai/clip-vit-large-patch14", type=str)

args = parser.parse_args()
36
+
37
+
38
def load_original_checkpoint(args):
    """Resolve the checkpoint location (Hub repo or local path) and load its state dict."""
    repo_id = args.original_state_dict_repo_id
    if repo_id is not None:
        ckpt_path = hf_hub_download(repo_id=repo_id, filename=args.filename)
    elif args.checkpoint_path is not None:
        ckpt_path = args.checkpoint_path
    else:
        raise ValueError(" please provide either `original_state_dict_repo_id` or a local `checkpoint_path`")

    return safetensors.torch.load_file(ckpt_path)
48
+
49
+
50
def convert_flux_ipadapter_checkpoint_to_diffusers(original_state_dict, num_layers):
    """Remap an XLabs Flux IP-Adapter state dict to the diffusers key layout.

    Args:
        original_state_dict: mapping of original parameter names to tensors;
            consumed destructively (keys are popped as they are converted).
        num_layers: number of `double_blocks.{i}` transformer blocks to remap.

    Returns:
        A new dict keyed with diffusers-style names.
    """
    converted_state_dict = {}

    # image_proj
    ## norm
    converted_state_dict["image_proj.norm.weight"] = original_state_dict.pop("ip_adapter_proj_model.norm.weight")
    converted_state_dict["image_proj.norm.bias"] = original_state_dict.pop("ip_adapter_proj_model.norm.bias")
    ## proj
    # FIX: previously popped the (already removed) norm tensors again, which
    # raised KeyError and mapped the wrong weights; use the proj tensors.
    converted_state_dict["image_proj.proj.weight"] = original_state_dict.pop("ip_adapter_proj_model.proj.weight")
    converted_state_dict["image_proj.proj.bias"] = original_state_dict.pop("ip_adapter_proj_model.proj.bias")

    # double transformer blocks
    for i in range(num_layers):
        block_prefix = f"ip_adapter.{i}."
        # to_k_ip
        converted_state_dict[f"{block_prefix}to_k_ip.bias"] = original_state_dict.pop(
            f"double_blocks.{i}.processor.ip_adapter_double_stream_k_proj.bias"
        )
        converted_state_dict[f"{block_prefix}to_k_ip.weight"] = original_state_dict.pop(
            f"double_blocks.{i}.processor.ip_adapter_double_stream_k_proj.weight"
        )
        # to_v_ip
        converted_state_dict[f"{block_prefix}to_v_ip.bias"] = original_state_dict.pop(
            f"double_blocks.{i}.processor.ip_adapter_double_stream_v_proj.bias"
        )
        # FIX: the v-projection weight was previously stored under `to_k_ip.weight`,
        # overwriting the k weight and leaving `to_v_ip.weight` unset.
        converted_state_dict[f"{block_prefix}to_v_ip.weight"] = original_state_dict.pop(
            f"double_blocks.{i}.processor.ip_adapter_double_stream_v_proj.weight"
        )

    return converted_state_dict
80
+
81
+
82
def main(args):
    """Convert the XLabs Flux IP-Adapter checkpoint and save it in diffusers layout."""
    original_ckpt = load_original_checkpoint(args)

    # 19 `double_blocks.{i}` entries are remapped; each contributes one
    # IP-Adapter k/v projection pair.
    num_layers = 19
    converted_ip_adapter_state_dict = convert_flux_ipadapter_checkpoint_to_diffusers(original_ckpt, num_layers)

    print("Saving Flux IP-Adapter in Diffusers format.")
    safetensors.torch.save_file(converted_ip_adapter_state_dict, f"{args.output_path}/model.safetensors")

    # Export the CLIP image encoder alongside the adapter when transformers is installed.
    if vision:
        model = CLIPVisionModelWithProjection.from_pretrained(args.vision_pretrained_or_path)
        model.save_pretrained(f"{args.output_path}/image_encoder")


if __name__ == "__main__":
    main(args)
diffusers/scripts/convert_hunyuandit_controlnet_to_diffusers.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ import torch
4
+
5
+ from diffusers import HunyuanDiT2DControlNetModel
6
+
7
+
8
def main(args):
    """Remap a HunyuanDiT ControlNet ``.pt`` state dict to the diffusers
    `HunyuanDiT2DControlNetModel` key layout, load it, and optionally save it.

    FIX: the second half of the KeyError message below was missing its ``f``
    prefix, so ``{state_dict.keys()}`` was printed literally instead of the
    available keys.
    """
    state_dict = torch.load(args.pt_checkpoint_path, map_location="cpu")

    if args.load_key != "none":
        try:
            state_dict = state_dict[args.load_key]
        except KeyError:
            raise KeyError(
                f"{args.load_key} not found in the checkpoint."
                f"Please load from the following keys:{state_dict.keys()}"
            )
    # NOTE(review): hard-coded CUDA device; conversion fails on CPU-only hosts.
    device = "cuda"

    model_config = HunyuanDiT2DControlNetModel.load_config(
        "Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers", subfolder="transformer"
    )
    model_config["use_style_cond_and_image_meta_size"] = (
        args.use_style_cond_and_image_meta_size
    )  ### version <= v1.1: True; version >= v1.2: False
    print(model_config)

    for key in state_dict:
        print("local:", key)

    model = HunyuanDiT2DControlNetModel.from_config(model_config).to(device)

    for key in model.state_dict():
        print("diffusers:", key)

    num_layers = 19
    for i in range(num_layers):
        # attn1
        # Wkqv -> to_q, to_k, to_v
        q, k, v = torch.chunk(state_dict[f"blocks.{i}.attn1.Wqkv.weight"], 3, dim=0)
        q_bias, k_bias, v_bias = torch.chunk(state_dict[f"blocks.{i}.attn1.Wqkv.bias"], 3, dim=0)
        state_dict[f"blocks.{i}.attn1.to_q.weight"] = q
        state_dict[f"blocks.{i}.attn1.to_q.bias"] = q_bias
        state_dict[f"blocks.{i}.attn1.to_k.weight"] = k
        state_dict[f"blocks.{i}.attn1.to_k.bias"] = k_bias
        state_dict[f"blocks.{i}.attn1.to_v.weight"] = v
        state_dict[f"blocks.{i}.attn1.to_v.bias"] = v_bias
        state_dict.pop(f"blocks.{i}.attn1.Wqkv.weight")
        state_dict.pop(f"blocks.{i}.attn1.Wqkv.bias")

        # q_norm, k_norm -> norm_q, norm_k
        state_dict[f"blocks.{i}.attn1.norm_q.weight"] = state_dict[f"blocks.{i}.attn1.q_norm.weight"]
        state_dict[f"blocks.{i}.attn1.norm_q.bias"] = state_dict[f"blocks.{i}.attn1.q_norm.bias"]
        state_dict[f"blocks.{i}.attn1.norm_k.weight"] = state_dict[f"blocks.{i}.attn1.k_norm.weight"]
        state_dict[f"blocks.{i}.attn1.norm_k.bias"] = state_dict[f"blocks.{i}.attn1.k_norm.bias"]

        state_dict.pop(f"blocks.{i}.attn1.q_norm.weight")
        state_dict.pop(f"blocks.{i}.attn1.q_norm.bias")
        state_dict.pop(f"blocks.{i}.attn1.k_norm.weight")
        state_dict.pop(f"blocks.{i}.attn1.k_norm.bias")

        # out_proj -> to_out
        state_dict[f"blocks.{i}.attn1.to_out.0.weight"] = state_dict[f"blocks.{i}.attn1.out_proj.weight"]
        state_dict[f"blocks.{i}.attn1.to_out.0.bias"] = state_dict[f"blocks.{i}.attn1.out_proj.bias"]
        state_dict.pop(f"blocks.{i}.attn1.out_proj.weight")
        state_dict.pop(f"blocks.{i}.attn1.out_proj.bias")

        # attn2
        # kq_proj -> to_k, to_v
        k, v = torch.chunk(state_dict[f"blocks.{i}.attn2.kv_proj.weight"], 2, dim=0)
        k_bias, v_bias = torch.chunk(state_dict[f"blocks.{i}.attn2.kv_proj.bias"], 2, dim=0)
        state_dict[f"blocks.{i}.attn2.to_k.weight"] = k
        state_dict[f"blocks.{i}.attn2.to_k.bias"] = k_bias
        state_dict[f"blocks.{i}.attn2.to_v.weight"] = v
        state_dict[f"blocks.{i}.attn2.to_v.bias"] = v_bias
        state_dict.pop(f"blocks.{i}.attn2.kv_proj.weight")
        state_dict.pop(f"blocks.{i}.attn2.kv_proj.bias")

        # q_proj -> to_q
        state_dict[f"blocks.{i}.attn2.to_q.weight"] = state_dict[f"blocks.{i}.attn2.q_proj.weight"]
        state_dict[f"blocks.{i}.attn2.to_q.bias"] = state_dict[f"blocks.{i}.attn2.q_proj.bias"]
        state_dict.pop(f"blocks.{i}.attn2.q_proj.weight")
        state_dict.pop(f"blocks.{i}.attn2.q_proj.bias")

        # q_norm, k_norm -> norm_q, norm_k
        state_dict[f"blocks.{i}.attn2.norm_q.weight"] = state_dict[f"blocks.{i}.attn2.q_norm.weight"]
        state_dict[f"blocks.{i}.attn2.norm_q.bias"] = state_dict[f"blocks.{i}.attn2.q_norm.bias"]
        state_dict[f"blocks.{i}.attn2.norm_k.weight"] = state_dict[f"blocks.{i}.attn2.k_norm.weight"]
        state_dict[f"blocks.{i}.attn2.norm_k.bias"] = state_dict[f"blocks.{i}.attn2.k_norm.bias"]

        state_dict.pop(f"blocks.{i}.attn2.q_norm.weight")
        state_dict.pop(f"blocks.{i}.attn2.q_norm.bias")
        state_dict.pop(f"blocks.{i}.attn2.k_norm.weight")
        state_dict.pop(f"blocks.{i}.attn2.k_norm.bias")

        # out_proj -> to_out
        state_dict[f"blocks.{i}.attn2.to_out.0.weight"] = state_dict[f"blocks.{i}.attn2.out_proj.weight"]
        state_dict[f"blocks.{i}.attn2.to_out.0.bias"] = state_dict[f"blocks.{i}.attn2.out_proj.bias"]
        state_dict.pop(f"blocks.{i}.attn2.out_proj.weight")
        state_dict.pop(f"blocks.{i}.attn2.out_proj.bias")

        # switch norm 2 and norm 3
        norm2_weight = state_dict[f"blocks.{i}.norm2.weight"]
        norm2_bias = state_dict[f"blocks.{i}.norm2.bias"]
        state_dict[f"blocks.{i}.norm2.weight"] = state_dict[f"blocks.{i}.norm3.weight"]
        state_dict[f"blocks.{i}.norm2.bias"] = state_dict[f"blocks.{i}.norm3.bias"]
        state_dict[f"blocks.{i}.norm3.weight"] = norm2_weight
        state_dict[f"blocks.{i}.norm3.bias"] = norm2_bias

        # norm1 -> norm1.norm
        # default_modulation.1 -> norm1.linear
        state_dict[f"blocks.{i}.norm1.norm.weight"] = state_dict[f"blocks.{i}.norm1.weight"]
        state_dict[f"blocks.{i}.norm1.norm.bias"] = state_dict[f"blocks.{i}.norm1.bias"]
        state_dict[f"blocks.{i}.norm1.linear.weight"] = state_dict[f"blocks.{i}.default_modulation.1.weight"]
        state_dict[f"blocks.{i}.norm1.linear.bias"] = state_dict[f"blocks.{i}.default_modulation.1.bias"]
        state_dict.pop(f"blocks.{i}.norm1.weight")
        state_dict.pop(f"blocks.{i}.norm1.bias")
        state_dict.pop(f"blocks.{i}.default_modulation.1.weight")
        state_dict.pop(f"blocks.{i}.default_modulation.1.bias")

        # mlp.fc1 -> ff.net.0, mlp.fc2 -> ff.net.2
        state_dict[f"blocks.{i}.ff.net.0.proj.weight"] = state_dict[f"blocks.{i}.mlp.fc1.weight"]
        state_dict[f"blocks.{i}.ff.net.0.proj.bias"] = state_dict[f"blocks.{i}.mlp.fc1.bias"]
        state_dict[f"blocks.{i}.ff.net.2.weight"] = state_dict[f"blocks.{i}.mlp.fc2.weight"]
        state_dict[f"blocks.{i}.ff.net.2.bias"] = state_dict[f"blocks.{i}.mlp.fc2.bias"]
        state_dict.pop(f"blocks.{i}.mlp.fc1.weight")
        state_dict.pop(f"blocks.{i}.mlp.fc1.bias")
        state_dict.pop(f"blocks.{i}.mlp.fc2.weight")
        state_dict.pop(f"blocks.{i}.mlp.fc2.bias")

        # after_proj_list -> controlnet_blocks
        state_dict[f"controlnet_blocks.{i}.weight"] = state_dict[f"after_proj_list.{i}.weight"]
        state_dict[f"controlnet_blocks.{i}.bias"] = state_dict[f"after_proj_list.{i}.bias"]
        state_dict.pop(f"after_proj_list.{i}.weight")
        state_dict.pop(f"after_proj_list.{i}.bias")

    # before_proj -> input_block
    state_dict["input_block.weight"] = state_dict["before_proj.weight"]
    state_dict["input_block.bias"] = state_dict["before_proj.bias"]
    state_dict.pop("before_proj.weight")
    state_dict.pop("before_proj.bias")

    # pooler -> time_extra_emb
    state_dict["time_extra_emb.pooler.positional_embedding"] = state_dict["pooler.positional_embedding"]
    state_dict["time_extra_emb.pooler.k_proj.weight"] = state_dict["pooler.k_proj.weight"]
    state_dict["time_extra_emb.pooler.k_proj.bias"] = state_dict["pooler.k_proj.bias"]
    state_dict["time_extra_emb.pooler.q_proj.weight"] = state_dict["pooler.q_proj.weight"]
    state_dict["time_extra_emb.pooler.q_proj.bias"] = state_dict["pooler.q_proj.bias"]
    state_dict["time_extra_emb.pooler.v_proj.weight"] = state_dict["pooler.v_proj.weight"]
    state_dict["time_extra_emb.pooler.v_proj.bias"] = state_dict["pooler.v_proj.bias"]
    state_dict["time_extra_emb.pooler.c_proj.weight"] = state_dict["pooler.c_proj.weight"]
    state_dict["time_extra_emb.pooler.c_proj.bias"] = state_dict["pooler.c_proj.bias"]
    state_dict.pop("pooler.k_proj.weight")
    state_dict.pop("pooler.k_proj.bias")
    state_dict.pop("pooler.q_proj.weight")
    state_dict.pop("pooler.q_proj.bias")
    state_dict.pop("pooler.v_proj.weight")
    state_dict.pop("pooler.v_proj.bias")
    state_dict.pop("pooler.c_proj.weight")
    state_dict.pop("pooler.c_proj.bias")
    state_dict.pop("pooler.positional_embedding")

    # t_embedder -> time_embedding (`TimestepEmbedding`)
    state_dict["time_extra_emb.timestep_embedder.linear_1.bias"] = state_dict["t_embedder.mlp.0.bias"]
    state_dict["time_extra_emb.timestep_embedder.linear_1.weight"] = state_dict["t_embedder.mlp.0.weight"]
    state_dict["time_extra_emb.timestep_embedder.linear_2.bias"] = state_dict["t_embedder.mlp.2.bias"]
    state_dict["time_extra_emb.timestep_embedder.linear_2.weight"] = state_dict["t_embedder.mlp.2.weight"]

    state_dict.pop("t_embedder.mlp.0.bias")
    state_dict.pop("t_embedder.mlp.0.weight")
    state_dict.pop("t_embedder.mlp.2.bias")
    state_dict.pop("t_embedder.mlp.2.weight")

    # x_embedder -> pos_embd (`PatchEmbed`)
    state_dict["pos_embed.proj.weight"] = state_dict["x_embedder.proj.weight"]
    state_dict["pos_embed.proj.bias"] = state_dict["x_embedder.proj.bias"]
    state_dict.pop("x_embedder.proj.weight")
    state_dict.pop("x_embedder.proj.bias")

    # mlp_t5 -> text_embedder
    state_dict["text_embedder.linear_1.bias"] = state_dict["mlp_t5.0.bias"]
    state_dict["text_embedder.linear_1.weight"] = state_dict["mlp_t5.0.weight"]
    state_dict["text_embedder.linear_2.bias"] = state_dict["mlp_t5.2.bias"]
    state_dict["text_embedder.linear_2.weight"] = state_dict["mlp_t5.2.weight"]
    state_dict.pop("mlp_t5.0.bias")
    state_dict.pop("mlp_t5.0.weight")
    state_dict.pop("mlp_t5.2.bias")
    state_dict.pop("mlp_t5.2.weight")

    # extra_embedder -> extra_embedder
    state_dict["time_extra_emb.extra_embedder.linear_1.bias"] = state_dict["extra_embedder.0.bias"]
    state_dict["time_extra_emb.extra_embedder.linear_1.weight"] = state_dict["extra_embedder.0.weight"]
    state_dict["time_extra_emb.extra_embedder.linear_2.bias"] = state_dict["extra_embedder.2.bias"]
    state_dict["time_extra_emb.extra_embedder.linear_2.weight"] = state_dict["extra_embedder.2.weight"]
    state_dict.pop("extra_embedder.0.bias")
    state_dict.pop("extra_embedder.0.weight")
    state_dict.pop("extra_embedder.2.bias")
    state_dict.pop("extra_embedder.2.weight")

    # style_embedder
    if model_config["use_style_cond_and_image_meta_size"]:
        print(state_dict["style_embedder.weight"])
        print(state_dict["style_embedder.weight"].shape)
        # Only the first style row is kept for the diffusers embedding.
        state_dict["time_extra_emb.style_embedder.weight"] = state_dict["style_embedder.weight"][0:1]
        state_dict.pop("style_embedder.weight")

    model.load_state_dict(state_dict)

    if args.save:
        model.save_pretrained(args.output_checkpoint_path)
212
+
213
+
214
+ if __name__ == "__main__":
215
+ parser = argparse.ArgumentParser()
216
+
217
+ parser.add_argument(
218
+ "--save", default=True, type=bool, required=False, help="Whether to save the converted pipeline or not."
219
+ )
220
+ parser.add_argument(
221
+ "--pt_checkpoint_path", default=None, type=str, required=True, help="Path to the .pt pretrained model."
222
+ )
223
+ parser.add_argument(
224
+ "--output_checkpoint_path",
225
+ default=None,
226
+ type=str,
227
+ required=False,
228
+ help="Path to the output converted diffusers pipeline.",
229
+ )
230
+ parser.add_argument(
231
+ "--load_key", default="none", type=str, required=False, help="The key to load from the pretrained .pt file"
232
+ )
233
+ parser.add_argument(
234
+ "--use_style_cond_and_image_meta_size",
235
+ type=bool,
236
+ default=False,
237
+ help="version <= v1.1: True; version >= v1.2: False",
238
+ )
239
+
240
+ args = parser.parse_args()
241
+ main(args)
diffusers/scripts/convert_i2vgen_to_diffusers.py ADDED
@@ -0,0 +1,510 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2025 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Conversion script for the LDM checkpoints."""
16
+
17
+ import argparse
18
+
19
+ import torch
20
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
21
+
22
+ from diffusers import DDIMScheduler, I2VGenXLPipeline, I2VGenXLUNet, StableDiffusionPipeline
23
+
24
+
25
+ CLIP_ID = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
26
+
27
+
28
def assign_to_checkpoint(
    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
):
    """
    This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
    attention layers, and takes into account additional replacements that may arise.

    Assigns the weights to the new checkpoint.
    """
    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."

    # Splits the attention layers into three variables.
    if attention_paths_to_split is not None:
        for path, path_map in attention_paths_to_split.items():
            old_tensor = old_checkpoint[path]
            # Fused qkv tensor: dim 0 holds q, k and v stacked together.
            channels = old_tensor.shape[0] // 3

            # NOTE(review): in the 2-D branch this is the int -1, not a 1-tuple;
            # `reshape(-1)` still works (flattens), preserved from upstream.
            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)

            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3

            # Regroup per head before splitting so q/k/v slices stay head-contiguous.
            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
            query, key, value = old_tensor.split(channels // num_heads, dim=1)

            checkpoint[path_map["query"]] = query.reshape(target_shape)
            checkpoint[path_map["key"]] = key.reshape(target_shape)
            checkpoint[path_map["value"]] = value.reshape(target_shape)

    for path in paths:
        new_path = path["new"]

        # These have already been assigned
        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
            continue

        if additional_replacements is not None:
            for replacement in additional_replacements:
                new_path = new_path.replace(replacement["old"], replacement["new"])

        # proj_attn.weight has to be converted from conv 1D to linear
        weight = old_checkpoint[path["old"]]
        names = ["proj_attn.weight"]
        names_2 = ["proj_out.weight", "proj_in.weight"]
        if any(k in new_path for k in names):
            # Drop the trailing conv-kernel dim of size 1 -> linear weight.
            checkpoint[new_path] = weight[:, :, 0]
        elif any(k in new_path for k in names_2) and len(weight.shape) > 2 and ".attentions." not in new_path:
            checkpoint[new_path] = weight[:, :, 0]
        else:
            checkpoint[new_path] = weight
77
+
78
+
79
def renew_attention_paths(old_list, n_shave_prefix_segments=0):
    """
    Updates paths inside attentions to the new naming scheme (local renaming)
    """
    # Attention keys keep their original names: emit identity old->new pairs.
    # (`n_shave_prefix_segments` is accepted for signature parity but unused.)
    return [{"old": item, "new": item} for item in old_list]
89
+
90
+
91
def shave_segments(path, n_shave_prefix_segments=1):
    """
    Removes segments. Positive values shave the first segments, negative shave the last segments.
    """
    segments = path.split(".")
    if n_shave_prefix_segments >= 0:
        kept = segments[n_shave_prefix_segments:]
    else:
        kept = segments[:n_shave_prefix_segments]
    return ".".join(kept)
99
+
100
+
101
def renew_temp_conv_paths(old_list, n_shave_prefix_segments=0):
    """
    Updates paths inside resnets to the new naming scheme (local renaming).

    Temporal-conv keys keep their original names, so the mapping is the identity.
    """
    return [{"old": key, "new": key} for key in old_list]
110
+
111
+
112
def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
    """
    Updates paths inside resnets to the new naming scheme (local renaming).

    Keys containing "temopral_conv" (sic -- the typo is present in the original
    checkpoints) belong to the temporal convs, which are mapped elsewhere, so
    they are excluded from the returned mapping.
    """
    # Original LDM sub-module names and their diffusers counterparts.
    substitutions = (
        ("in_layers.0", "norm1"),
        ("in_layers.2", "conv1"),
        ("out_layers.0", "norm2"),
        ("out_layers.3", "conv2"),
        ("emb_layers.1", "time_emb_proj"),
        ("skip_connection", "conv_shortcut"),
    )

    mapping = []
    for old_item in old_list:
        if "temopral_conv" in old_item:
            continue
        new_item = old_item
        for src, dst in substitutions:
            new_item = new_item.replace(src, dst)
        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
        mapping.append({"old": old_item, "new": new_item})

    return mapping
133
+
134
+
135
def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False):
    """
    Takes a state dict and a config, and returns a converted checkpoint.

    Maps an original i2vgen-xl LDM-style UNet state dict onto the diffusers
    `I2VGenXLUNet` key layout. `config` is the diffusers UNet config dict
    (reads `layers_per_block` and, via `assign_to_checkpoint`,
    `num_head_channels`). `path` is only used in log messages; `extract_ema`
    selects the EMA weight copies when the checkpoint carries both.
    """

    # extract state_dict for UNet
    unet_state_dict = {}
    keys = list(checkpoint.keys())

    unet_key = "model.diffusion_model."

    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
    if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
        print(f"Checkpoint {path} has both EMA and non-EMA weights.")
        print(
            "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
            " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
        )
        for key in keys:
            if key.startswith("model.diffusion_model"):
                # EMA copies are stored flat: dots after the first segment removed.
                flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
    else:
        if sum(k.startswith("model_ema") for k in keys) > 100:
            print(
                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
            )

        for key in keys:
            unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)

    new_checkpoint = {}

    # Timestep embedding MLP.
    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]

    # i2vgen-xl specific conditioning embeddings; two of them are renamed,
    # `context_embedding` and `fps_embedding` keep their names.
    additional_embedding_substrings = [
        "local_image_concat",
        "context_embedding",
        "local_image_embedding",
        "fps_embedding",
    ]
    for k in unet_state_dict:
        if any(substring in k for substring in additional_embedding_substrings):
            diffusers_key = k.replace("local_image_concat", "image_latents_proj_in").replace(
                "local_image_embedding", "image_latents_context_embedding"
            )
            new_checkpoint[diffusers_key] = unet_state_dict[k]

    # temporal encoder.
    new_checkpoint["image_latents_temporal_encoder.norm1.weight"] = unet_state_dict[
        "local_temporal_encoder.layers.0.0.norm.weight"
    ]
    new_checkpoint["image_latents_temporal_encoder.norm1.bias"] = unet_state_dict[
        "local_temporal_encoder.layers.0.0.norm.bias"
    ]

    # attention: the original stores a fused qkv projection; split it into the
    # separate q/k/v projections diffusers expects.
    qkv = unet_state_dict["local_temporal_encoder.layers.0.0.fn.to_qkv.weight"]
    q, k, v = torch.chunk(qkv, 3, dim=0)
    new_checkpoint["image_latents_temporal_encoder.attn1.to_q.weight"] = q
    new_checkpoint["image_latents_temporal_encoder.attn1.to_k.weight"] = k
    new_checkpoint["image_latents_temporal_encoder.attn1.to_v.weight"] = v
    new_checkpoint["image_latents_temporal_encoder.attn1.to_out.0.weight"] = unet_state_dict[
        "local_temporal_encoder.layers.0.0.fn.to_out.0.weight"
    ]
    new_checkpoint["image_latents_temporal_encoder.attn1.to_out.0.bias"] = unet_state_dict[
        "local_temporal_encoder.layers.0.0.fn.to_out.0.bias"
    ]

    # feedforward
    new_checkpoint["image_latents_temporal_encoder.ff.net.0.proj.weight"] = unet_state_dict[
        "local_temporal_encoder.layers.0.1.net.0.0.weight"
    ]
    new_checkpoint["image_latents_temporal_encoder.ff.net.0.proj.bias"] = unet_state_dict[
        "local_temporal_encoder.layers.0.1.net.0.0.bias"
    ]
    new_checkpoint["image_latents_temporal_encoder.ff.net.2.weight"] = unet_state_dict[
        "local_temporal_encoder.layers.0.1.net.2.weight"
    ]
    new_checkpoint["image_latents_temporal_encoder.ff.net.2.bias"] = unet_state_dict[
        "local_temporal_encoder.layers.0.1.net.2.bias"
    ]

    if "class_embed_type" in config:
        if config["class_embed_type"] is None:
            # No parameters to port
            ...
        elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
            new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
            new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
            new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
            new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
        else:
            raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")

    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]

    # The attention directly after conv_in becomes `transformer_in`.
    first_temp_attention = [v for v in unet_state_dict if v.startswith("input_blocks.0.1")]
    paths = renew_attention_paths(first_temp_attention)
    meta_path = {"old": "input_blocks.0.1", "new": "transformer_in"}
    assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)

    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]

    # Retrieves the keys for the input blocks only
    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
    input_blocks = {
        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
        for layer_id in range(num_input_blocks)
    }

    # Retrieves the keys for the middle blocks only
    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
    middle_blocks = {
        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
        for layer_id in range(num_middle_blocks)
    }

    # Retrieves the keys for the output blocks only
    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
    output_blocks = {
        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
        for layer_id in range(num_output_blocks)
    }

    # Down blocks. Each original input block i (block 0 handled above) maps to
    # (down_block, layer) via layers_per_block + 1 (the +1 is the downsampler slot).
    for i in range(1, num_input_blocks):
        block_id = (i - 1) // (config["layers_per_block"] + 1)
        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)

        # Sub-module 0 = resnet (excluding its `op` downsampler), 1 = spatial
        # attention, 2 = temporal attention.
        resnets = [
            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
        ]
        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
        temp_attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.2" in key]

        if f"input_blocks.{i}.op.weight" in unet_state_dict:
            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
                f"input_blocks.{i}.op.weight"
            )
            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
                f"input_blocks.{i}.op.bias"
            )

        paths = renew_resnet_paths(resnets)
        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
        assign_to_checkpoint(
            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
        )

        # "temopral_conv" (sic) is the spelling used by the original checkpoints.
        temporal_convs = [key for key in resnets if "temopral_conv" in key]
        paths = renew_temp_conv_paths(temporal_convs)
        meta_path = {
            "old": f"input_blocks.{i}.0.temopral_conv",
            "new": f"down_blocks.{block_id}.temp_convs.{layer_in_block_id}",
        }
        assign_to_checkpoint(
            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
        )

        if len(attentions):
            paths = renew_attention_paths(attentions)
            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
            assign_to_checkpoint(
                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
            )

        if len(temp_attentions):
            paths = renew_attention_paths(temp_attentions)
            meta_path = {
                "old": f"input_blocks.{i}.2",
                "new": f"down_blocks.{block_id}.temp_attentions.{layer_in_block_id}",
            }
            assign_to_checkpoint(
                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
            )

    # Mid block: resnet / attention / temp attention / resnet.
    resnet_0 = middle_blocks[0]
    temporal_convs_0 = [key for key in resnet_0 if "temopral_conv" in key]
    attentions = middle_blocks[1]
    temp_attentions = middle_blocks[2]
    resnet_1 = middle_blocks[3]
    temporal_convs_1 = [key for key in resnet_1 if "temopral_conv" in key]

    resnet_0_paths = renew_resnet_paths(resnet_0)
    meta_path = {"old": "middle_block.0", "new": "mid_block.resnets.0"}
    assign_to_checkpoint(
        resnet_0_paths, new_checkpoint, unet_state_dict, config=config, additional_replacements=[meta_path]
    )

    temp_conv_0_paths = renew_temp_conv_paths(temporal_convs_0)
    meta_path = {"old": "middle_block.0.temopral_conv", "new": "mid_block.temp_convs.0"}
    assign_to_checkpoint(
        temp_conv_0_paths, new_checkpoint, unet_state_dict, config=config, additional_replacements=[meta_path]
    )

    resnet_1_paths = renew_resnet_paths(resnet_1)
    meta_path = {"old": "middle_block.3", "new": "mid_block.resnets.1"}
    assign_to_checkpoint(
        resnet_1_paths, new_checkpoint, unet_state_dict, config=config, additional_replacements=[meta_path]
    )

    temp_conv_1_paths = renew_temp_conv_paths(temporal_convs_1)
    meta_path = {"old": "middle_block.3.temopral_conv", "new": "mid_block.temp_convs.1"}
    assign_to_checkpoint(
        temp_conv_1_paths, new_checkpoint, unet_state_dict, config=config, additional_replacements=[meta_path]
    )

    attentions_paths = renew_attention_paths(attentions)
    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
    assign_to_checkpoint(
        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
    )

    temp_attentions_paths = renew_attention_paths(temp_attentions)
    meta_path = {"old": "middle_block.2", "new": "mid_block.temp_attentions.0"}
    assign_to_checkpoint(
        temp_attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
    )

    # Up blocks.
    for i in range(num_output_blocks):
        block_id = i // (config["layers_per_block"] + 1)
        layer_in_block_id = i % (config["layers_per_block"] + 1)
        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
        output_block_list = {}

        # Group the shaved key names by their sub-module index within the block.
        for layer in output_block_layers:
            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
            if layer_id in output_block_list:
                output_block_list[layer_id].append(layer_name)
            else:
                output_block_list[layer_id] = [layer_name]

        if len(output_block_list) > 1:
            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
            temp_attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.2" in key]

            resnet_0_paths = renew_resnet_paths(resnets)
            paths = renew_resnet_paths(resnets)

            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
            assign_to_checkpoint(
                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
            )

            temporal_convs = [key for key in resnets if "temopral_conv" in key]
            paths = renew_temp_conv_paths(temporal_convs)
            meta_path = {
                "old": f"output_blocks.{i}.0.temopral_conv",
                "new": f"up_blocks.{block_id}.temp_convs.{layer_in_block_id}",
            }
            assign_to_checkpoint(
                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
            )

            # An upsampler shows up as a bare conv in one of the sub-module slots.
            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
            if ["conv.bias", "conv.weight"] in output_block_list.values():
                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
                    f"output_blocks.{i}.{index}.conv.weight"
                ]
                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
                    f"output_blocks.{i}.{index}.conv.bias"
                ]

                # Clear attentions as they have been attributed above.
                if len(attentions) == 2:
                    attentions = []

            if len(attentions):
                paths = renew_attention_paths(attentions)
                meta_path = {
                    "old": f"output_blocks.{i}.1",
                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
                }
                assign_to_checkpoint(
                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
                )

            if len(temp_attentions):
                paths = renew_attention_paths(temp_attentions)
                meta_path = {
                    "old": f"output_blocks.{i}.2",
                    "new": f"up_blocks.{block_id}.temp_attentions.{layer_in_block_id}",
                }
                assign_to_checkpoint(
                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
                )
        else:
            # Resnet-only output block: assign keys directly.
            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
            for path in resnet_0_paths:
                old_path = ".".join(["output_blocks", str(i), path["old"]])
                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
                new_checkpoint[new_path] = unet_state_dict[old_path]

            temopral_conv_paths = [l for l in output_block_layers if "temopral_conv" in l]
            for path in temopral_conv_paths:
                pruned_path = path.split("temopral_conv.")[-1]
                old_path = ".".join(["output_blocks", str(i), str(block_id), "temopral_conv", pruned_path])
                new_path = ".".join(["up_blocks", str(block_id), "temp_convs", str(layer_in_block_id), pruned_path])
                new_checkpoint[new_path] = unet_state_dict[old_path]

    return new_checkpoint
446
+
447
+
448
+ if __name__ == "__main__":
449
+ parser = argparse.ArgumentParser()
450
+
451
+ parser.add_argument(
452
+ "--unet_checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
453
+ )
454
+ parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
455
+ parser.add_argument("--push_to_hub", action="store_true")
456
+ args = parser.parse_args()
457
+
458
+ # UNet
459
+ unet_checkpoint = torch.load(args.unet_checkpoint_path, map_location="cpu")
460
+ unet_checkpoint = unet_checkpoint["state_dict"]
461
+ unet = I2VGenXLUNet(sample_size=32)
462
+
463
+ converted_ckpt = convert_ldm_unet_checkpoint(unet_checkpoint, unet.config)
464
+
465
+ diff_0 = set(unet.state_dict().keys()) - set(converted_ckpt.keys())
466
+ diff_1 = set(converted_ckpt.keys()) - set(unet.state_dict().keys())
467
+
468
+ assert len(diff_0) == len(diff_1) == 0, "Converted weights don't match"
469
+
470
+ unet.load_state_dict(converted_ckpt, strict=True)
471
+
472
+ # vae
473
+ temp_pipe = StableDiffusionPipeline.from_single_file(
474
+ "https://huggingface.co/ali-vilab/i2vgen-xl/blob/main/models/v2-1_512-ema-pruned.ckpt"
475
+ )
476
+ vae = temp_pipe.vae
477
+ del temp_pipe
478
+
479
+ # text encoder and tokenizer
480
+ text_encoder = CLIPTextModel.from_pretrained(CLIP_ID)
481
+ tokenizer = CLIPTokenizer.from_pretrained(CLIP_ID)
482
+
483
+ # image encoder and feature extractor
484
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained(CLIP_ID)
485
+ feature_extractor = CLIPImageProcessor.from_pretrained(CLIP_ID)
486
+
487
+ # scheduler
488
+ # https://github.com/ali-vilab/i2vgen-xl/blob/main/configs/i2vgen_xl_train.yaml
489
+ scheduler = DDIMScheduler(
490
+ beta_schedule="squaredcos_cap_v2",
491
+ rescale_betas_zero_snr=True,
492
+ set_alpha_to_one=True,
493
+ clip_sample=False,
494
+ steps_offset=1,
495
+ timestep_spacing="leading",
496
+ prediction_type="v_prediction",
497
+ )
498
+
499
+ # final
500
+ pipeline = I2VGenXLPipeline(
501
+ unet=unet,
502
+ vae=vae,
503
+ image_encoder=image_encoder,
504
+ feature_extractor=feature_extractor,
505
+ text_encoder=text_encoder,
506
+ tokenizer=tokenizer,
507
+ scheduler=scheduler,
508
+ )
509
+
510
+ pipeline.save_pretrained(args.dump_path, push_to_hub=args.push_to_hub)
diffusers/scripts/convert_if.py ADDED
@@ -0,0 +1,1250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import inspect
3
+ import os
4
+
5
+ import numpy as np
6
+ import torch
7
+ import yaml
8
+ from torch.nn import functional as F
9
+ from transformers import CLIPConfig, CLIPImageProcessor, CLIPVisionModelWithProjection, T5EncoderModel, T5Tokenizer
10
+
11
+ from diffusers import DDPMScheduler, IFPipeline, IFSuperResolutionPipeline, UNet2DConditionModel
12
+ from diffusers.pipelines.deepfloyd_if.safety_checker import IFSafetyChecker
13
+
14
+
15
def parse_args():
    """Parse the DeepFloyd IF conversion command line and return the namespace."""
    arg_parser = argparse.ArgumentParser()

    # Output locations: each stage is only converted when both its checkpoint
    # and its dump path are supplied.
    arg_parser.add_argument("--dump_path", required=False, default=None, type=str)
    arg_parser.add_argument("--dump_path_stage_2", required=False, default=None, type=str)
    arg_parser.add_argument("--dump_path_stage_3", required=False, default=None, type=str)

    arg_parser.add_argument("--unet_config", required=False, default=None, type=str, help="Path to unet config file")
    arg_parser.add_argument(
        "--unet_checkpoint_path", required=False, default=None, type=str, help="Path to unet checkpoint file"
    )
    arg_parser.add_argument(
        "--unet_checkpoint_path_stage_2",
        required=False,
        default=None,
        type=str,
        help="Path to stage 2 unet checkpoint file",
    )
    arg_parser.add_argument(
        "--unet_checkpoint_path_stage_3",
        required=False,
        default=None,
        type=str,
        help="Path to stage 3 unet checkpoint file",
    )

    # Safety-checker head weights are always required.
    arg_parser.add_argument("--p_head_path", type=str, required=True)
    arg_parser.add_argument("--w_head_path", type=str, required=True)

    return arg_parser.parse_args()
53
+
54
+
55
def main(args):
    """Convert whichever DeepFloyd IF stages were requested on the command line."""
    # Components shared by all stages: T5 text tower, CLIP feature extractor,
    # and the converted safety checker.
    tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-xxl")
    text_encoder = T5EncoderModel.from_pretrained("google/t5-v1_1-xxl")

    feature_extractor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
    safety_checker = convert_safety_checker(p_head_path=args.p_head_path, w_head_path=args.w_head_path)

    stage_1_requested = (
        args.unet_config is not None and args.unet_checkpoint_path is not None and args.dump_path is not None
    )
    if stage_1_requested:
        convert_stage_1_pipeline(tokenizer, text_encoder, feature_extractor, safety_checker, args)

    if args.unet_checkpoint_path_stage_2 is not None and args.dump_path_stage_2 is not None:
        convert_super_res_pipeline(tokenizer, text_encoder, feature_extractor, safety_checker, args, stage=2)

    if args.unet_checkpoint_path_stage_3 is not None and args.dump_path_stage_3 is not None:
        convert_super_res_pipeline(tokenizer, text_encoder, feature_extractor, safety_checker, args, stage=3)
70
+
71
+
72
def convert_stage_1_pipeline(tokenizer, text_encoder, feature_extractor, safety_checker, args):
    """Build the stage-1 (base text-to-image) IF pipeline and save it to `args.dump_path`."""
    converted_unet = get_stage_1_unet(args.unet_config, args.unet_checkpoint_path)

    # Scheduler settings mirror the original IF stage-1 configuration; note the
    # higher `sample_max_value` compared to the super-resolution stages.
    stage_1_scheduler = DDPMScheduler(
        variance_type="learned_range",
        beta_schedule="squaredcos_cap_v2",
        prediction_type="epsilon",
        thresholding=True,
        dynamic_thresholding_ratio=0.95,
        sample_max_value=1.5,
    )

    IFPipeline(
        tokenizer=tokenizer,
        text_encoder=text_encoder,
        unet=converted_unet,
        scheduler=stage_1_scheduler,
        safety_checker=safety_checker,
        feature_extractor=feature_extractor,
        requires_safety_checker=True,
    ).save_pretrained(args.dump_path)
95
+
96
+
97
def convert_super_res_pipeline(tokenizer, text_encoder, feature_extractor, safety_checker, args, stage):
    """
    Build a super-resolution IF pipeline (stage 2 or 3) and save it to disk.

    Args:
        tokenizer / text_encoder: shared T5 text tower.
        feature_extractor / safety_checker: shared CLIP-based safety components.
        args: parsed CLI namespace holding the per-stage checkpoint and dump paths.
        stage: which super-resolution stage to convert; must be 2 or 3.

    Raises:
        ValueError: if `stage` is neither 2 nor 3.
    """
    if stage == 2:
        unet_checkpoint_path = args.unet_checkpoint_path_stage_2
        sample_size = None
        dump_path = args.dump_path_stage_2
    elif stage == 3:
        unet_checkpoint_path = args.unet_checkpoint_path_stage_3
        sample_size = 1024
        dump_path = args.dump_path_stage_3
    else:
        # Explicit error instead of `assert False`: asserts are stripped under
        # `python -O`, which would let a bad stage fall through silently.
        raise ValueError(f"Unsupported super-resolution stage: {stage!r}. Expected 2 or 3.")

    unet = get_super_res_unet(unet_checkpoint_path, verify_param_count=False, sample_size=sample_size)

    # Scheduler used to noise the low-resolution conditioning image.
    image_noising_scheduler = DDPMScheduler(
        beta_schedule="squaredcos_cap_v2",
    )

    # Main denoising scheduler (sample_max_value differs from stage 1's 1.5).
    scheduler = DDPMScheduler(
        variance_type="learned_range",
        beta_schedule="squaredcos_cap_v2",
        prediction_type="epsilon",
        thresholding=True,
        dynamic_thresholding_ratio=0.95,
        sample_max_value=1.0,
    )

    pipe = IFSuperResolutionPipeline(
        tokenizer=tokenizer,
        text_encoder=text_encoder,
        unet=unet,
        scheduler=scheduler,
        image_noising_scheduler=image_noising_scheduler,
        safety_checker=safety_checker,
        feature_extractor=feature_extractor,
        requires_safety_checker=True,
    )

    pipe.save_pretrained(dump_path)
136
+
137
+
138
def get_stage_1_unet(unet_config, unet_checkpoint_path):
    """
    Load and convert the stage-1 IF UNet.

    Args:
        unet_config: Path to the original (LDM-style) YAML config file.
        unet_checkpoint_path: Path to the original UNet state-dict checkpoint.

    Returns:
        A `UNet2DConditionModel` with the converted weights loaded.
    """
    # `unet_config` is a file path (see the `--unet_config` help text), so the
    # file must be opened before parsing: `yaml.safe_load` applied to the raw
    # path string would just return the path itself, and the subsequent
    # `["params"]` lookup would fail with a TypeError.
    with open(unet_config) as config_file:
        original_unet_config = yaml.safe_load(config_file)
    original_unet_config = original_unet_config["params"]

    unet_diffusers_config = create_unet_diffusers_config(original_unet_config)

    unet = UNet2DConditionModel(**unet_diffusers_config)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    unet_checkpoint = torch.load(unet_checkpoint_path, map_location=device)

    converted_unet_checkpoint = convert_ldm_unet_checkpoint(
        unet_checkpoint, unet_diffusers_config, path=unet_checkpoint_path
    )

    unet.load_state_dict(converted_unet_checkpoint)

    return unet
156
+
157
+
158
def _load_head_params(head_path):
    """Load one safety-head numpy archive; return (weight, bias) tensors with a leading output dim."""
    head = np.load(head_path)
    # `unsqueeze(0)` adds the (single) output dimension expected by a linear head.
    weights = torch.from_numpy(head["weights"]).unsqueeze(0)
    biases = torch.from_numpy(head["biases"]).unsqueeze(0)
    return weights, biases


def convert_safety_checker(p_head_path, w_head_path):
    """
    Assemble an `IFSafetyChecker` from the original IF safety-head weights.

    Args:
        p_head_path: Path to the "p" head numpy archive containing `weights` and `biases`.
        w_head_path: Path to the "w" head numpy archive with the same layout.

    Returns:
        An `IFSafetyChecker` whose vision backbone is the pretrained
        `openai/clip-vit-large-patch14` tower and whose two linear heads carry
        the converted weights.
    """
    state_dict = {}

    # Both heads share the same archive layout; load them with a common helper.
    state_dict["p_head.weight"], state_dict["p_head.bias"] = _load_head_params(p_head_path)
    state_dict["w_head.weight"], state_dict["w_head.bias"] = _load_head_params(w_head_path)

    # vision model: reuse the pretrained CLIP vision tower, namespaced under
    # the checker's `vision_model.` prefix.
    vision_model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
    for key, value in vision_model.state_dict().items():
        state_dict[f"vision_model.{key}"] = value

    # full model
    config = CLIPConfig.from_pretrained("openai/clip-vit-large-patch14")
    safety_checker = IFSafetyChecker(config)

    safety_checker.load_state_dict(state_dict)

    return safety_checker
208
+
209
+
210
def create_unet_diffusers_config(original_unet_config, class_embed_type=None):
    """
    Map an original IF (guided-diffusion style) UNet config dict onto diffusers
    `UNet2DConditionModel` constructor kwargs.

    `original_unet_config` is the `params` section of the original YAML config;
    `class_embed_type` overrides auto-detection from `num_classes` when given.
    Returns the kwargs dict.
    """
    # `attention_resolutions` are given as downsample factors; convert to the
    # absolute feature-map resolutions at which attention is applied.
    attention_resolutions = parse_list(original_unet_config["attention_resolutions"])
    attention_resolutions = [original_unet_config["image_size"] // int(res) for res in attention_resolutions]

    channel_mult = parse_list(original_unet_config["channel_mult"])
    block_out_channels = [original_unet_config["model_channels"] * mult for mult in channel_mult]

    down_block_types = []
    resolution = 1

    # NOTE(review): `resolution` here tracks the cumulative downsample factor
    # (1, 2, 4, ...), and is compared against the converted absolute
    # resolutions above -- the two quantities appear to use different units;
    # confirm against the original IF configs that this matches intent.
    for i in range(len(block_out_channels)):
        if resolution in attention_resolutions:
            block_type = "SimpleCrossAttnDownBlock2D"
        elif original_unet_config["resblock_updown"]:
            block_type = "ResnetDownsampleBlock2D"
        else:
            block_type = "DownBlock2D"

        down_block_types.append(block_type)

        # No downsampling after the last block.
        if i != len(block_out_channels) - 1:
            resolution *= 2

    # Up path mirrors the down path, walking `resolution` back down.
    up_block_types = []
    for i in range(len(block_out_channels)):
        if resolution in attention_resolutions:
            block_type = "SimpleCrossAttnUpBlock2D"
        elif original_unet_config["resblock_updown"]:
            block_type = "ResnetUpsampleBlock2D"
        else:
            block_type = "UpBlock2D"
        up_block_types.append(block_type)
        resolution //= 2

    head_dim = original_unet_config["num_head_channels"]

    use_linear_projection = (
        original_unet_config["use_linear_in_transformer"]
        if "use_linear_in_transformer" in original_unet_config
        else False
    )
    if use_linear_projection:
        # stable diffusion 2-base-512 and 2-768
        if head_dim is None:
            head_dim = [5, 10, 20, 20]

    projection_class_embeddings_input_dim = None

    if class_embed_type is None:
        if "num_classes" in original_unet_config:
            if original_unet_config["num_classes"] == "sequential":
                class_embed_type = "projection"
                assert "adm_in_channels" in original_unet_config
                projection_class_embeddings_input_dim = original_unet_config["adm_in_channels"]
            else:
                raise NotImplementedError(
                    f"Unknown conditional unet num_classes config: {original_unet_config['num_classes']}"
                )

    config = {
        "sample_size": original_unet_config["image_size"],
        "in_channels": original_unet_config["in_channels"],
        "down_block_types": tuple(down_block_types),
        "block_out_channels": tuple(block_out_channels),
        "layers_per_block": original_unet_config["num_res_blocks"],
        "cross_attention_dim": original_unet_config["encoder_channels"],
        "attention_head_dim": head_dim,
        "use_linear_projection": use_linear_projection,
        "class_embed_type": class_embed_type,
        "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim,
        "out_channels": original_unet_config["out_channels"],
        "up_block_types": tuple(up_block_types),
        "upcast_attention": False,  # TODO: guessing
        "cross_attention_norm": "group_norm",
        "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
        "addition_embed_type": "text",
        "act_fn": "gelu",
    }

    if original_unet_config["use_scale_shift_norm"]:
        config["resnet_time_scale_shift"] = "scale_shift"

    if "encoder_dim" in original_unet_config:
        config["encoder_hid_dim"] = original_unet_config["encoder_dim"]

    return config
296
+
297
+
298
def convert_ldm_unet_checkpoint(unet_state_dict, config, path=None):
    """
    Takes a state dict and a config, and returns a converted checkpoint.

    Maps the original LDM/IF UNet parameter layout (``time_embed``,
    ``input_blocks`` / ``middle_block`` / ``output_blocks``, ``out``) onto the
    diffusers ``UNet2DConditionModel`` naming scheme. Fused qkv / encoder_kv
    attention projections are split into separate q/k/v linear layers by
    ``assign_attention_to_checkpoint``, which pops the fused keys out of
    ``unet_state_dict`` as it goes (the input dict is mutated).

    Args:
        unet_state_dict: original checkpoint state dict (mutated: some keys
            are popped while converting).
        config: diffusers UNet config dict from the matching
            ``create_unet_diffusers_config`` helper; ``layers_per_block``,
            ``down_block_types`` and ``class_embed_type`` drive the mapping.
        path: unused; kept for signature parity with the other converters.

    Returns:
        dict: converted state dict in diffusers naming, ready to load.
    """
    new_checkpoint = {}

    # Timestep embedding MLP: time_embed.{0,2} -> time_embedding.linear_{1,2}.
    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]

    if config["class_embed_type"] in [None, "identity"]:
        # No parameters to port
        ...
    elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
        # Class/aug embedding MLP: label_emb.0.{0,2} -> class_embedding.linear_{1,2}.
        new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
        new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
        new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
        new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
    else:
        raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")

    # input_blocks.0.0 is the stem conv; out.{0,2} are the final norm + conv.
    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]

    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]

    # Retrieves the keys for the input blocks only
    # (block index is the second dot-segment of each parameter name).
    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
    input_blocks = {
        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}." in key]
        for layer_id in range(num_input_blocks)
    }

    # Retrieves the keys for the middle blocks only
    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
    middle_blocks = {
        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
        for layer_id in range(num_middle_blocks)
    }

    # Retrieves the keys for the output blocks only
    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
    output_blocks = {
        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}." in key]
        for layer_id in range(num_output_blocks)
    }

    # Down path. Block 0 was consumed as conv_in above; each remaining input
    # block maps to (down block, layer-within-block) via layers_per_block.
    for i in range(1, num_input_blocks):
        block_id = (i - 1) // (config["layers_per_block"] + 1)
        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)

        # "<i>.0" entries are the resnet; "<i>.0.op" is a conv downsampler.
        resnets = [
            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
        ]
        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]

        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
                f"input_blocks.{i}.0.op.weight"
            )
            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
                f"input_blocks.{i}.0.op.bias"
            )

        paths = renew_resnet_paths(resnets)

        # TODO need better check than i in [4, 8, 12, 16]
        # (those indices are the resnet-style downsamplers for this
        # architecture's fixed layers_per_block).
        block_type = config["down_block_types"][block_id]
        if (block_type == "ResnetDownsampleBlock2D" or block_type == "SimpleCrossAttnDownBlock2D") and i in [
            4,
            8,
            12,
            16,
        ]:
            meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.downsamplers.0"}
        else:
            meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}

        assign_to_checkpoint(
            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
        )

        if len(attentions):
            old_path = f"input_blocks.{i}.1"
            new_path = f"down_blocks.{block_id}.attentions.{layer_in_block_id}"

            # Splits and pops the fused qkv / encoder_kv projections first so
            # the renaming pass below only sees the remaining plain keys.
            assign_attention_to_checkpoint(
                new_checkpoint=new_checkpoint,
                unet_state_dict=unet_state_dict,
                old_path=old_path,
                new_path=new_path,
                config=config,
            )

            paths = renew_attention_paths(attentions)
            meta_path = {"old": old_path, "new": new_path}
            assign_to_checkpoint(
                paths,
                new_checkpoint,
                unet_state_dict,
                additional_replacements=[meta_path],
                config=config,
            )

    # Mid block: fixed layout resnet / attention / resnet.
    resnet_0 = middle_blocks[0]
    attentions = middle_blocks[1]
    resnet_1 = middle_blocks[2]

    resnet_0_paths = renew_resnet_paths(resnet_0)
    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)

    resnet_1_paths = renew_resnet_paths(resnet_1)
    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)

    old_path = "middle_block.1"
    new_path = "mid_block.attentions.0"

    assign_attention_to_checkpoint(
        new_checkpoint=new_checkpoint,
        unet_state_dict=unet_state_dict,
        old_path=old_path,
        new_path=new_path,
        config=config,
    )

    attentions_paths = renew_attention_paths(attentions)
    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
    assign_to_checkpoint(
        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
    )

    # Up path.
    for i in range(num_output_blocks):
        block_id = i // (config["layers_per_block"] + 1)
        layer_in_block_id = i % (config["layers_per_block"] + 1)
        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
        output_block_list = {}

        # Group this output block's parameters by their sub-layer index.
        for layer in output_block_layers:
            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
            if layer_id in output_block_list:
                output_block_list[layer_id].append(layer_name)
            else:
                output_block_list[layer_id] = [layer_name]

        # len(output_block_list) == 1 -> resnet
        # len(output_block_list) == 2 -> resnet, attention
        # len(output_block_list) == 3 -> resnet, attention, upscale resnet

        if len(output_block_list) > 1:
            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]

            paths = renew_resnet_paths(resnets)

            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}

            assign_to_checkpoint(
                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
            )

            # A sub-layer consisting of exactly {conv.bias, conv.weight} is a
            # conv upsampler rather than an attention.
            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
            if ["conv.bias", "conv.weight"] in output_block_list.values():
                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
                    f"output_blocks.{i}.{index}.conv.weight"
                ]
                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
                    f"output_blocks.{i}.{index}.conv.bias"
                ]

                # Clear attentions as they have been attributed above.
                if len(attentions) == 2:
                    attentions = []

            if len(attentions):
                old_path = f"output_blocks.{i}.1"
                new_path = f"up_blocks.{block_id}.attentions.{layer_in_block_id}"

                assign_attention_to_checkpoint(
                    new_checkpoint=new_checkpoint,
                    unet_state_dict=unet_state_dict,
                    old_path=old_path,
                    new_path=new_path,
                    config=config,
                )

                paths = renew_attention_paths(attentions)
                meta_path = {
                    "old": old_path,
                    "new": new_path,
                }
                assign_to_checkpoint(
                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
                )

            if len(output_block_list) == 3:
                # Third sub-layer is a resnet-style upsampler.
                resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.2" in key]
                paths = renew_resnet_paths(resnets)
                meta_path = {"old": f"output_blocks.{i}.2", "new": f"up_blocks.{block_id}.upsamplers.0"}
                assign_to_checkpoint(
                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
                )
        else:
            # Plain resnet-only output block: rename each parameter directly.
            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
            for path in resnet_0_paths:
                old_path = ".".join(["output_blocks", str(i), path["old"]])
                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])

                new_checkpoint[new_path] = unet_state_dict[old_path]

    # Optional text-encoder projection head.
    if "encoder_proj.weight" in unet_state_dict:
        new_checkpoint["encoder_hid_proj.weight"] = unet_state_dict.pop("encoder_proj.weight")
        new_checkpoint["encoder_hid_proj.bias"] = unet_state_dict.pop("encoder_proj.bias")

    # Optional attention-pooled text embedding head:
    # encoder_pooling.{0..3} -> add_embedding.{norm1, pool, proj, norm2}.
    if "encoder_pooling.0.weight" in unet_state_dict:
        new_checkpoint["add_embedding.norm1.weight"] = unet_state_dict.pop("encoder_pooling.0.weight")
        new_checkpoint["add_embedding.norm1.bias"] = unet_state_dict.pop("encoder_pooling.0.bias")

        new_checkpoint["add_embedding.pool.positional_embedding"] = unet_state_dict.pop(
            "encoder_pooling.1.positional_embedding"
        )
        new_checkpoint["add_embedding.pool.k_proj.weight"] = unet_state_dict.pop("encoder_pooling.1.k_proj.weight")
        new_checkpoint["add_embedding.pool.k_proj.bias"] = unet_state_dict.pop("encoder_pooling.1.k_proj.bias")
        new_checkpoint["add_embedding.pool.q_proj.weight"] = unet_state_dict.pop("encoder_pooling.1.q_proj.weight")
        new_checkpoint["add_embedding.pool.q_proj.bias"] = unet_state_dict.pop("encoder_pooling.1.q_proj.bias")
        new_checkpoint["add_embedding.pool.v_proj.weight"] = unet_state_dict.pop("encoder_pooling.1.v_proj.weight")
        new_checkpoint["add_embedding.pool.v_proj.bias"] = unet_state_dict.pop("encoder_pooling.1.v_proj.bias")

        new_checkpoint["add_embedding.proj.weight"] = unet_state_dict.pop("encoder_pooling.2.weight")
        new_checkpoint["add_embedding.proj.bias"] = unet_state_dict.pop("encoder_pooling.2.bias")

        new_checkpoint["add_embedding.norm2.weight"] = unet_state_dict.pop("encoder_pooling.3.weight")
        new_checkpoint["add_embedding.norm2.bias"] = unet_state_dict.pop("encoder_pooling.3.bias")

    return new_checkpoint
537
+
538
+
539
def shave_segments(path, n_shave_prefix_segments=1):
    """
    Drop dot-separated segments from *path*.

    A non-negative count removes that many leading segments; a negative
    count removes ``abs(count)`` trailing segments.
    """
    segments = path.split(".")
    if n_shave_prefix_segments >= 0:
        kept = segments[n_shave_prefix_segments:]
    else:
        kept = segments[:n_shave_prefix_segments]
    return ".".join(kept)
547
+
548
+
549
def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
    """
    Translate resnet parameter names from the original LDM layout to the
    diffusers layout (local renaming only).

    Returns a list of ``{"old": ..., "new": ...}`` mapping dicts.
    """
    # Applied in order; kept identical to the historical replace chain.
    replacements = (
        ("in_layers.0", "norm1"),
        ("in_layers.2", "conv1"),
        ("out_layers.0", "norm2"),
        ("out_layers.3", "conv2"),
        ("emb_layers.1", "time_emb_proj"),
        ("skip_connection", "conv_shortcut"),
    )

    mapping = []
    for old_item in old_list:
        new_item = old_item
        for src, dst in replacements:
            new_item = new_item.replace(src, dst)
        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
        mapping.append({"old": old_item, "new": new_item})

    return mapping
569
+
570
+
571
def renew_attention_paths(old_list, n_shave_prefix_segments=0):
    """
    Translate attention parameter names from the original LDM layout to the
    diffusers layout (local renaming only).

    Fused ``qkv`` / ``encoder_kv`` entries are intentionally skipped here:
    they are split and assigned separately by the attention converter.
    """
    replacements = (
        ("norm.weight", "group_norm.weight"),
        ("norm.bias", "group_norm.bias"),
        ("proj_out.weight", "to_out.0.weight"),
        ("proj_out.bias", "to_out.0.bias"),
        ("norm_encoder.weight", "norm_cross.weight"),
        ("norm_encoder.bias", "norm_cross.bias"),
    )

    mapping = []
    for old_item in old_list:
        if "qkv" in old_item or "encoder_kv" in old_item:
            continue

        new_item = old_item
        for src, dst in replacements:
            new_item = new_item.replace(src, dst)
        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
        mapping.append({"old": old_item, "new": new_item})

    return mapping
599
+
600
+
601
def assign_attention_to_checkpoint(new_checkpoint, unet_state_dict, old_path, new_path, config):
    """
    Split the fused attention projections at ``old_path`` into separate
    diffusers q/k/v (and encoder k/v) linear layers under ``new_path``.

    Mutates both dicts: the fused tensors are popped from ``unet_state_dict``
    (so later renaming passes never see them) and the split tensors are
    written into ``new_checkpoint``.
    """
    # Fused qkv projection is stored as a conv-1D kernel; drop the trailing
    # kernel dim to get a linear weight.
    qkv_weight = unet_state_dict.pop(f"{old_path}.qkv.weight")
    qkv_weight = qkv_weight[:, :, 0]

    qkv_bias = unet_state_dict.pop(f"{old_path}.qkv.bias")

    is_cross_attn_only = "only_cross_attention" in config and config["only_cross_attention"]

    # Cross-attention-only blocks fuse only the query; otherwise q, k and v
    # are interleaved per attention head.
    split = 1 if is_cross_attn_only else 3

    weights, bias = split_attentions(
        weight=qkv_weight,
        bias=qkv_bias,
        split=split,
        chunk_size=config["attention_head_dim"],
    )

    if is_cross_attn_only:
        # split_attentions returned single-element lists; unwrap them.
        query_weight, q_bias = weights, bias
        new_checkpoint[f"{new_path}.to_q.weight"] = query_weight[0]
        new_checkpoint[f"{new_path}.to_q.bias"] = q_bias[0]
    else:
        [query_weight, key_weight, value_weight], [q_bias, k_bias, v_bias] = weights, bias
        new_checkpoint[f"{new_path}.to_q.weight"] = query_weight
        new_checkpoint[f"{new_path}.to_q.bias"] = q_bias
        new_checkpoint[f"{new_path}.to_k.weight"] = key_weight
        new_checkpoint[f"{new_path}.to_k.bias"] = k_bias
        new_checkpoint[f"{new_path}.to_v.weight"] = value_weight
        new_checkpoint[f"{new_path}.to_v.bias"] = v_bias

    # Fused k/v projection over the encoder hidden states, same conv-1D
    # storage as qkv above.
    encoder_kv_weight = unet_state_dict.pop(f"{old_path}.encoder_kv.weight")
    encoder_kv_weight = encoder_kv_weight[:, :, 0]

    encoder_kv_bias = unet_state_dict.pop(f"{old_path}.encoder_kv.bias")

    [encoder_k_weight, encoder_v_weight], [encoder_k_bias, encoder_v_bias] = split_attentions(
        weight=encoder_kv_weight,
        bias=encoder_kv_bias,
        split=2,
        chunk_size=config["attention_head_dim"],
    )

    new_checkpoint[f"{new_path}.add_k_proj.weight"] = encoder_k_weight
    new_checkpoint[f"{new_path}.add_k_proj.bias"] = encoder_k_bias
    new_checkpoint[f"{new_path}.add_v_proj.weight"] = encoder_v_weight
    new_checkpoint[f"{new_path}.add_v_proj.bias"] = encoder_v_bias
647
+
648
+
649
def assign_to_checkpoint(paths, checkpoint, old_checkpoint, additional_replacements=None, config=None):
    """
    Final conversion step: apply the global mid-block renames (and any
    caller-supplied replacements) to each locally-converted path, then copy
    the matching weight from *old_checkpoint* into *checkpoint*.

    ``proj_attn.weight`` / ``to_out.0.weight`` tensors are sliced from conv-1D
    form down to linear form on the way through.
    """
    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."

    global_renames = (
        ("middle_block.0", "mid_block.resnets.0"),
        ("middle_block.1", "mid_block.attentions.0"),
        ("middle_block.2", "mid_block.resnets.1"),
    )

    for path in paths:
        new_path = path["new"]

        # Global renaming happens here.
        for src, dst in global_renames:
            new_path = new_path.replace(src, dst)

        if additional_replacements is not None:
            for replacement in additional_replacements:
                new_path = new_path.replace(replacement["old"], replacement["new"])

        value = old_checkpoint[path["old"]]
        if "proj_attn.weight" in new_path or "to_out.0.weight" in new_path:
            # proj_attn.weight has to be converted from conv 1D to linear.
            value = value[:, :, 0]
        checkpoint[new_path] = value
675
+
676
+
677
+ # TODO maybe document and/or can do more efficiently (build indices in for loop and extract once for each split?)
678
def split_attentions(*, weight, bias, split, chunk_size):
    """
    Round-robin split of a fused attention projection into *split* parts.

    Rows are walked in chunks of *chunk_size*; chunk ``j`` is appended to
    part ``j % split`` (q/k/v rows are interleaved per head in the fused
    tensor). Returns ``(weights, biases)``, each a list of length *split*.
    """
    weights = [None] * split
    biases = [None] * split

    target = 0
    for start in range(0, weight.shape[0], chunk_size):
        rows = torch.arange(start, start + chunk_size)

        weight_chunk = weight[rows, :]
        bias_chunk = bias[rows]

        if weights[target] is None:
            weights[target] = weight_chunk
            biases[target] = bias_chunk
        else:
            weights[target] = torch.concat([weights[target], weight_chunk])
            biases[target] = torch.concat([biases[target], bias_chunk])

        target = (target + 1) % split

    return weights, biases
701
+
702
+
703
def parse_list(value):
    """
    Normalize a config value to a list.

    A comma-separated string is parsed into a list of ints; a list is
    returned unchanged; any other type raises ``ValueError``.
    """
    if isinstance(value, str):
        return [int(part) for part in value.split(",")]
    if isinstance(value, list):
        return value
    raise ValueError(f"Can't parse list for type: {type(value)}")
713
+
714
+
715
+ # below is copy and pasted from original convert_if_stage_2.py script
716
+
717
+
718
def get_super_res_unet(unet_checkpoint_path, verify_param_count=True, sample_size=None):
    """
    Load an IF super-resolution UNet checkpoint and return it converted to a
    diffusers ``UNet2DConditionModel``.

    Args:
        unet_checkpoint_path: directory containing ``config.yml`` and
            ``pytorch_model.bin``.
        verify_param_count: when True, check that the converted
            architecture's parameter count matches the original (slow).
        sample_size: optional override; the second upscaler unet's sample
            size is wrong in its config and must be passed explicitly.

    Returns:
        UNet2DConditionModel with the converted weights loaded.
    """
    orig_path = unet_checkpoint_path

    # Bug fix: yaml.safe_load must be given the file (stream), not the path
    # string — passing the path parsed the path itself as a YAML scalar.
    with open(os.path.join(orig_path, "config.yml")) as f:
        original_unet_config = yaml.safe_load(f)
    original_unet_config = original_unet_config["params"]

    unet_diffusers_config = superres_create_unet_diffusers_config(original_unet_config)
    unet_diffusers_config["time_embedding_dim"] = original_unet_config["model_channels"] * int(
        original_unet_config["channel_mult"].split(",")[-1]
    )
    if original_unet_config["encoder_dim"] != original_unet_config["encoder_channels"]:
        unet_diffusers_config["encoder_hid_dim"] = original_unet_config["encoder_dim"]
        unet_diffusers_config["class_embed_type"] = "timestep"
        unet_diffusers_config["addition_embed_type"] = "text"

    unet_diffusers_config["time_embedding_act_fn"] = "gelu"
    unet_diffusers_config["resnet_skip_time_act"] = True
    unet_diffusers_config["resnet_out_scale_factor"] = 1 / 0.7071
    unet_diffusers_config["mid_block_scale_factor"] = 1 / 0.7071
    unet_diffusers_config["only_cross_attention"] = (
        bool(original_unet_config["disable_self_attentions"])
        if (
            "disable_self_attentions" in original_unet_config
            and isinstance(original_unet_config["disable_self_attentions"], int)
        )
        else True
    )

    if sample_size is None:
        unet_diffusers_config["sample_size"] = original_unet_config["image_size"]
    else:
        # The second upscaler unet's sample size is incorrectly specified
        # in the config and is instead hardcoded in source
        unet_diffusers_config["sample_size"] = sample_size

    unet_checkpoint = torch.load(os.path.join(unet_checkpoint_path, "pytorch_model.bin"), map_location="cpu")

    if verify_param_count:
        # Bug fix: the boolean parameter shadows the module-level helper of
        # the same name, so a direct call invoked the bool and raised
        # TypeError. Look the helper up in the module namespace instead.
        # check that architecture matches - is a bit slow
        globals()["verify_param_count"](orig_path, unet_diffusers_config)

    converted_unet_checkpoint = superres_convert_ldm_unet_checkpoint(
        unet_checkpoint, unet_diffusers_config, path=unet_checkpoint_path
    )
    converted_keys = converted_unet_checkpoint.keys()

    model = UNet2DConditionModel(**unet_diffusers_config)
    expected_weights = model.state_dict().keys()

    diff_c_e = set(converted_keys) - set(expected_weights)
    diff_e_c = set(expected_weights) - set(converted_keys)

    assert len(diff_e_c) == 0, f"Expected, but not converted: {diff_e_c}"
    assert len(diff_c_e) == 0, f"Converted, but not expected: {diff_c_e}"

    model.load_state_dict(converted_unet_checkpoint)

    return model
776
+
777
+
778
def superres_create_unet_diffusers_config(original_unet_config):
    """
    Map the original IF super-resolution UNet config dict onto diffusers
    ``UNet2DConditionModel`` constructor kwargs.
    """
    # Resolutions at which attention is applied, expressed as feature-map
    # sizes relative to the input image size.
    attention_resolutions = parse_list(original_unet_config["attention_resolutions"])
    attention_resolutions = [original_unet_config["image_size"] // int(res) for res in attention_resolutions]

    channel_mult = parse_list(original_unet_config["channel_mult"])
    block_out_channels = [original_unet_config["model_channels"] * mult for mult in channel_mult]

    down_block_types = []
    resolution = 1

    for i in range(len(block_out_channels)):
        if resolution in attention_resolutions:
            block_type = "SimpleCrossAttnDownBlock2D"
        elif original_unet_config["resblock_updown"]:
            block_type = "ResnetDownsampleBlock2D"
        else:
            block_type = "DownBlock2D"

        down_block_types.append(block_type)

        # The last down block does not downsample.
        if i != len(block_out_channels) - 1:
            resolution *= 2

    # The up path mirrors the down path, walking the resolutions back down.
    up_block_types = []
    for i in range(len(block_out_channels)):
        if resolution in attention_resolutions:
            block_type = "SimpleCrossAttnUpBlock2D"
        elif original_unet_config["resblock_updown"]:
            block_type = "ResnetUpsampleBlock2D"
        else:
            block_type = "UpBlock2D"
        up_block_types.append(block_type)
        resolution //= 2

    head_dim = original_unet_config["num_head_channels"]
    use_linear_projection = (
        original_unet_config["use_linear_in_transformer"]
        if "use_linear_in_transformer" in original_unet_config
        else False
    )
    if use_linear_projection:
        # stable diffusion 2-base-512 and 2-768
        if head_dim is None:
            head_dim = [5, 10, 20, 20]

    class_embed_type = None
    projection_class_embeddings_input_dim = None

    if "num_classes" in original_unet_config:
        if original_unet_config["num_classes"] == "sequential":
            class_embed_type = "projection"
            assert "adm_in_channels" in original_unet_config
            projection_class_embeddings_input_dim = original_unet_config["adm_in_channels"]
        else:
            raise NotImplementedError(
                f"Unknown conditional unet num_classes config: {original_unet_config['num_classes']}"
            )

    config = {
        "in_channels": original_unet_config["in_channels"],
        "down_block_types": tuple(down_block_types),
        "block_out_channels": tuple(block_out_channels),
        # Fix: num_res_blocks may arrive as a list or as a comma-separated
        # string; normalize through parse_list like the other list-valued
        # fields above (a bare tuple() on a string would yield its characters).
        "layers_per_block": tuple(parse_list(original_unet_config["num_res_blocks"])),
        "cross_attention_dim": original_unet_config["encoder_channels"],
        "attention_head_dim": head_dim,
        "use_linear_projection": use_linear_projection,
        "class_embed_type": class_embed_type,
        "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim,
        "out_channels": original_unet_config["out_channels"],
        "up_block_types": tuple(up_block_types),
        "upcast_attention": False,  # TODO: guessing
        "cross_attention_norm": "group_norm",
        "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
        "act_fn": "gelu",
    }

    if original_unet_config["use_scale_shift_norm"]:
        config["resnet_time_scale_shift"] = "scale_shift"

    return config
858
+
859
+
860
+ def superres_convert_ldm_unet_checkpoint(unet_state_dict, config, path=None, extract_ema=False):
861
+ """
862
+ Takes a state dict and a config, and returns a converted checkpoint.
863
+ """
864
+ new_checkpoint = {}
865
+
866
+ new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
867
+ new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
868
+ new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
869
+ new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
870
+
871
+ if config["class_embed_type"] is None:
872
+ # No parameters to port
873
+ ...
874
+ elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
875
+ new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["aug_proj.0.weight"]
876
+ new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["aug_proj.0.bias"]
877
+ new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["aug_proj.2.weight"]
878
+ new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["aug_proj.2.bias"]
879
+ else:
880
+ raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")
881
+
882
+ if "encoder_proj.weight" in unet_state_dict:
883
+ new_checkpoint["encoder_hid_proj.weight"] = unet_state_dict["encoder_proj.weight"]
884
+ new_checkpoint["encoder_hid_proj.bias"] = unet_state_dict["encoder_proj.bias"]
885
+
886
+ if "encoder_pooling.0.weight" in unet_state_dict:
887
+ mapping = {
888
+ "encoder_pooling.0": "add_embedding.norm1",
889
+ "encoder_pooling.1": "add_embedding.pool",
890
+ "encoder_pooling.2": "add_embedding.proj",
891
+ "encoder_pooling.3": "add_embedding.norm2",
892
+ }
893
+ for key in unet_state_dict.keys():
894
+ if key.startswith("encoder_pooling"):
895
+ prefix = key[: len("encoder_pooling.0")]
896
+ new_key = key.replace(prefix, mapping[prefix])
897
+ new_checkpoint[new_key] = unet_state_dict[key]
898
+
899
+ new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
900
+ new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
901
+
902
+ new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
903
+ new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
904
+ new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
905
+ new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
906
+
907
+ # Retrieves the keys for the input blocks only
908
+ num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
909
+ input_blocks = {
910
+ layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}." in key]
911
+ for layer_id in range(num_input_blocks)
912
+ }
913
+
914
+ # Retrieves the keys for the middle blocks only
915
+ num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
916
+ middle_blocks = {
917
+ layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
918
+ for layer_id in range(num_middle_blocks)
919
+ }
920
+
921
+ # Retrieves the keys for the output blocks only
922
+ num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
923
+ output_blocks = {
924
+ layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}." in key]
925
+ for layer_id in range(num_output_blocks)
926
+ }
927
+ if not isinstance(config["layers_per_block"], int):
928
+ layers_per_block_list = [e + 1 for e in config["layers_per_block"]]
929
+ layers_per_block_cumsum = list(np.cumsum(layers_per_block_list))
930
+ downsampler_ids = layers_per_block_cumsum
931
+ else:
932
+ # TODO need better check than i in [4, 8, 12, 16]
933
+ downsampler_ids = [4, 8, 12, 16]
934
+
935
+ for i in range(1, num_input_blocks):
936
+ if isinstance(config["layers_per_block"], int):
937
+ layers_per_block = config["layers_per_block"]
938
+ block_id = (i - 1) // (layers_per_block + 1)
939
+ layer_in_block_id = (i - 1) % (layers_per_block + 1)
940
+ else:
941
+ block_id = next(k for k, n in enumerate(layers_per_block_cumsum) if (i - 1) < n)
942
+ passed_blocks = layers_per_block_cumsum[block_id - 1] if block_id > 0 else 0
943
+ layer_in_block_id = (i - 1) - passed_blocks
944
+
945
+ resnets = [
946
+ key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
947
+ ]
948
+ attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
949
+
950
+ if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
951
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
952
+ f"input_blocks.{i}.0.op.weight"
953
+ )
954
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
955
+ f"input_blocks.{i}.0.op.bias"
956
+ )
957
+
958
+ paths = renew_resnet_paths(resnets)
959
+
960
+ block_type = config["down_block_types"][block_id]
961
+ if (
962
+ block_type == "ResnetDownsampleBlock2D" or block_type == "SimpleCrossAttnDownBlock2D"
963
+ ) and i in downsampler_ids:
964
+ meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.downsamplers.0"}
965
+ else:
966
+ meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
967
+
968
+ assign_to_checkpoint(
969
+ paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
970
+ )
971
+
972
+ if len(attentions):
973
+ old_path = f"input_blocks.{i}.1"
974
+ new_path = f"down_blocks.{block_id}.attentions.{layer_in_block_id}"
975
+
976
+ assign_attention_to_checkpoint(
977
+ new_checkpoint=new_checkpoint,
978
+ unet_state_dict=unet_state_dict,
979
+ old_path=old_path,
980
+ new_path=new_path,
981
+ config=config,
982
+ )
983
+
984
+ paths = renew_attention_paths(attentions)
985
+ meta_path = {"old": old_path, "new": new_path}
986
+ assign_to_checkpoint(
987
+ paths,
988
+ new_checkpoint,
989
+ unet_state_dict,
990
+ additional_replacements=[meta_path],
991
+ config=config,
992
+ )
993
+
994
+ resnet_0 = middle_blocks[0]
995
+ attentions = middle_blocks[1]
996
+ resnet_1 = middle_blocks[2]
997
+
998
+ resnet_0_paths = renew_resnet_paths(resnet_0)
999
+ assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
1000
+
1001
+ resnet_1_paths = renew_resnet_paths(resnet_1)
1002
+ assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
1003
+
1004
+ old_path = "middle_block.1"
1005
+ new_path = "mid_block.attentions.0"
1006
+
1007
+ assign_attention_to_checkpoint(
1008
+ new_checkpoint=new_checkpoint,
1009
+ unet_state_dict=unet_state_dict,
1010
+ old_path=old_path,
1011
+ new_path=new_path,
1012
+ config=config,
1013
+ )
1014
+
1015
+ attentions_paths = renew_attention_paths(attentions)
1016
+ meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
1017
+ assign_to_checkpoint(
1018
+ attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
1019
+ )
1020
+ if not isinstance(config["layers_per_block"], int):
1021
+ layers_per_block_list = list(reversed([e + 1 for e in config["layers_per_block"]]))
1022
+ layers_per_block_cumsum = list(np.cumsum(layers_per_block_list))
1023
+
1024
+ for i in range(num_output_blocks):
1025
+ if isinstance(config["layers_per_block"], int):
1026
+ layers_per_block = config["layers_per_block"]
1027
+ block_id = i // (layers_per_block + 1)
1028
+ layer_in_block_id = i % (layers_per_block + 1)
1029
+ else:
1030
+ block_id = next(k for k, n in enumerate(layers_per_block_cumsum) if i < n)
1031
+ passed_blocks = layers_per_block_cumsum[block_id - 1] if block_id > 0 else 0
1032
+ layer_in_block_id = i - passed_blocks
1033
+
1034
+ output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
1035
+ output_block_list = {}
1036
+
1037
+ for layer in output_block_layers:
1038
+ layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
1039
+ if layer_id in output_block_list:
1040
+ output_block_list[layer_id].append(layer_name)
1041
+ else:
1042
+ output_block_list[layer_id] = [layer_name]
1043
+
1044
+ # len(output_block_list) == 1 -> resnet
1045
+ # len(output_block_list) == 2 -> resnet, attention or resnet, upscale resnet
1046
+ # len(output_block_list) == 3 -> resnet, attention, upscale resnet
1047
+
1048
+ if len(output_block_list) > 1:
1049
+ resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
1050
+
1051
+ has_attention = True
1052
+ if len(output_block_list) == 2 and any("in_layers" in k for k in output_block_list["1"]):
1053
+ has_attention = False
1054
+
1055
+ maybe_attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
1056
+
1057
+ paths = renew_resnet_paths(resnets)
1058
+
1059
+ meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
1060
+
1061
+ assign_to_checkpoint(
1062
+ paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
1063
+ )
1064
+
1065
+ output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
1066
+ if ["conv.bias", "conv.weight"] in output_block_list.values():
1067
+ index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
1068
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
1069
+ f"output_blocks.{i}.{index}.conv.weight"
1070
+ ]
1071
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
1072
+ f"output_blocks.{i}.{index}.conv.bias"
1073
+ ]
1074
+
1075
+ # this layer was no attention
1076
+ has_attention = False
1077
+ maybe_attentions = []
1078
+
1079
+ if has_attention:
1080
+ old_path = f"output_blocks.{i}.1"
1081
+ new_path = f"up_blocks.{block_id}.attentions.{layer_in_block_id}"
1082
+
1083
+ assign_attention_to_checkpoint(
1084
+ new_checkpoint=new_checkpoint,
1085
+ unet_state_dict=unet_state_dict,
1086
+ old_path=old_path,
1087
+ new_path=new_path,
1088
+ config=config,
1089
+ )
1090
+
1091
+ paths = renew_attention_paths(maybe_attentions)
1092
+ meta_path = {
1093
+ "old": old_path,
1094
+ "new": new_path,
1095
+ }
1096
+ assign_to_checkpoint(
1097
+ paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
1098
+ )
1099
+
1100
+ if len(output_block_list) == 3 or (not has_attention and len(maybe_attentions) > 0):
1101
+ layer_id = len(output_block_list) - 1
1102
+ resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.{layer_id}" in key]
1103
+ paths = renew_resnet_paths(resnets)
1104
+ meta_path = {"old": f"output_blocks.{i}.{layer_id}", "new": f"up_blocks.{block_id}.upsamplers.0"}
1105
+ assign_to_checkpoint(
1106
+ paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
1107
+ )
1108
+ else:
1109
+ resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
1110
+ for path in resnet_0_paths:
1111
+ old_path = ".".join(["output_blocks", str(i), path["old"]])
1112
+ new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
1113
+
1114
+ new_checkpoint[new_path] = unet_state_dict[old_path]
1115
+
1116
+ return new_checkpoint
1117
+
1118
+
1119
def verify_param_count(orig_path, unet_diffusers_config):
    """Sanity-check a converted DeepFloyd IF UNet against the original model.

    Instantiates the original IF stage (II or III, inferred from ``orig_path``)
    and a fresh diffusers ``UNet2DConditionModel`` from ``unet_diffusers_config``,
    then compares parameter counts block by block and for the whole model.

    Args:
        orig_path: Path/name of the original checkpoint; must contain ``-II-``
            or ``-III-`` so the right stage class can be selected.
        unet_diffusers_config: Keyword config for ``UNet2DConditionModel``.

    Raises:
        ValueError: If ``orig_path`` contains neither ``-II-`` nor ``-III-``.
        AssertionError: If any block's parameter count differs.
    """
    if "-II-" in orig_path:
        from deepfloyd_if.modules import IFStageII

        if_II = IFStageII(device="cpu", dir_or_name=orig_path)
    elif "-III-" in orig_path:
        from deepfloyd_if.modules import IFStageIII

        if_II = IFStageIII(device="cpu", dir_or_name=orig_path)
    else:
        # Bug fix: the original code was `assert f"Weird name. ..."`, which
        # asserts a non-empty string — always truthy, so it never raised and
        # `if_II` would be unbound below. Raise explicitly instead.
        raise ValueError(f"Weird name. Should have -II- or -III- in path: {orig_path}")

    unet = UNet2DConditionModel(**unet_diffusers_config)

    # in params
    assert_param_count(unet.time_embedding, if_II.model.time_embed)
    assert_param_count(unet.conv_in, if_II.model.input_blocks[:1])

    # downblocks — the input-block slice boundaries mirror how the original
    # flat `input_blocks` list maps onto diffusers down_blocks.
    assert_param_count(unet.down_blocks[0], if_II.model.input_blocks[1:4])
    assert_param_count(unet.down_blocks[1], if_II.model.input_blocks[4:7])
    assert_param_count(unet.down_blocks[2], if_II.model.input_blocks[7:11])

    if "-II-" in orig_path:
        assert_param_count(unet.down_blocks[3], if_II.model.input_blocks[11:17])
        assert_param_count(unet.down_blocks[4], if_II.model.input_blocks[17:])
    if "-III-" in orig_path:
        assert_param_count(unet.down_blocks[3], if_II.model.input_blocks[11:15])
        assert_param_count(unet.down_blocks[4], if_II.model.input_blocks[15:20])
        assert_param_count(unet.down_blocks[5], if_II.model.input_blocks[20:])

    # mid block
    assert_param_count(unet.mid_block, if_II.model.middle_block)

    # up block
    if "-II-" in orig_path:
        assert_param_count(unet.up_blocks[0], if_II.model.output_blocks[:6])
        assert_param_count(unet.up_blocks[1], if_II.model.output_blocks[6:12])
        assert_param_count(unet.up_blocks[2], if_II.model.output_blocks[12:16])
        assert_param_count(unet.up_blocks[3], if_II.model.output_blocks[16:19])
        assert_param_count(unet.up_blocks[4], if_II.model.output_blocks[19:])
    if "-III-" in orig_path:
        assert_param_count(unet.up_blocks[0], if_II.model.output_blocks[:5])
        assert_param_count(unet.up_blocks[1], if_II.model.output_blocks[5:10])
        assert_param_count(unet.up_blocks[2], if_II.model.output_blocks[10:14])
        assert_param_count(unet.up_blocks[3], if_II.model.output_blocks[14:18])
        assert_param_count(unet.up_blocks[4], if_II.model.output_blocks[18:21])
        assert_param_count(unet.up_blocks[5], if_II.model.output_blocks[21:24])

    # out params
    assert_param_count(unet.conv_norm_out, if_II.model.out[0])
    assert_param_count(unet.conv_out, if_II.model.out[2])

    # make sure all model architecture has same param count
    assert_param_count(unet, if_II.model)
1174
+
1175
+
1176
def assert_param_count(model_1, model_2):
    """Assert that two torch modules hold the same total number of parameters."""

    def _num_params(module):
        # Total element count over every parameter tensor.
        return sum(p.numel() for p in module.parameters())

    n_params_1 = _num_params(model_1)
    n_params_2 = _num_params(model_2)
    assert n_params_1 == n_params_2, (
        f"{model_1.__class__}: {n_params_1} != {model_2.__class__}: {n_params_2}"
    )
1180
+
1181
+
1182
def superres_check_against_original(dump_path, unet_checkpoint_path):
    """Numerically compare a converted super-resolution UNet with the original.

    Loads the converted diffusers model from ``dump_path`` and the original
    DeepFloyd IF stage model from ``unet_checkpoint_path``, runs both on the
    same random inputs, and prints the summed absolute difference of the
    outputs. Requires a CUDA device. Debug/inspection helper — prints, does
    not return a value.
    """
    model_path = dump_path
    model = UNet2DConditionModel.from_pretrained(model_path)
    model.to("cuda")
    orig_path = unet_checkpoint_path

    # Pick the original stage class from the checkpoint name.
    # NOTE(review): if the path contains neither "-II-" nor "-III-",
    # `if_II_model` is never bound and the code below raises NameError.
    if "-II-" in orig_path:
        from deepfloyd_if.modules import IFStageII

        if_II_model = IFStageII(device="cuda", dir_or_name=orig_path, model_kwargs={"precision": "fp32"}).model
    elif "-III-" in orig_path:
        from deepfloyd_if.modules import IFStageIII

        if_II_model = IFStageIII(device="cuda", dir_or_name=orig_path, model_kwargs={"precision": "fp32"}).model

    batch_size = 1
    # Half the input channels: the other half is the upscaled conditioning image
    # concatenated below.
    channels = model.config.in_channels // 2
    height = model.config.sample_size
    width = model.config.sample_size
    # NOTE(review): the config-derived size is immediately overwritten with a
    # hard-coded 1024x1024 — presumably intentional for this debug run, but
    # confirm before relying on it.
    height = 1024
    width = 1024

    # Fixed seed so both models see identical random inputs.
    torch.manual_seed(0)

    latents = torch.randn((batch_size, channels, height, width), device=model.device)
    image_small = torch.randn((batch_size, channels, height // 4, width // 4), device=model.device)

    # Older torch versions lack the `antialias` kwarg on F.interpolate; pass it
    # only when supported.
    interpolate_antialias = {}
    if "antialias" in inspect.signature(F.interpolate).parameters:
        interpolate_antialias["antialias"] = True
    image_upscaled = F.interpolate(
        image_small, size=[height, width], mode="bicubic", align_corners=False, **interpolate_antialias
    )

    # Noisy latents + upscaled low-res image concatenated along channels.
    latent_model_input = torch.cat([latents, image_upscaled], dim=1).to(model.dtype)
    t = torch.tensor([5], device=model.device).to(model.dtype)

    seq_len = 64
    encoder_hidden_states = torch.randn((batch_size, seq_len, model.config.encoder_hid_dim), device=model.device).to(
        model.dtype
    )

    # Reused as both the original model's aug_steps and the diffusers
    # class_labels conditioning.
    fake_class_labels = torch.tensor([t], device=model.device).to(model.dtype)

    with torch.no_grad():
        out = if_II_model(latent_model_input, t, aug_steps=fake_class_labels, text_emb=encoder_hidden_states)

    # Free GPU memory held by the original model before running the converted one.
    if_II_model.to("cpu")
    del if_II_model
    import gc

    torch.cuda.empty_cache()
    gc.collect()
    print(50 * "=")

    with torch.no_grad():
        noise_pred = model(
            sample=latent_model_input,
            encoder_hidden_states=encoder_hidden_states,
            class_labels=fake_class_labels,
            timestep=t,
        ).sample

    print("Out shape", noise_pred.shape)
    print("Diff", (out - noise_pred).abs().sum())
1247
+
1248
+
1249
+ if __name__ == "__main__":
1250
+ main(parse_args())
diffusers/scripts/convert_lora_safetensor_to_diffusers.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024, Haofan Wang, Qixun Wang, All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Conversion script for the LoRA's safetensors checkpoints."""
17
+
18
+ import argparse
19
+
20
+ import torch
21
+ from safetensors.torch import load_file
22
+
23
+ from diffusers import StableDiffusionPipeline
24
+
25
+
26
def convert(base_model_path, checkpoint_path, LORA_PREFIX_UNET, LORA_PREFIX_TEXT_ENCODER, alpha):
    """Merge a LoRA safetensors checkpoint directly into a base pipeline's weights.

    For each LoRA up/down pair in the checkpoint, locates the matching layer in
    the base Stable Diffusion pipeline (UNet or text encoder, chosen by key
    prefix) and adds ``alpha * (up @ down)`` to that layer's weight in place.

    Args:
        base_model_path: Path/repo of the base pipeline in diffusers format.
        checkpoint_path: Path to the ``.safetensors`` LoRA file.
        LORA_PREFIX_UNET: Key prefix used for UNet weights (e.g. "lora_unet").
        LORA_PREFIX_TEXT_ENCODER: Key prefix for text-encoder weights.
        alpha: Merging ratio in ``W = W0 + alpha * deltaW``.

    Returns:
        The pipeline with LoRA weights merged in.
    """
    # load base model
    pipeline = StableDiffusionPipeline.from_pretrained(base_model_path, torch_dtype=torch.float32)

    # load LoRA weight from .safetensors
    state_dict = load_file(checkpoint_path)

    # Keys already merged (each lora_up/lora_down pair is handled once).
    visited = []

    # directly update weight in diffusers model
    for key in state_dict:
        # it is suggested to print out the key, it usually will be something like below
        # "lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight"

        # as we have set the alpha beforehand, so just skip
        if ".alpha" in key or key in visited:
            continue

        # Choose the submodule tree by prefix, then split the remaining
        # underscore-joined path into candidate attribute segments.
        if "text" in key:
            layer_infos = key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
            curr_layer = pipeline.text_encoder
        else:
            layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_")
            curr_layer = pipeline.unet

        # find the target layer
        # Greedy walk: try each segment as an attribute; on failure, glue the
        # next segment back on with "_" (module names themselves may contain
        # underscores) and retry until the full path resolves.
        temp_name = layer_infos.pop(0)
        while len(layer_infos) > -1:
            try:
                curr_layer = curr_layer.__getattr__(temp_name)
                if len(layer_infos) > 0:
                    temp_name = layer_infos.pop(0)
                elif len(layer_infos) == 0:
                    break
            except Exception:
                if len(temp_name) > 0:
                    temp_name += "_" + layer_infos.pop(0)
                else:
                    temp_name = layer_infos.pop(0)

        # Collect the (up, down) key pair — order matters: up first.
        pair_keys = []
        if "lora_down" in key:
            pair_keys.append(key.replace("lora_down", "lora_up"))
            pair_keys.append(key)
        else:
            pair_keys.append(key)
            pair_keys.append(key.replace("lora_up", "lora_down"))

        # update weight
        if len(state_dict[pair_keys[0]].shape) == 4:
            # Conv LoRA: drop the trailing 1x1 spatial dims, matmul, restore them.
            weight_up = state_dict[pair_keys[0]].squeeze(3).squeeze(2).to(torch.float32)
            weight_down = state_dict[pair_keys[1]].squeeze(3).squeeze(2).to(torch.float32)
            curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3)
        else:
            weight_up = state_dict[pair_keys[0]].to(torch.float32)
            weight_down = state_dict[pair_keys[1]].to(torch.float32)
            curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down)

        # update visited list
        for item in pair_keys:
            visited.append(item)

    return pipeline
89
+
90
+
91
if __name__ == "__main__":
    # CLI entry point: parse arguments, merge the LoRA into the base pipeline,
    # and save the merged pipeline to disk.
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--base_model_path", default=None, type=str, required=True, help="Path to the base model in diffusers format."
    )
    parser.add_argument(
        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
    )
    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
    parser.add_argument(
        "--lora_prefix_unet", default="lora_unet", type=str, help="The prefix of UNet weight in safetensors"
    )
    parser.add_argument(
        "--lora_prefix_text_encoder",
        default="lora_te",
        type=str,
        help="The prefix of text encoder weight in safetensors",
    )
    parser.add_argument("--alpha", default=0.75, type=float, help="The merging ratio in W = W0 + alpha * deltaW")
    parser.add_argument(
        "--to_safetensors", action="store_true", help="Whether to store pipeline in safetensors format or not."
    )
    parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")

    args = parser.parse_args()

    base_model_path = args.base_model_path
    checkpoint_path = args.checkpoint_path
    dump_path = args.dump_path
    lora_prefix_unet = args.lora_prefix_unet
    lora_prefix_text_encoder = args.lora_prefix_text_encoder
    alpha = args.alpha

    pipe = convert(base_model_path, checkpoint_path, lora_prefix_unet, lora_prefix_text_encoder, alpha)

    pipe = pipe.to(args.device)
    # Consistency fix: the original extracted `dump_path` from args but then
    # used `args.dump_path` directly, leaving the local unused. Use the local.
    pipe.save_pretrained(dump_path, safe_serialization=args.to_safetensors)
diffusers/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2025 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Conversion script for the NCSNPP checkpoints."""
16
+
17
+ import argparse
18
+ import json
19
+
20
+ import torch
21
+
22
+ from diffusers import ScoreSdeVePipeline, ScoreSdeVeScheduler, UNet2DModel
23
+
24
+
25
def convert_ncsnpp_checkpoint(checkpoint, config):
    """
    Convert an original NCSN++ checkpoint (flat ``all_modules.N`` keys) into the
    state dict of a diffusers ``UNet2DModel`` built from ``config``.

    The original checkpoint enumerates modules in forward order, so this walks
    the new architecture in the same order while advancing ``module_index``
    through ``all_modules``. Returns the populated state dict.
    """
    new_model_architecture = UNet2DModel(**config)
    # Module 0 is the Gaussian Fourier time projection; 1-2 the time MLP;
    # 3 the input convolution.
    new_model_architecture.time_proj.W.data = checkpoint["all_modules.0.W"].data
    new_model_architecture.time_proj.weight.data = checkpoint["all_modules.0.W"].data
    new_model_architecture.time_embedding.linear_1.weight.data = checkpoint["all_modules.1.weight"].data
    new_model_architecture.time_embedding.linear_1.bias.data = checkpoint["all_modules.1.bias"].data

    new_model_architecture.time_embedding.linear_2.weight.data = checkpoint["all_modules.2.weight"].data
    new_model_architecture.time_embedding.linear_2.bias.data = checkpoint["all_modules.2.bias"].data

    new_model_architecture.conv_in.weight.data = checkpoint["all_modules.3.weight"].data
    new_model_architecture.conv_in.bias.data = checkpoint["all_modules.3.bias"].data

    # Output head weights live in the last four checkpoint entries.
    # NOTE(review): these four assignments are overwritten again at the bottom
    # of this function via `all_modules.{module_index}` — seemingly redundant.
    new_model_architecture.conv_norm_out.weight.data = checkpoint[list(checkpoint.keys())[-4]].data
    new_model_architecture.conv_norm_out.bias.data = checkpoint[list(checkpoint.keys())[-3]].data
    new_model_architecture.conv_out.weight.data = checkpoint[list(checkpoint.keys())[-2]].data
    new_model_architecture.conv_out.bias.data = checkpoint[list(checkpoint.keys())[-1]].data

    # Running cursor into the flat `all_modules` list; every copied submodule
    # advances it by one.
    module_index = 4

    def set_attention_weights(new_layer, old_checkpoint, index):
        # Map the NIN (1x1) projections; weights are stored transposed (`.T`).
        new_layer.query.weight.data = old_checkpoint[f"all_modules.{index}.NIN_0.W"].data.T
        new_layer.key.weight.data = old_checkpoint[f"all_modules.{index}.NIN_1.W"].data.T
        new_layer.value.weight.data = old_checkpoint[f"all_modules.{index}.NIN_2.W"].data.T

        new_layer.query.bias.data = old_checkpoint[f"all_modules.{index}.NIN_0.b"].data
        new_layer.key.bias.data = old_checkpoint[f"all_modules.{index}.NIN_1.b"].data
        new_layer.value.bias.data = old_checkpoint[f"all_modules.{index}.NIN_2.b"].data

        new_layer.proj_attn.weight.data = old_checkpoint[f"all_modules.{index}.NIN_3.W"].data.T
        new_layer.proj_attn.bias.data = old_checkpoint[f"all_modules.{index}.NIN_3.b"].data

        new_layer.group_norm.weight.data = old_checkpoint[f"all_modules.{index}.GroupNorm_0.weight"].data
        new_layer.group_norm.bias.data = old_checkpoint[f"all_modules.{index}.GroupNorm_0.bias"].data

    def set_resnet_weights(new_layer, old_checkpoint, index):
        # Copy the two conv+norm pairs and the time-embedding projection of a
        # resnet block.
        new_layer.conv1.weight.data = old_checkpoint[f"all_modules.{index}.Conv_0.weight"].data
        new_layer.conv1.bias.data = old_checkpoint[f"all_modules.{index}.Conv_0.bias"].data
        new_layer.norm1.weight.data = old_checkpoint[f"all_modules.{index}.GroupNorm_0.weight"].data
        new_layer.norm1.bias.data = old_checkpoint[f"all_modules.{index}.GroupNorm_0.bias"].data

        new_layer.conv2.weight.data = old_checkpoint[f"all_modules.{index}.Conv_1.weight"].data
        new_layer.conv2.bias.data = old_checkpoint[f"all_modules.{index}.Conv_1.bias"].data
        new_layer.norm2.weight.data = old_checkpoint[f"all_modules.{index}.GroupNorm_1.weight"].data
        new_layer.norm2.bias.data = old_checkpoint[f"all_modules.{index}.GroupNorm_1.bias"].data

        new_layer.time_emb_proj.weight.data = old_checkpoint[f"all_modules.{index}.Dense_0.weight"].data
        new_layer.time_emb_proj.bias.data = old_checkpoint[f"all_modules.{index}.Dense_0.bias"].data

        # A shortcut conv only exists when channel count or resolution changes.
        if new_layer.in_channels != new_layer.out_channels or new_layer.up or new_layer.down:
            new_layer.conv_shortcut.weight.data = old_checkpoint[f"all_modules.{index}.Conv_2.weight"].data
            new_layer.conv_shortcut.bias.data = old_checkpoint[f"all_modules.{index}.Conv_2.bias"].data

    # Down path: resnet (+ optional attention) per layer, then the downsampling
    # resnet and its skip convolution.
    for i, block in enumerate(new_model_architecture.downsample_blocks):
        has_attentions = hasattr(block, "attentions")
        for j in range(len(block.resnets)):
            set_resnet_weights(block.resnets[j], checkpoint, module_index)
            module_index += 1
            if has_attentions:
                set_attention_weights(block.attentions[j], checkpoint, module_index)
                module_index += 1

        if hasattr(block, "downsamplers") and block.downsamplers is not None:
            set_resnet_weights(block.resnet_down, checkpoint, module_index)
            module_index += 1
            block.skip_conv.weight.data = checkpoint[f"all_modules.{module_index}.Conv_0.weight"].data
            block.skip_conv.bias.data = checkpoint[f"all_modules.{module_index}.Conv_0.bias"].data
            module_index += 1

    # Mid block: resnet -> attention -> resnet.
    set_resnet_weights(new_model_architecture.mid_block.resnets[0], checkpoint, module_index)
    module_index += 1
    set_attention_weights(new_model_architecture.mid_block.attentions[0], checkpoint, module_index)
    module_index += 1
    set_resnet_weights(new_model_architecture.mid_block.resnets[1], checkpoint, module_index)
    module_index += 1

    # Up path: resnets, at most one attention per block, then skip norm/conv and
    # the upsampling resnet.
    for i, block in enumerate(new_model_architecture.up_blocks):
        has_attentions = hasattr(block, "attentions")
        for j in range(len(block.resnets)):
            set_resnet_weights(block.resnets[j], checkpoint, module_index)
            module_index += 1
        if has_attentions:
            set_attention_weights(
                block.attentions[0], checkpoint, module_index
            )  # why can there only be a single attention layer for up?
            module_index += 1

        if hasattr(block, "resnet_up") and block.resnet_up is not None:
            block.skip_norm.weight.data = checkpoint[f"all_modules.{module_index}.weight"].data
            block.skip_norm.bias.data = checkpoint[f"all_modules.{module_index}.bias"].data
            module_index += 1
            block.skip_conv.weight.data = checkpoint[f"all_modules.{module_index}.weight"].data
            block.skip_conv.bias.data = checkpoint[f"all_modules.{module_index}.bias"].data
            module_index += 1
            set_resnet_weights(block.resnet_up, checkpoint, module_index)
            module_index += 1

    # Output head (final norm + conv), read at the cursor's final position.
    new_model_architecture.conv_norm_out.weight.data = checkpoint[f"all_modules.{module_index}.weight"].data
    new_model_architecture.conv_norm_out.bias.data = checkpoint[f"all_modules.{module_index}.bias"].data
    module_index += 1
    new_model_architecture.conv_out.weight.data = checkpoint[f"all_modules.{module_index}.weight"].data
    new_model_architecture.conv_out.bias.data = checkpoint[f"all_modules.{module_index}.bias"].data

    return new_model_architecture.state_dict()
132
+
133
+
134
if __name__ == "__main__":
    # CLI entry point: load the original NCSN++ checkpoint + config, convert,
    # and save either a full ScoreSdeVe pipeline or just the model.
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--checkpoint_path",
        default="/Users/arthurzucker/Work/diffusers/ArthurZ/diffusion_pytorch_model.bin",
        type=str,
        required=False,
        help="Path to the checkpoint to convert.",
    )

    parser.add_argument(
        "--config_file",
        default="/Users/arthurzucker/Work/diffusers/ArthurZ/config.json",
        type=str,
        required=False,
        help="The config json file corresponding to the architecture.",
    )

    parser.add_argument(
        "--dump_path",
        default="/Users/arthurzucker/Work/diffusers/ArthurZ/diffusion_model_new.pt",
        type=str,
        required=False,
        help="Path to the output model.",
    )

    args = parser.parse_args()

    checkpoint = torch.load(args.checkpoint_path, map_location="cpu")

    # Idiom fix: json.load(f) instead of json.loads(f.read()).
    with open(args.config_file) as f:
        config = json.load(f)

    converted_checkpoint = convert_ncsnpp_checkpoint(
        checkpoint,
        config,
    )

    # The original config may carry an "sde" section that UNet2DModel does not accept.
    if "sde" in config:
        del config["sde"]

    model = UNet2DModel(**config)
    model.load_state_dict(converted_checkpoint)

    try:
        scheduler = ScoreSdeVeScheduler.from_config("/".join(args.checkpoint_path.split("/")[:-1]))

        pipe = ScoreSdeVePipeline(unet=model, scheduler=scheduler)
        pipe.save_pretrained(args.dump_path)
    except Exception:
        # Fix: narrowed the original bare `except:` so that KeyboardInterrupt /
        # SystemExit still propagate. Best-effort fallback: when no scheduler
        # config is found next to the checkpoint, save the bare model instead.
        model.save_pretrained(args.dump_path)
diffusers/scripts/convert_omnigen_to_diffusers.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+
4
+ import torch
5
+ from huggingface_hub import snapshot_download
6
+ from safetensors.torch import load_file
7
+ from transformers import AutoTokenizer
8
+
9
+ from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, OmniGenPipeline, OmniGenTransformer2DModel
10
+
11
+
12
def main(args):
    """Convert the original OmniGen checkpoint into a diffusers ``OmniGenPipeline``.

    Downloads the checkpoint from the Hub when ``args.origin_ckpt_path`` is not
    a local directory, remaps its state-dict keys to the diffusers
    ``OmniGenTransformer2DModel`` layout, assembles the full pipeline
    (transformer + VAE + tokenizer + scheduler), and saves it to
    ``args.dump_path``.
    """
    # checkpoint from https://huggingface.co/Shitao/OmniGen-v1

    if not os.path.exists(args.origin_ckpt_path):
        print("Model not found, downloading...")
        cache_folder = os.getenv("HF_HUB_CACHE")
        args.origin_ckpt_path = snapshot_download(
            repo_id=args.origin_ckpt_path,
            cache_dir=cache_folder,
            ignore_patterns=["flax_model.msgpack", "rust_model.ot", "tf_model.h5", "model.pt"],
        )
        print(f"Downloaded model to {args.origin_ckpt_path}")

    ckpt = os.path.join(args.origin_ckpt_path, "model.safetensors")
    ckpt = load_file(ckpt, device="cpu")

    # Direct one-to-one renames from original keys to diffusers keys.
    mapping_dict = {
        "pos_embed": "patch_embedding.pos_embed",
        "x_embedder.proj.weight": "patch_embedding.output_image_proj.weight",
        "x_embedder.proj.bias": "patch_embedding.output_image_proj.bias",
        "input_x_embedder.proj.weight": "patch_embedding.input_image_proj.weight",
        "input_x_embedder.proj.bias": "patch_embedding.input_image_proj.bias",
        "final_layer.adaLN_modulation.1.weight": "norm_out.linear.weight",
        "final_layer.adaLN_modulation.1.bias": "norm_out.linear.bias",
        "final_layer.linear.weight": "proj_out.weight",
        "final_layer.linear.bias": "proj_out.bias",
        "time_token.mlp.0.weight": "time_token.linear_1.weight",
        "time_token.mlp.0.bias": "time_token.linear_1.bias",
        "time_token.mlp.2.weight": "time_token.linear_2.weight",
        "time_token.mlp.2.bias": "time_token.linear_2.bias",
        "t_embedder.mlp.0.weight": "t_embedder.linear_1.weight",
        "t_embedder.mlp.0.bias": "t_embedder.linear_1.bias",
        "t_embedder.mlp.2.weight": "t_embedder.linear_2.weight",
        "t_embedder.mlp.2.bias": "t_embedder.linear_2.bias",
        "llm.embed_tokens.weight": "embed_tokens.weight",
    }

    converted_state_dict = {}
    for k, v in ckpt.items():
        if k in mapping_dict:
            converted_state_dict[mapping_dict[k]] = v
        elif "qkv" in k:
            # Fused qkv projection is split into separate q/k/v weights;
            # k.split('.')[2] is the layer index in keys like "llm.layers.<i>...".
            to_q, to_k, to_v = v.chunk(3)
            converted_state_dict[f"layers.{k.split('.')[2]}.self_attn.to_q.weight"] = to_q
            converted_state_dict[f"layers.{k.split('.')[2]}.self_attn.to_k.weight"] = to_k
            converted_state_dict[f"layers.{k.split('.')[2]}.self_attn.to_v.weight"] = to_v
        elif "o_proj" in k:
            converted_state_dict[f"layers.{k.split('.')[2]}.self_attn.to_out.0.weight"] = v
        else:
            # Remaining keys only need the leading "llm." prefix stripped.
            converted_state_dict[k[4:]] = v

    # rope_scaling factors copied verbatim from the original OmniGen config.
    transformer = OmniGenTransformer2DModel(
        rope_scaling={
            "long_factor": [
                1.0299999713897705,
                1.0499999523162842,
                1.0499999523162842,
                1.0799999237060547,
                1.2299998998641968,
                1.2299998998641968,
                1.2999999523162842,
                1.4499999284744263,
                1.5999999046325684,
                1.6499998569488525,
                1.8999998569488525,
                2.859999895095825,
                3.68999981880188,
                5.419999599456787,
                5.489999771118164,
                5.489999771118164,
                9.09000015258789,
                11.579999923706055,
                15.65999984741211,
                15.769999504089355,
                15.789999961853027,
                18.360000610351562,
                21.989999771118164,
                23.079999923706055,
                30.009998321533203,
                32.35000228881836,
                32.590003967285156,
                35.56000518798828,
                39.95000457763672,
                53.840003967285156,
                56.20000457763672,
                57.95000457763672,
                59.29000473022461,
                59.77000427246094,
                59.920005798339844,
                61.190006256103516,
                61.96000671386719,
                62.50000762939453,
                63.3700065612793,
                63.48000717163086,
                63.48000717163086,
                63.66000747680664,
                63.850006103515625,
                64.08000946044922,
                64.760009765625,
                64.80001068115234,
                64.81001281738281,
                64.81001281738281,
            ],
            "short_factor": [
                1.05,
                1.05,
                1.05,
                1.1,
                1.1,
                1.1,
                1.2500000000000002,
                1.2500000000000002,
                1.4000000000000004,
                1.4500000000000004,
                1.5500000000000005,
                1.8500000000000008,
                1.9000000000000008,
                2.000000000000001,
                2.000000000000001,
                2.000000000000001,
                2.000000000000001,
                2.000000000000001,
                2.000000000000001,
                2.000000000000001,
                2.000000000000001,
                2.000000000000001,
                2.000000000000001,
                2.000000000000001,
                2.000000000000001,
                2.000000000000001,
                2.000000000000001,
                2.000000000000001,
                2.000000000000001,
                2.000000000000001,
                2.000000000000001,
                2.000000000000001,
                2.1000000000000005,
                2.1000000000000005,
                2.2,
                2.3499999999999996,
                2.3499999999999996,
                2.3499999999999996,
                2.3499999999999996,
                2.3999999999999995,
                2.3999999999999995,
                2.6499999999999986,
                2.6999999999999984,
                2.8999999999999977,
                2.9499999999999975,
                3.049999999999997,
                3.049999999999997,
                3.049999999999997,
            ],
            "type": "su",
        },
        patch_size=2,
        in_channels=4,
        pos_embed_max_size=192,
    )
    # strict=True: fail loudly if any key failed to remap.
    transformer.load_state_dict(converted_state_dict, strict=True)
    transformer.to(torch.bfloat16)

    num_model_params = sum(p.numel() for p in transformer.parameters())
    print(f"Total number of transformer parameters: {num_model_params}")

    scheduler = FlowMatchEulerDiscreteScheduler(invert_sigmas=True, num_train_timesteps=1)

    # VAE is kept in float32 while the transformer runs in bfloat16.
    vae = AutoencoderKL.from_pretrained(os.path.join(args.origin_ckpt_path, "vae"), torch_dtype=torch.float32)

    tokenizer = AutoTokenizer.from_pretrained(args.origin_ckpt_path)

    pipeline = OmniGenPipeline(tokenizer=tokenizer, transformer=transformer, vae=vae, scheduler=scheduler)
    pipeline.save_pretrained(args.dump_path)
186
+
187
if __name__ == "__main__":
    # CLI wrapper: both arguments are optional and default to converting the
    # official OmniGen-v1 checkpoint into "OmniGen-v1-diffusers".
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--origin_ckpt_path",
        type=str,
        default="Shitao/OmniGen-v1",
        required=False,
        help="Path to the checkpoint to convert.",
    )
    arg_parser.add_argument(
        "--dump_path",
        type=str,
        default="OmniGen-v1-diffusers",
        required=False,
        help="Path to the output pipeline.",
    )
    main(arg_parser.parse_args())
diffusers/scripts/convert_original_audioldm2_to_diffusers.py ADDED
@@ -0,0 +1,1135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2025 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Conversion script for the AudioLDM2 checkpoints."""
16
+
17
+ import argparse
18
+ import re
19
+ from typing import List, Union
20
+
21
+ import torch
22
+ import yaml
23
+ from transformers import (
24
+ AutoFeatureExtractor,
25
+ AutoTokenizer,
26
+ ClapConfig,
27
+ ClapModel,
28
+ GPT2Config,
29
+ GPT2Model,
30
+ SpeechT5HifiGan,
31
+ SpeechT5HifiGanConfig,
32
+ T5Config,
33
+ T5EncoderModel,
34
+ )
35
+
36
+ from diffusers import (
37
+ AudioLDM2Pipeline,
38
+ AudioLDM2ProjectionModel,
39
+ AudioLDM2UNet2DConditionModel,
40
+ AutoencoderKL,
41
+ DDIMScheduler,
42
+ DPMSolverMultistepScheduler,
43
+ EulerAncestralDiscreteScheduler,
44
+ EulerDiscreteScheduler,
45
+ HeunDiscreteScheduler,
46
+ LMSDiscreteScheduler,
47
+ PNDMScheduler,
48
+ )
49
+ from diffusers.utils import is_safetensors_available
50
+ from diffusers.utils.import_utils import BACKENDS_MAPPING
51
+
52
+
53
# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.shave_segments
def shave_segments(path, n_shave_prefix_segments=1):
    """
    Drop dot-separated segments from `path`.

    A non-negative count removes that many leading segments; a negative count
    removes that many trailing segments.
    """
    segments = path.split(".")
    if n_shave_prefix_segments >= 0:
        kept = segments[n_shave_prefix_segments:]
    else:
        kept = segments[:n_shave_prefix_segments]
    return ".".join(kept)
62
+
63
+
64
# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_resnet_paths
def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
    """
    Updates paths inside resnets to the new naming scheme (local renaming)
    """
    # Ordered substring renames from the LDM resnet layout to the diffusers one.
    renames = (
        ("in_layers.0", "norm1"),
        ("in_layers.2", "conv1"),
        ("out_layers.0", "norm2"),
        ("out_layers.3", "conv2"),
        ("emb_layers.1", "time_emb_proj"),
        ("skip_connection", "conv_shortcut"),
    )
    mapping = []
    for old_item in old_list:
        new_item = old_item
        for src, dst in renames:
            new_item = new_item.replace(src, dst)
        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
        mapping.append({"old": old_item, "new": new_item})
    return mapping
85
+
86
+
87
# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_vae_resnet_paths
def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
    """
    Updates paths inside resnets to the new naming scheme (local renaming)
    """
    mapping = []
    for old_item in old_list:
        # The only VAE-resnet rename is the shortcut convolution.
        renamed = old_item.replace("nin_shortcut", "conv_shortcut")
        renamed = shave_segments(renamed, n_shave_prefix_segments=n_shave_prefix_segments)
        mapping.append({"old": old_item, "new": renamed})
    return mapping
102
+
103
+
104
# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_attention_paths
def renew_attention_paths(old_list):
    """
    Updates paths inside attentions to the new naming scheme (local renaming)
    """
    # The attention key names already match the diffusers scheme, so this is an
    # identity mapping kept for symmetry with the other renew_* helpers.
    return [{"old": old_item, "new": old_item} for old_item in old_list]
124
+
125
+
126
def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
    """
    Updates paths inside attentions to the new naming scheme (local renaming)
    """
    # Ordered substring renames from the LDM VAE attention layout to diffusers.
    renames = (
        ("norm.weight", "group_norm.weight"),
        ("norm.bias", "group_norm.bias"),
        ("q.weight", "to_q.weight"),
        ("q.bias", "to_q.bias"),
        ("k.weight", "to_k.weight"),
        ("k.bias", "to_k.bias"),
        ("v.weight", "to_v.weight"),
        ("v.bias", "to_v.bias"),
        ("proj_out.weight", "to_out.0.weight"),
        ("proj_out.bias", "to_out.0.bias"),
    )
    mapping = []
    for old_item in old_list:
        new_item = old_item
        for src, dst in renames:
            new_item = new_item.replace(src, dst)
        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
        mapping.append({"old": old_item, "new": new_item})
    return mapping
154
+
155
+
156
def assign_to_checkpoint(
    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
):
    """
    This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
    attention layers, and takes into account additional replacements that may arise.

    Assigns the weights to the new checkpoint.
    """
    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."

    # Split fused attention tensors into separate query/key/value entries.
    if attention_paths_to_split is not None:
        for path, path_map in attention_paths_to_split.items():
            old_tensor = old_checkpoint[path]
            channels = old_tensor.shape[0] // 3

            # 3-D tensors keep an explicit channel dimension; others are flattened.
            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)

            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3

            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
            query, key, value = old_tensor.split(channels // num_heads, dim=1)

            checkpoint[path_map["query"]] = query.reshape(target_shape)
            checkpoint[path_map["key"]] = key.reshape(target_shape)
            checkpoint[path_map["value"]] = value.reshape(target_shape)

    for path in paths:
        new_path = path["new"]

        # Skip keys that were already assigned by the q/k/v split above.
        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
            continue

        for replacement in additional_replacements or []:
            new_path = new_path.replace(replacement["old"], replacement["new"])

        source = old_checkpoint[path["old"]]
        # proj_attn.weight has to be converted from conv 1D to linear
        checkpoint[new_path] = source[:, :, 0] if "proj_attn.weight" in new_path else source
200
+
201
+
202
def conv_attn_to_linear(checkpoint):
    """Squeeze 1x1-conv attention weights down to linear-layer shape, in place."""
    qkv_suffixes = ("to_q.weight", "to_k.weight", "to_v.weight")
    proj_suffix = "to_out.0.weight"
    for key in list(checkpoint.keys()):
        parts = key.split(".")
        matches_qkv = ".".join(parts[-2:]) in qkv_suffixes
        matches_proj = ".".join(parts[-3:]) == proj_suffix
        # Only reshape tensors that still carry trailing conv dimensions.
        if (matches_qkv or matches_proj) and checkpoint[key].ndim > 2:
            checkpoint[key] = checkpoint[key].squeeze()
210
+
211
+
212
def create_unet_diffusers_config(original_config, image_size: int):
    """
    Creates a UNet config for diffusers based on the config of the original AudioLDM2 model.
    """
    unet_params = original_config["model"]["params"]["unet_config"]["params"]
    vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]

    block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]]
    num_blocks = len(block_out_channels)
    attn_resolutions = unet_params["attention_resolutions"]

    # Walk down the UNet: the resolution doubles after every block except the last,
    # and blocks whose resolution is in `attention_resolutions` get cross-attention.
    down_block_types = []
    resolution = 1
    for block_idx in range(num_blocks):
        down_block_types.append("CrossAttnDownBlock2D" if resolution in attn_resolutions else "DownBlock2D")
        if block_idx != num_blocks - 1:
            resolution *= 2

    # Walk back up, halving the resolution after each block.
    up_block_types = []
    for _ in range(num_blocks):
        up_block_types.append("CrossAttnUpBlock2D" if resolution in attn_resolutions else "UpBlock2D")
        resolution //= 2

    vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1)

    cross_attention_dim = list(unet_params["context_dim"]) if "context_dim" in unet_params else block_out_channels
    if len(cross_attention_dim) > 1:
        # require two or more cross-attention layers per-block, each of different dimension
        cross_attention_dim = [cross_attention_dim for _ in range(num_blocks)]

    return {
        "sample_size": image_size // vae_scale_factor,
        "in_channels": unet_params["in_channels"],
        "out_channels": unet_params["out_channels"],
        "down_block_types": tuple(down_block_types),
        "up_block_types": tuple(up_block_types),
        "block_out_channels": tuple(block_out_channels),
        "layers_per_block": unet_params["num_res_blocks"],
        "transformer_layers_per_block": unet_params["transformer_depth"],
        "cross_attention_dim": tuple(cross_attention_dim),
    }
255
+
256
+
257
# Adapted from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_vae_diffusers_config
def create_vae_diffusers_config(original_config, checkpoint, image_size: int):
    """
    Creates a VAE config for diffusers based on the config of the original AudioLDM2 model. Compared to the original
    Stable Diffusion conversion, this function passes a *learnt* VAE scaling factor to the diffusers VAE.
    """
    model_params = original_config["model"]["params"]
    vae_params = model_params["first_stage_config"]["params"]["ddconfig"]
    # Accessed only to fail fast if the original config is missing `embed_dim`.
    _ = model_params["first_stage_config"]["params"]["embed_dim"]

    block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]]
    num_blocks = len(block_out_channels)

    # Checkpoints trained with `scale_by_std` carry a learnt scale factor;
    # otherwise fall back to the Stable Diffusion default.
    scaling_factor = checkpoint["scale_factor"] if "scale_by_std" in model_params else 0.18215

    return {
        "sample_size": image_size,
        "in_channels": vae_params["in_channels"],
        "out_channels": vae_params["out_ch"],
        "down_block_types": ("DownEncoderBlock2D",) * num_blocks,
        "up_block_types": ("UpDecoderBlock2D",) * num_blocks,
        "block_out_channels": tuple(block_out_channels),
        "latent_channels": vae_params["z_channels"],
        "layers_per_block": vae_params["num_res_blocks"],
        "scaling_factor": float(scaling_factor),
    }
284
+
285
+
286
# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_diffusers_schedular
def create_diffusers_schedular(original_config):
    """Build a DDIM scheduler from the original LDM training hyper-parameters."""
    model_params = original_config["model"]["params"]
    return DDIMScheduler(
        num_train_timesteps=model_params["timesteps"],
        beta_start=model_params["linear_start"],
        beta_end=model_params["linear_end"],
        beta_schedule="scaled_linear",
    )
295
+
296
+
297
def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False):
    """
    Takes a state dict and a config, and returns a converted UNet checkpoint.

    Args:
        checkpoint: full original LDM state dict. NOTE: mutated in place — the
            UNet weights are `pop`ped out of it.
        config: diffusers UNet config dict (as produced by `create_unet_diffusers_config`).
        path: checkpoint path, used only in the informational EMA message.
        extract_ema: when True and the checkpoint contains EMA weights, extract
            the EMA copies instead of the training weights.
    """

    # extract state_dict for UNet
    unet_state_dict = {}
    keys = list(checkpoint.keys())

    unet_key = "model.diffusion_model."
    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
    if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
        print(f"Checkpoint {path} has both EMA and non-EMA weights.")
        print(
            "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
            " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
        )
        for key in keys:
            if key.startswith("model.diffusion_model"):
                # EMA weights are stored under flattened names (dots removed from
                # the original key), prefixed with `model_ema.`.
                flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
    else:
        if sum(k.startswith("model_ema") for k in keys) > 100:
            print(
                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
            )

        # strip the unet prefix from the weight names
        for key in keys:
            if key.startswith(unet_key):
                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)

    new_checkpoint = {}

    # Time embedding and the input/output convolutions map one-to-one.
    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]

    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]

    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]

    # Retrieves the keys for the input blocks only
    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
    input_blocks = {
        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}." in key]
        for layer_id in range(num_input_blocks)
    }

    # Retrieves the keys for the middle blocks only
    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
    middle_blocks = {
        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}." in key]
        for layer_id in range(num_middle_blocks)
    }

    # Retrieves the keys for the output blocks only
    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
    output_blocks = {
        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}." in key]
        for layer_id in range(num_output_blocks)
    }

    # Check how many Transformer blocks we have per layer
    # NOTE(review): `num_attention_layers` is only bound when `cross_attention_dim`
    # is a list/tuple; any other config would raise NameError below — confirm the
    # configs produced by `create_unet_diffusers_config` always satisfy this.
    if isinstance(config.get("cross_attention_dim"), (list, tuple)):
        if isinstance(config["cross_attention_dim"][0], (list, tuple)):
            # in this case we have multiple cross-attention layers per-block
            num_attention_layers = len(config.get("cross_attention_dim")[0])
        else:
            num_attention_layers = 1

    if config.get("extra_self_attn_layer"):
        num_attention_layers += 1

    for i in range(1, num_input_blocks):
        # Original layout: `layers_per_block` resnets followed by a downsampler.
        block_id = (i - 1) // (config["layers_per_block"] + 1)
        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)

        # Sub-index 0 holds the resnet (and possibly the `op` downsampler);
        # everything else in the block is attention.
        resnets = [
            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
        ]
        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.0" not in key]

        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
                f"input_blocks.{i}.0.op.weight"
            )
            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
                f"input_blocks.{i}.0.op.bias"
            )

        paths = renew_resnet_paths(resnets)
        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
        assign_to_checkpoint(
            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
        )

        if len(attentions):
            paths = renew_attention_paths(attentions)
            # One rename per attention layer: AudioLDM2 interleaves several
            # attentions per resnet position.
            meta_path = [
                {
                    "old": f"input_blocks.{i}.{1 + layer_id}",
                    "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id * num_attention_layers + layer_id}",
                }
                for layer_id in range(num_attention_layers)
            ]
            assign_to_checkpoint(
                paths, new_checkpoint, unet_state_dict, additional_replacements=meta_path, config=config
            )

    # Middle block: first and last entries are resnets, everything between is attention.
    resnet_0 = middle_blocks[0]
    resnet_1 = middle_blocks[num_middle_blocks - 1]

    resnet_0_paths = renew_resnet_paths(resnet_0)
    meta_path = {"old": "middle_block.0", "new": "mid_block.resnets.0"}
    assign_to_checkpoint(
        resnet_0_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
    )

    resnet_1_paths = renew_resnet_paths(resnet_1)
    meta_path = {"old": f"middle_block.{len(middle_blocks) - 1}", "new": "mid_block.resnets.1"}
    assign_to_checkpoint(
        resnet_1_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
    )

    for i in range(1, num_middle_blocks - 1):
        attentions = middle_blocks[i]
        attentions_paths = renew_attention_paths(attentions)
        meta_path = {"old": f"middle_block.{i}", "new": f"mid_block.attentions.{i - 1}"}
        assign_to_checkpoint(
            attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
        )

    for i in range(num_output_blocks):
        block_id = i // (config["layers_per_block"] + 1)
        layer_in_block_id = i % (config["layers_per_block"] + 1)
        # Group this output block's keys by their sub-layer index.
        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
        output_block_list = {}

        for layer in output_block_layers:
            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
            if layer_id in output_block_list:
                output_block_list[layer_id].append(layer_name)
            else:
                output_block_list[layer_id] = [layer_name]

        if len(output_block_list) > 1:
            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.0" not in key]

            paths = renew_resnet_paths(resnets)

            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
            assign_to_checkpoint(
                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
            )

            # A sub-layer holding exactly [conv.bias, conv.weight] is the upsampler.
            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
            if ["conv.bias", "conv.weight"] in output_block_list.values():
                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
                    f"output_blocks.{i}.{index}.conv.weight"
                ]
                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
                    f"output_blocks.{i}.{index}.conv.bias"
                ]

                attentions.remove(f"output_blocks.{i}.{index}.conv.bias")
                attentions.remove(f"output_blocks.{i}.{index}.conv.weight")

                # Clear attentions as they have been attributed above.
                if len(attentions) == 2:
                    attentions = []

            if len(attentions):
                paths = renew_attention_paths(attentions)
                meta_path = [
                    {
                        "old": f"output_blocks.{i}.{1 + layer_id}",
                        "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id * num_attention_layers + layer_id}",
                    }
                    for layer_id in range(num_attention_layers)
                ]
                assign_to_checkpoint(
                    paths, new_checkpoint, unet_state_dict, additional_replacements=meta_path, config=config
                )
        else:
            # Attention-free output block: only resnet weights to rename.
            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
            for path in resnet_0_paths:
                old_path = ".".join(["output_blocks", str(i), path["old"]])
                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])

                new_checkpoint[new_path] = unet_state_dict[old_path]

    return new_checkpoint
498
+
499
+
500
def convert_ldm_vae_checkpoint(checkpoint, config):
    """
    Convert the original LDM VAE state dict (keys under `first_stage_model.`)
    into the diffusers `AutoencoderKL` layout.

    Args:
        checkpoint: full original state dict (read, not mutated — keys are copied out).
        config: diffusers VAE config dict; forwarded to `assign_to_checkpoint`.
    """
    # extract state dict for VAE
    vae_state_dict = {}
    vae_key = "first_stage_model."
    keys = list(checkpoint.keys())
    for key in keys:
        if key.startswith(vae_key):
            vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)

    new_checkpoint = {}

    # Top-level encoder/decoder convolutions and norms map one-to-one.
    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]

    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]

    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]

    # Retrieves the keys for the encoder down blocks only
    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
    down_blocks = {
        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
    }

    # Retrieves the keys for the decoder up blocks only
    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
    up_blocks = {
        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
    }

    for i in range(num_down_blocks):
        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]

        # Downsampler weights (if present) are popped so they are not treated as resnets.
        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
                f"encoder.down.{i}.downsample.conv.weight"
            )
            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
                f"encoder.down.{i}.downsample.conv.bias"
            )

        paths = renew_vae_resnet_paths(resnets)
        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)

    # Encoder mid block: two resnets plus one attention.
    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
    num_mid_res_blocks = 2
    for i in range(1, num_mid_res_blocks + 1):
        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]

        paths = renew_vae_resnet_paths(resnets)
        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)

    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
    paths = renew_vae_attention_paths(mid_attentions)
    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
    conv_attn_to_linear(new_checkpoint)

    for i in range(num_up_blocks):
        # Decoder up blocks are stored in reverse order relative to diffusers.
        block_id = num_up_blocks - 1 - i
        resnets = [
            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
        ]

        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
                f"decoder.up.{block_id}.upsample.conv.weight"
            ]
            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
                f"decoder.up.{block_id}.upsample.conv.bias"
            ]

        paths = renew_vae_resnet_paths(resnets)
        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)

    # Decoder mid block: same structure as the encoder mid block.
    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
    num_mid_res_blocks = 2
    for i in range(1, num_mid_res_blocks + 1):
        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]

        paths = renew_vae_resnet_paths(resnets)
        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)

    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
    paths = renew_vae_attention_paths(mid_attentions)
    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
    conv_attn_to_linear(new_checkpoint)
    return new_checkpoint
605
+
606
+
607
# Substring renames applied (in insertion order) to every key of the original
# CLAP state dict to match the `transformers` ClapModel naming.
CLAP_KEYS_TO_MODIFY_MAPPING = {
    "text_branch": "text_model",
    "audio_branch": "audio_model.audio_encoder",
    "attn": "attention.self",
    "self.proj": "output.dense",
    "attention.self_mask": "attn_mask",
    "mlp.fc1": "intermediate.dense",
    "mlp.fc2": "output.dense",
    "norm1": "layernorm_before",
    "norm2": "layernorm_after",
    "bn0": "batch_norm",
}

# Keys containing any of these substrings are dropped from the converted
# checkpoint (pre-/post-processing modules and the classification head).
CLAP_KEYS_TO_IGNORE = [
    "text_transform",
    "audio_transform",
    "stft",
    "logmel_extractor",
    "tscam_conv",
    "head",
    "attn_mask",
]

# Keys that `transformers` is expected to report as missing after loading the
# converted state dict; these are buffers the model re-creates itself.
CLAP_EXPECTED_MISSING_KEYS = ["text_model.embeddings.token_type_ids"]
631
+
632
+
633
def convert_open_clap_checkpoint(checkpoint):
    """
    Takes a state dict and returns a converted CLAP checkpoint.

    Args:
        checkpoint: combined AudioLDM2 state dict; only keys under the
            `clap.model.` prefix are converted, everything else is ignored.

    Returns:
        dict: state dict with keys renamed to the `transformers` CLAP layout,
        with fused audio-branch `qkv` weights split into query/key/value.
    """
    # extract state dict for CLAP text embedding model, discarding the audio component
    model_state_dict = {}
    model_key = "clap.model."
    keys = list(checkpoint.keys())
    for key in keys:
        if key.startswith(model_key):
            model_state_dict[key.replace(model_key, "")] = checkpoint.get(key)

    new_checkpoint = {}

    sequential_layers_pattern = r".*sequential.(\d+).*"
    text_projection_pattern = r".*_projection.(\d+).*"

    for key, value in model_state_dict.items():
        # check if key should be ignored in mapping - if so map it to a key name that we'll filter out at the end
        for key_to_ignore in CLAP_KEYS_TO_IGNORE:
            if key_to_ignore in key:
                key = "spectrogram"

        # check if any key needs to be modified
        for key_to_modify, new_key in CLAP_KEYS_TO_MODIFY_MAPPING.items():
            if key_to_modify in key:
                key = key.replace(key_to_modify, new_key)

        if re.match(sequential_layers_pattern, key):
            # replace sequential layers with list
            sequential_layer = re.match(sequential_layers_pattern, key).group(1)

            key = key.replace(f"sequential.{sequential_layer}.", f"layers.{int(sequential_layer) // 3}.linear.")
        elif re.match(text_projection_pattern, key):
            projecton_layer = int(re.match(text_projection_pattern, key).group(1))

            # Because in CLAP they use `nn.Sequential`...
            transformers_projection_layer = 1 if projecton_layer == 0 else 2

            key = key.replace(f"_projection.{projecton_layer}.", f"_projection.linear{transformers_projection_layer}.")

        # BUGFIX: the original condition was `if "audio" and "qkv" in key`, where the
        # string literal "audio" is always truthy, so ANY key containing "qkv" was
        # split. Only fused audio-branch attention weights should be split.
        if "audio" in key and "qkv" in key:
            # split qkv into query key and value
            mixed_qkv = value
            qkv_dim = mixed_qkv.size(0) // 3

            query_layer = mixed_qkv[:qkv_dim]
            key_layer = mixed_qkv[qkv_dim : qkv_dim * 2]
            value_layer = mixed_qkv[qkv_dim * 2 :]

            new_checkpoint[key.replace("qkv", "query")] = query_layer
            new_checkpoint[key.replace("qkv", "key")] = key_layer
            new_checkpoint[key.replace("qkv", "value")] = value_layer
        elif key != "spectrogram":
            # Ignored keys were renamed to the sentinel "spectrogram" above.
            new_checkpoint[key] = value

    return new_checkpoint
690
+
691
+
692
def create_transformers_vocoder_config(original_config):
    """
    Creates a config for transformers SpeechT5HifiGan based on the config of the vocoder model.
    """
    vocoder_params = original_config["model"]["params"]["vocoder_config"]["params"]

    return {
        "model_in_dim": vocoder_params["num_mels"],
        "sampling_rate": vocoder_params["sampling_rate"],
        "upsample_initial_channel": vocoder_params["upsample_initial_channel"],
        "upsample_rates": list(vocoder_params["upsample_rates"]),
        "upsample_kernel_sizes": list(vocoder_params["upsample_kernel_sizes"]),
        "resblock_kernel_sizes": list(vocoder_params["resblock_kernel_sizes"]),
        # Nested sequences are converted to plain lists for JSON-serializable configs.
        "resblock_dilation_sizes": [list(dilations) for dilations in vocoder_params["resblock_dilation_sizes"]],
        # The original vocoder does not normalize its input spectrogram.
        "normalize_before": False,
    }
712
+
713
+
714
def extract_sub_model(checkpoint, key_prefix):
    """
    Takes a state dict and returns the state dict for a particular sub-model.
    """
    return {
        key.replace(key_prefix, ""): checkpoint.get(key)
        for key in list(checkpoint.keys())
        if key.startswith(key_prefix)
    }
726
+
727
+
728
def convert_hifigan_checkpoint(checkpoint, config):
    """
    Takes a state dict and config, and returns a converted HiFiGAN vocoder checkpoint.
    """
    # extract state dict for vocoder (sub-model extraction inlined)
    prefix = "first_stage_model.vocoder."
    vocoder_state_dict = {
        key.replace(prefix, ""): checkpoint.get(key) for key in list(checkpoint.keys()) if key.startswith(prefix)
    }

    # fix upsampler keys, everything else is correct already
    for i in range(len(config.upsample_rates)):
        vocoder_state_dict[f"upsampler.{i}.weight"] = vocoder_state_dict.pop(f"ups.{i}.weight")
        vocoder_state_dict[f"upsampler.{i}.bias"] = vocoder_state_dict.pop(f"ups.{i}.bias")

    if not config.normalize_before:
        # if we don't set normalize_before then these variables are unused, so we set them to their initialised values
        vocoder_state_dict["mean"] = torch.zeros(config.model_in_dim)
        vocoder_state_dict["scale"] = torch.ones(config.model_in_dim)

    return vocoder_state_dict
746
+
747
+
748
def convert_projection_checkpoint(checkpoint):
    """
    Convert the AudioLDM2 projection-model weights: the two start/end-of-sequence
    embeddings and the two linear projection layers of the first conditioner.
    """
    conditioner = extract_sub_model(checkpoint, key_prefix="cond_stage_models.0.")

    return {
        # row 0 / row 1 of the learnt SOS / EOS embedding tables
        "sos_embed": conditioner["start_of_sequence_tokens.weight"][0],
        "sos_embed_1": conditioner["start_of_sequence_tokens.weight"][1],
        "eos_embed": conditioner["end_of_sequence_tokens.weight"][0],
        "eos_embed_1": conditioner["end_of_sequence_tokens.weight"][1],
        # per-input-sequence linear projections
        "projection.weight": conditioner["input_sequence_embed_linear.0.weight"],
        "projection.bias": conditioner["input_sequence_embed_linear.0.bias"],
        "projection_1.weight": conditioner["input_sequence_embed_linear.1.weight"],
        "projection_1.bias": conditioner["input_sequence_embed_linear.1.bias"],
    }
765
+
766
+
767
# Adapted from https://github.com/haoheliu/AudioLDM2/blob/81ad2c6ce015c1310387695e2dae975a7d2ed6fd/audioldm2/utils.py#L143
# Fallback configuration used when no `--original_config_file` is supplied.
# Mirrors the base AudioLDM2 training config: diffusion schedule, latent UNet,
# mel-spectrogram VAE, the conditioning stack and the HiFi-GAN vocoder.
DEFAULT_CONFIG = {
    "model": {
        "params": {
            # linear beta schedule for the 1000-step diffusion process
            "linear_start": 0.0015,
            "linear_end": 0.0195,
            "timesteps": 1000,
            "channels": 8,
            "scale_by_std": True,
            # latent-space UNet backbone
            "unet_config": {
                "target": "audioldm2.latent_diffusion.openaimodel.UNetModel",
                "params": {
                    "context_dim": [None, 768, 1024],
                    "in_channels": 8,
                    "out_channels": 8,
                    "model_channels": 128,
                    "attention_resolutions": [8, 4, 2],
                    "num_res_blocks": 2,
                    "channel_mult": [1, 2, 3, 5],
                    "num_head_channels": 32,
                    "transformer_depth": 1,
                },
            },
            # mel-spectrogram VAE (first stage)
            "first_stage_config": {
                "target": "audioldm2.variational_autoencoder.autoencoder.AutoencoderKL",
                "params": {
                    "embed_dim": 8,
                    "ddconfig": {
                        "z_channels": 8,
                        "resolution": 256,
                        "in_channels": 1,
                        "out_ch": 1,
                        "ch": 128,
                        "ch_mult": [1, 2, 4],
                        "num_res_blocks": 2,
                    },
                },
            },
            # AudioMAE-conditioned sequence generator used for cross-attention conditioning
            "cond_stage_config": {
                "crossattn_audiomae_generated": {
                    "target": "audioldm2.latent_diffusion.modules.encoders.modules.SequenceGenAudioMAECond",
                    "params": {
                        "sequence_gen_length": 8,
                        "sequence_input_embed_dim": [512, 1024],
                    },
                }
            },
            # HiFi-GAN vocoder hyper-parameters (mel spectrogram -> waveform)
            "vocoder_config": {
                "target": "audioldm2.first_stage_model.vocoder",
                "params": {
                    "upsample_rates": [5, 4, 2, 2, 2],
                    "upsample_kernel_sizes": [16, 16, 8, 4, 4],
                    "upsample_initial_channel": 1024,
                    "resblock_kernel_sizes": [3, 7, 11],
                    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
                    "num_mels": 64,
                    "sampling_rate": 16000,
                },
            },
        },
    },
}
829
+
830
+
831
def load_pipeline_from_original_AudioLDM2_ckpt(
    checkpoint_path: str,
    original_config_file: str = None,
    image_size: int = 1024,
    prediction_type: str = None,
    extract_ema: bool = False,
    scheduler_type: str = "ddim",
    cross_attention_dim: Union[List, List[List]] = None,
    transformer_layers_per_block: int = None,
    device: str = None,
    from_safetensors: bool = False,
) -> AudioLDM2Pipeline:
    """
    Load an AudioLDM2 pipeline object from a `.ckpt`/`.safetensors` file and (ideally) a `.yaml` config file.

    Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the
    global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is
    recommended that you override the default values and/or supply an `original_config_file` wherever possible.

    Args:
        checkpoint_path (`str`): Path to `.ckpt` file.
        original_config_file (`str`):
            Path to `.yaml` config file corresponding to the original architecture. If `None`, will be automatically
            set to the AudioLDM2 base config.
        image_size (`int`, *optional*, defaults to 1024):
            The image size that the model was trained on.
        prediction_type (`str`, *optional*):
            The prediction type that the model was trained on. If `None`, will be automatically
            inferred by looking for a key in the config. For the default config, the prediction type is `'epsilon'`.
        scheduler_type (`str`, *optional*, defaults to 'ddim'):
            Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm",
            "ddim"]`.
        cross_attention_dim (`list`, *optional*, defaults to `None`):
            The dimension of the cross-attention layers. If `None`, the cross-attention dimension will be
            automatically inferred. Set to `[768, 1024]` for the base model, or `[768, 1024, None]` for the large model.
        transformer_layers_per_block (`int`, *optional*, defaults to `None`):
            The number of transformer layers in each transformer block. If `None`, number of layers will be
            automatically inferred. Set to `1` for the base model, or `2` for the large model.
        extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for
            checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to
            `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for
            inference. Non-EMA weights are usually better to continue fine-tuning.
        device (`str`, *optional*, defaults to `None`):
            The device to use. Pass `None` to determine automatically.
        from_safetensors (`str`, *optional*, defaults to `False`):
            If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.
    return: An AudioLDM2Pipeline object representing the passed-in `.ckpt`/`.safetensors` file.
    """

    # 1. Load the raw checkpoint into a flat state dict.
    if from_safetensors:
        if not is_safetensors_available():
            raise ValueError(BACKENDS_MAPPING["safetensors"][1])

        from safetensors import safe_open

        checkpoint = {}
        with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
            for key in f.keys():
                checkpoint[key] = f.get_tensor(key)
    else:
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
            checkpoint = torch.load(checkpoint_path, map_location=device)
        else:
            checkpoint = torch.load(checkpoint_path, map_location=device)

    # Lightning-style checkpoints nest the weights under "state_dict".
    if "state_dict" in checkpoint:
        checkpoint = checkpoint["state_dict"]

    # 2. Resolve the model configuration, applying any caller overrides.
    if original_config_file is None:
        original_config = DEFAULT_CONFIG
    else:
        original_config = yaml.safe_load(original_config_file)

    if image_size is not None:
        original_config["model"]["params"]["unet_config"]["params"]["image_size"] = image_size

    if cross_attention_dim is not None:
        original_config["model"]["params"]["unet_config"]["params"]["context_dim"] = cross_attention_dim

    if transformer_layers_per_block is not None:
        original_config["model"]["params"]["unet_config"]["params"]["transformer_depth"] = transformer_layers_per_block

    # v-parameterized models predict velocity; everything else defaults to epsilon.
    if (
        "parameterization" in original_config["model"]["params"]
        and original_config["model"]["params"]["parameterization"] == "v"
    ):
        if prediction_type is None:
            prediction_type = "v_prediction"
    else:
        if prediction_type is None:
            prediction_type = "epsilon"

    # 3. Build the noise scheduler from the original beta schedule.
    num_train_timesteps = original_config["model"]["params"]["timesteps"]
    beta_start = original_config["model"]["params"]["linear_start"]
    beta_end = original_config["model"]["params"]["linear_end"]

    scheduler = DDIMScheduler(
        beta_end=beta_end,
        beta_schedule="scaled_linear",
        beta_start=beta_start,
        num_train_timesteps=num_train_timesteps,
        steps_offset=1,
        clip_sample=False,
        set_alpha_to_one=False,
        prediction_type=prediction_type,
    )
    # make sure scheduler works correctly with DDIM
    scheduler.register_to_config(clip_sample=False)

    # Optionally swap the DDIM scheduler for another type with the same config.
    if scheduler_type == "pndm":
        config = dict(scheduler.config)
        config["skip_prk_steps"] = True
        scheduler = PNDMScheduler.from_config(config)
    elif scheduler_type == "lms":
        scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
    elif scheduler_type == "heun":
        scheduler = HeunDiscreteScheduler.from_config(scheduler.config)
    elif scheduler_type == "euler":
        scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
    elif scheduler_type == "euler-ancestral":
        scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
    elif scheduler_type == "dpm":
        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
    elif scheduler_type == "ddim":
        scheduler = scheduler
    else:
        raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")

    # 4. Convert the UNet2DModel
    unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
    unet = AudioLDM2UNet2DConditionModel(**unet_config)

    converted_unet_checkpoint = convert_ldm_unet_checkpoint(
        checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema
    )

    unet.load_state_dict(converted_unet_checkpoint)

    # 5. Convert the VAE model
    vae_config = create_vae_diffusers_config(original_config, checkpoint=checkpoint, image_size=image_size)
    converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)

    vae = AutoencoderKL(**vae_config)
    vae.load_state_dict(converted_vae_checkpoint)

    # 6. Convert the joint audio-text encoding (CLAP) model.
    # AudioLDM2 uses a larger audio tower than the stock laion/clap-htsat-unfused
    # checkpoint, so the audio config is widened before loading.
    clap_config = ClapConfig.from_pretrained("laion/clap-htsat-unfused")
    clap_config.audio_config.update(
        {
            "patch_embeds_hidden_size": 128,
            "hidden_size": 1024,
            "depths": [2, 2, 12, 2],
        }
    )
    # AudioLDM2 uses the same tokenizer and feature extractor as the original CLAP model
    clap_tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
    clap_feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")

    converted_clap_model = convert_open_clap_checkpoint(checkpoint)
    clap_model = ClapModel(clap_config)

    missing_keys, unexpected_keys = clap_model.load_state_dict(converted_clap_model, strict=False)
    # we expect not to have token_type_ids in our original state dict so let's ignore them
    missing_keys = list(set(missing_keys) - set(CLAP_EXPECTED_MISSING_KEYS))

    if len(unexpected_keys) > 0:
        raise ValueError(f"Unexpected keys when loading CLAP model: {unexpected_keys}")

    if len(missing_keys) > 0:
        raise ValueError(f"Missing keys when loading CLAP model: {missing_keys}")

    # 7. Convert the vocoder model
    vocoder_config = create_transformers_vocoder_config(original_config)
    vocoder_config = SpeechT5HifiGanConfig(**vocoder_config)
    converted_vocoder_checkpoint = convert_hifigan_checkpoint(checkpoint, vocoder_config)

    vocoder = SpeechT5HifiGan(vocoder_config)
    vocoder.load_state_dict(converted_vocoder_checkpoint)

    # 8. Convert the Flan-T5 encoder model: AudioLDM2 uses the same configuration and tokenizer as the original Flan-T5 large model
    t5_config = T5Config.from_pretrained("google/flan-t5-large")
    converted_t5_checkpoint = extract_sub_model(checkpoint, key_prefix="cond_stage_models.1.model.")

    t5_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
    # hard-coded in the original implementation (i.e. not retrievable from the config)
    t5_tokenizer.model_max_length = 128
    t5_model = T5EncoderModel(t5_config)
    t5_model.load_state_dict(converted_t5_checkpoint)

    # 9. Convert the GPT2 encoder model: AudioLDM2 uses the same configuration as the original GPT2 base model
    gpt2_config = GPT2Config.from_pretrained("gpt2")
    gpt2_model = GPT2Model(gpt2_config)
    # the number of generated conditioning tokens comes from the original config
    gpt2_model.config.max_new_tokens = original_config["model"]["params"]["cond_stage_config"][
        "crossattn_audiomae_generated"
    ]["params"]["sequence_gen_length"]

    converted_gpt2_checkpoint = extract_sub_model(checkpoint, key_prefix="cond_stage_models.0.model.")
    gpt2_model.load_state_dict(converted_gpt2_checkpoint)

    # 10. Convert the extra embedding / projection layers
    projection_model = AudioLDM2ProjectionModel(clap_config.projection_dim, t5_config.d_model, gpt2_config.n_embd)

    converted_projection_checkpoint = convert_projection_checkpoint(checkpoint)
    projection_model.load_state_dict(converted_projection_checkpoint)

    # 11. Instantiate the diffusers pipeline
    pipe = AudioLDM2Pipeline(
        vae=vae,
        text_encoder=clap_model,
        text_encoder_2=t5_model,
        projection_model=projection_model,
        language_model=gpt2_model,
        tokenizer=clap_tokenizer,
        tokenizer_2=t5_tokenizer,
        feature_extractor=clap_feature_extractor,
        unet=unet,
        scheduler=scheduler,
        vocoder=vocoder,
    )

    return pipe
1053
+
1054
+
1055
+ if __name__ == "__main__":
1056
+ parser = argparse.ArgumentParser()
1057
+
1058
+ parser.add_argument(
1059
+ "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
1060
+ )
1061
+ parser.add_argument(
1062
+ "--original_config_file",
1063
+ default=None,
1064
+ type=str,
1065
+ help="The YAML config file corresponding to the original architecture.",
1066
+ )
1067
+ parser.add_argument(
1068
+ "--cross_attention_dim",
1069
+ default=None,
1070
+ type=int,
1071
+ nargs="+",
1072
+ help="The dimension of the cross-attention layers. If `None`, the cross-attention dimension will be "
1073
+ "automatically inferred. Set to `768+1024` for the base model, or `768+1024+640` for the large model",
1074
+ )
1075
+ parser.add_argument(
1076
+ "--transformer_layers_per_block",
1077
+ default=None,
1078
+ type=int,
1079
+ help="The number of transformer layers in each transformer block. If `None`, number of layers will be "
1080
+ "automatically inferred. Set to `1` for the base model, or `2` for the large model.",
1081
+ )
1082
+ parser.add_argument(
1083
+ "--scheduler_type",
1084
+ default="ddim",
1085
+ type=str,
1086
+ help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancestral', 'dpm']",
1087
+ )
1088
+ parser.add_argument(
1089
+ "--image_size",
1090
+ default=1048,
1091
+ type=int,
1092
+ help="The image size that the model was trained on.",
1093
+ )
1094
+ parser.add_argument(
1095
+ "--prediction_type",
1096
+ default=None,
1097
+ type=str,
1098
+ help=("The prediction type that the model was trained on."),
1099
+ )
1100
+ parser.add_argument(
1101
+ "--extract_ema",
1102
+ action="store_true",
1103
+ help=(
1104
+ "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
1105
+ " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
1106
+ " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
1107
+ ),
1108
+ )
1109
+ parser.add_argument(
1110
+ "--from_safetensors",
1111
+ action="store_true",
1112
+ help="If `--checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.",
1113
+ )
1114
+ parser.add_argument(
1115
+ "--to_safetensors",
1116
+ action="store_true",
1117
+ help="Whether to store pipeline in safetensors format or not.",
1118
+ )
1119
+ parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
1120
+ parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")
1121
+ args = parser.parse_args()
1122
+
1123
+ pipe = load_pipeline_from_original_AudioLDM2_ckpt(
1124
+ checkpoint_path=args.checkpoint_path,
1125
+ original_config_file=args.original_config_file,
1126
+ image_size=args.image_size,
1127
+ prediction_type=args.prediction_type,
1128
+ extract_ema=args.extract_ema,
1129
+ scheduler_type=args.scheduler_type,
1130
+ cross_attention_dim=args.cross_attention_dim,
1131
+ transformer_layers_per_block=args.transformer_layers_per_block,
1132
+ from_safetensors=args.from_safetensors,
1133
+ device=args.device,
1134
+ )
1135
+ pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)
diffusers/scripts/convert_original_musicldm_to_diffusers.py ADDED
@@ -0,0 +1,1056 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2025 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Conversion script for the MusicLDM checkpoints."""
16
+
17
+ import argparse
18
+ import re
19
+
20
+ import torch
21
+ import yaml
22
+ from transformers import (
23
+ AutoFeatureExtractor,
24
+ AutoTokenizer,
25
+ ClapConfig,
26
+ ClapModel,
27
+ SpeechT5HifiGan,
28
+ SpeechT5HifiGanConfig,
29
+ )
30
+
31
+ from diffusers import (
32
+ AutoencoderKL,
33
+ DDIMScheduler,
34
+ DPMSolverMultistepScheduler,
35
+ EulerAncestralDiscreteScheduler,
36
+ EulerDiscreteScheduler,
37
+ HeunDiscreteScheduler,
38
+ LMSDiscreteScheduler,
39
+ MusicLDMPipeline,
40
+ PNDMScheduler,
41
+ UNet2DConditionModel,
42
+ )
43
+
44
+
45
+ # Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.shave_segments
46
def shave_segments(path, n_shave_prefix_segments=1):
    """
    Removes segments. Positive values shave the first segments, negative shave the last segments.
    """
    segments = path.split(".")
    if n_shave_prefix_segments >= 0:
        kept = segments[n_shave_prefix_segments:]
    else:
        kept = segments[:n_shave_prefix_segments]
    return ".".join(kept)
54
+
55
+
56
+ # Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_resnet_paths
57
def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
    """
    Updates paths inside resnets to the new naming scheme (local renaming)
    """
    # ordered (old, new) substring substitutions from the LDM to the diffusers layout
    substitutions = (
        ("in_layers.0", "norm1"),
        ("in_layers.2", "conv1"),
        ("out_layers.0", "norm2"),
        ("out_layers.3", "conv2"),
        ("emb_layers.1", "time_emb_proj"),
        ("skip_connection", "conv_shortcut"),
    )

    mapping = []
    for old_item in old_list:
        renamed = old_item
        for src, dst in substitutions:
            renamed = renamed.replace(src, dst)
        renamed = shave_segments(renamed, n_shave_prefix_segments=n_shave_prefix_segments)
        mapping.append({"old": old_item, "new": renamed})

    return mapping
77
+
78
+
79
+ # Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_vae_resnet_paths
80
def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
    """
    Updates paths inside resnets to the new naming scheme (local renaming)
    """
    mapping = []
    for old_item in old_list:
        # the VAE resnets only differ in the shortcut naming
        renamed = shave_segments(
            old_item.replace("nin_shortcut", "conv_shortcut"),
            n_shave_prefix_segments=n_shave_prefix_segments,
        )
        mapping.append({"old": old_item, "new": renamed})

    return mapping
94
+
95
+
96
+ # Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_attention_paths
97
def renew_attention_paths(old_list):
    """
    Updates paths inside attentions to the new naming scheme (local renaming)
    """
    # The UNet attention keys already match the diffusers naming, so the
    # mapping is currently the identity; it is kept for structural symmetry
    # with the other renew_* helpers.
    return [{"old": item, "new": item} for item in old_list]
116
+
117
+
118
def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
    """
    Updates paths inside attentions to the new naming scheme (local renaming)
    """
    # ordered (old, new) substring substitutions: group norm, q/k/v projections
    # and the output projection all have new names in diffusers
    substitutions = (
        ("norm.weight", "group_norm.weight"),
        ("norm.bias", "group_norm.bias"),
        ("q.weight", "to_q.weight"),
        ("q.bias", "to_q.bias"),
        ("k.weight", "to_k.weight"),
        ("k.bias", "to_k.bias"),
        ("v.weight", "to_v.weight"),
        ("v.bias", "to_v.bias"),
        ("proj_out.weight", "to_out.0.weight"),
        ("proj_out.bias", "to_out.0.bias"),
    )

    mapping = []
    for old_item in old_list:
        renamed = old_item
        for src, dst in substitutions:
            renamed = renamed.replace(src, dst)
        renamed = shave_segments(renamed, n_shave_prefix_segments=n_shave_prefix_segments)
        mapping.append({"old": old_item, "new": renamed})

    return mapping
146
+
147
+
148
# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.assign_to_checkpoint
def assign_to_checkpoint(
    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
):
    """
    This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
    attention layers, and takes into account additional replacements that may arise.

    Assigns the weights to the new checkpoint.
    """
    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."

    # Splits the attention layers into three variables.
    if attention_paths_to_split is not None:
        for path, path_map in attention_paths_to_split.items():
            old_tensor = old_checkpoint[path]
            # the fused qkv tensor has 3 * channels rows
            channels = old_tensor.shape[0] // 3

            # 3-D tensors (conv weights) flatten to (-1, channels); 1-D biases to (-1,)
            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)

            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3

            # regroup per head so that q/k/v can be sliced apart along dim 1
            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
            query, key, value = old_tensor.split(channels // num_heads, dim=1)

            checkpoint[path_map["query"]] = query.reshape(target_shape)
            checkpoint[path_map["key"]] = key.reshape(target_shape)
            checkpoint[path_map["value"]] = value.reshape(target_shape)

    for path in paths:
        new_path = path["new"]

        # These have already been assigned
        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
            continue

        # Global renaming happens here
        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")

        if additional_replacements is not None:
            for replacement in additional_replacements:
                new_path = new_path.replace(replacement["old"], replacement["new"])

        # proj_attn.weight has to be converted from conv 1D to linear
        if "proj_attn.weight" in new_path:
            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
        else:
            checkpoint[new_path] = old_checkpoint[path["old"]]
198
+
199
+
200
def conv_attn_to_linear(checkpoint):
    """Squeeze 1x1-conv attention weights down to 2-D linear weights, in place."""
    qkv_suffixes = ("to_q.weight", "to_k.weight", "to_v.weight")
    proj_suffix = "to_out.0.weight"
    for name in list(checkpoint):
        parts = name.split(".")
        is_qkv = ".".join(parts[-2:]) in qkv_suffixes
        is_proj = ".".join(parts[-3:]) == proj_suffix
        if is_qkv or is_proj:
            tensor = checkpoint[name]
            if tensor.ndim > 2:
                checkpoint[name] = tensor.squeeze()
208
+
209
+
210
def create_unet_diffusers_config(original_config, image_size: int):
    """
    Creates a UNet config for diffusers based on the config of the original MusicLDM model.
    """
    unet_params = original_config["model"]["params"]["unet_config"]["params"]
    vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]

    base_channels = unet_params["model_channels"]
    block_out_channels = [base_channels * factor for factor in unet_params["channel_mult"]]
    attention_resolutions = unet_params["attention_resolutions"]

    # Walk down the UNet: the resolution doubles after every level except the last,
    # and levels whose resolution appears in `attention_resolutions` get cross-attention.
    down_block_types = []
    resolution = 1
    for level in range(len(block_out_channels)):
        if resolution in attention_resolutions:
            down_block_types.append("CrossAttnDownBlock2D")
        else:
            down_block_types.append("DownBlock2D")
        if level != len(block_out_channels) - 1:
            resolution *= 2

    # Walk back up, mirroring the resolutions visited on the way down.
    up_block_types = []
    for _ in range(len(block_out_channels)):
        if resolution in attention_resolutions:
            up_block_types.append("CrossAttnUpBlock2D")
        else:
            up_block_types.append("UpBlock2D")
        resolution //= 2

    vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1)

    if "cross_attention_dim" in unet_params:
        cross_attention_dim = unet_params["cross_attention_dim"]
    else:
        cross_attention_dim = block_out_channels

    # FiLM conditioning maps onto the diffusers "simple_projection" class embedding.
    has_film = "extra_film_condition_dim" in unet_params
    class_embed_type = "simple_projection" if has_film else None
    projection_class_embeddings_input_dim = unet_params["extra_film_condition_dim"] if has_film else None
    class_embeddings_concat = unet_params["extra_film_use_concat"] if "extra_film_use_concat" in unet_params else None

    return {
        "sample_size": image_size // vae_scale_factor,
        "in_channels": unet_params["in_channels"],
        "out_channels": unet_params["out_channels"],
        "down_block_types": tuple(down_block_types),
        "up_block_types": tuple(up_block_types),
        "block_out_channels": tuple(block_out_channels),
        "layers_per_block": unet_params["num_res_blocks"],
        "cross_attention_dim": cross_attention_dim,
        "class_embed_type": class_embed_type,
        "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim,
        "class_embeddings_concat": class_embeddings_concat,
    }
260
+
261
+
262
# Adapted from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_vae_diffusers_config
def create_vae_diffusers_config(original_config, checkpoint, image_size: int):
    """
    Creates a VAE config for diffusers based on the config of the original MusicLDM model. Compared to the original
    Stable Diffusion conversion, this function passes a *learnt* VAE scaling factor to the diffusers VAE.
    """
    vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
    # embed_dim is read for parity with the original script but not used in the diffusers config
    _ = original_config["model"]["params"]["first_stage_config"]["params"]["embed_dim"]

    block_out_channels = [vae_params["ch"] * factor for factor in vae_params["ch_mult"]]
    num_blocks = len(block_out_channels)

    # checkpoints trained with scale_by_std carry their learnt scaling factor;
    # otherwise fall back to the standard Stable Diffusion value
    if "scale_by_std" in original_config["model"]["params"]:
        scaling_factor = checkpoint["scale_factor"]
    else:
        scaling_factor = 0.18215

    return {
        "sample_size": image_size,
        "in_channels": vae_params["in_channels"],
        "out_channels": vae_params["out_ch"],
        "down_block_types": ("DownEncoderBlock2D",) * num_blocks,
        "up_block_types": ("UpDecoderBlock2D",) * num_blocks,
        "block_out_channels": tuple(block_out_channels),
        "latent_channels": vae_params["z_channels"],
        "layers_per_block": vae_params["num_res_blocks"],
        "scaling_factor": float(scaling_factor),
    }
289
+
290
+
291
# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_diffusers_schedular
def create_diffusers_schedular(original_config):
    """Build a DDIM scheduler from the beta-schedule hyper-parameters of the original config.

    NOTE: the name keeps the upstream misspelling ("schedular") so the copy stays
    consistent with the function it was copied from.
    """
    schedular = DDIMScheduler(
        num_train_timesteps=original_config["model"]["params"]["timesteps"],
        beta_start=original_config["model"]["params"]["linear_start"],
        beta_end=original_config["model"]["params"]["linear_end"],
        beta_schedule="scaled_linear",
    )
    return schedular
300
+
301
+
302
def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False):
    """
    Takes a state dict and a config, and returns a converted checkpoint. Compared to the original Stable Diffusion
    conversion, this function additionally converts the learnt film embedding linear layer.
    """

    # extract state_dict for UNet
    unet_state_dict = {}
    keys = list(checkpoint.keys())

    unet_key = "model.diffusion_model."
    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
    if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
        print(f"Checkpoint {path} has both EMA and non-EMA weights.")
        print(
            "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
            " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
        )
        for key in keys:
            if key.startswith("model.diffusion_model"):
                # EMA weights are stored flat: the dots of the original parameter name are removed
                # after the leading "model." segment, and prefixed with "model_ema.".
                flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
    else:
        if sum(k.startswith("model_ema") for k in keys) > 100:
            print(
                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
            )

        for key in keys:
            if key.startswith(unet_key):
                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)

    new_checkpoint = {}

    # Time-embedding MLP: two linear layers map directly onto diffusers' names.
    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]

    # The learnt FiLM conditioning layer is mapped to diffusers' class embedding slot.
    new_checkpoint["class_embedding.weight"] = unet_state_dict["film_emb.weight"]
    new_checkpoint["class_embedding.bias"] = unet_state_dict["film_emb.bias"]

    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]

    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]

    # Retrieves the keys for the input blocks only
    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
    input_blocks = {
        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
        for layer_id in range(num_input_blocks)
    }

    # Retrieves the keys for the middle blocks only
    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
    middle_blocks = {
        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
        for layer_id in range(num_middle_blocks)
    }

    # Retrieves the keys for the output blocks only
    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
    output_blocks = {
        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
        for layer_id in range(num_output_blocks)
    }

    # Block 0 is the stem conv handled above, so the down blocks start at index 1.
    for i in range(1, num_input_blocks):
        # Each diffusers down block contains `layers_per_block` resnets plus one downsampler.
        block_id = (i - 1) // (config["layers_per_block"] + 1)
        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)

        resnets = [
            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
        ]
        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]

        # `op` is the original name of the strided downsampling convolution.
        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
                f"input_blocks.{i}.0.op.weight"
            )
            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
                f"input_blocks.{i}.0.op.bias"
            )

        paths = renew_resnet_paths(resnets)
        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
        assign_to_checkpoint(
            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
        )

        if len(attentions):
            paths = renew_attention_paths(attentions)
            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
            assign_to_checkpoint(
                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
            )

    # The middle block is always resnet -> attention -> resnet.
    resnet_0 = middle_blocks[0]
    attentions = middle_blocks[1]
    resnet_1 = middle_blocks[2]

    resnet_0_paths = renew_resnet_paths(resnet_0)
    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)

    resnet_1_paths = renew_resnet_paths(resnet_1)
    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)

    attentions_paths = renew_attention_paths(attentions)
    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
    assign_to_checkpoint(
        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
    )

    for i in range(num_output_blocks):
        block_id = i // (config["layers_per_block"] + 1)
        layer_in_block_id = i % (config["layers_per_block"] + 1)
        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
        output_block_list = {}

        # Group the remaining key suffixes by their sub-layer index within this output block.
        for layer in output_block_layers:
            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
            if layer_id in output_block_list:
                output_block_list[layer_id].append(layer_name)
            else:
                output_block_list[layer_id] = [layer_name]

        if len(output_block_list) > 1:
            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]

            resnet_0_paths = renew_resnet_paths(resnets)
            paths = renew_resnet_paths(resnets)

            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
            assign_to_checkpoint(
                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
            )

            # Sorting makes the upsampler signature ["conv.bias", "conv.weight"] deterministic.
            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
            if ["conv.bias", "conv.weight"] in output_block_list.values():
                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
                    f"output_blocks.{i}.{index}.conv.weight"
                ]
                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
                    f"output_blocks.{i}.{index}.conv.bias"
                ]

                # Clear attentions as they have been attributed above.
                if len(attentions) == 2:
                    attentions = []

            if len(attentions):
                paths = renew_attention_paths(attentions)
                meta_path = {
                    "old": f"output_blocks.{i}.1",
                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
                }
                assign_to_checkpoint(
                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
                )
        else:
            # Block holds only resnet weights; rename them one by one.
            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
            for path in resnet_0_paths:
                old_path = ".".join(["output_blocks", str(i), path["old"]])
                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])

                new_checkpoint[new_path] = unet_state_dict[old_path]

    return new_checkpoint
477
+
478
+
479
# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_ldm_vae_checkpoint
def convert_ldm_vae_checkpoint(checkpoint, config):
    """Rename the original LDM `first_stage_model.` VAE weights to the diffusers AutoencoderKL layout."""
    # extract state dict for VAE
    vae_state_dict = {}
    vae_key = "first_stage_model."
    keys = list(checkpoint.keys())
    for key in keys:
        if key.startswith(vae_key):
            vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)

    new_checkpoint = {}

    # Stem / head layers map one-to-one; only the norm layer is renamed.
    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]

    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]

    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]

    # Retrieves the keys for the encoder down blocks only
    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
    down_blocks = {
        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
    }

    # Retrieves the keys for the decoder up blocks only
    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
    up_blocks = {
        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
    }

    for i in range(num_down_blocks):
        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]

        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
                f"encoder.down.{i}.downsample.conv.weight"
            )
            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
                f"encoder.down.{i}.downsample.conv.bias"
            )

        paths = renew_vae_resnet_paths(resnets)
        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)

    # Encoder mid block: two resnets (1-indexed in the original) followed by one attention.
    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
    num_mid_res_blocks = 2
    for i in range(1, num_mid_res_blocks + 1):
        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]

        paths = renew_vae_resnet_paths(resnets)
        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)

    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
    paths = renew_vae_attention_paths(mid_attentions)
    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
    conv_attn_to_linear(new_checkpoint)

    for i in range(num_up_blocks):
        # The decoder's up blocks are stored in reverse order relative to diffusers.
        block_id = num_up_blocks - 1 - i
        resnets = [
            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
        ]

        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
                f"decoder.up.{block_id}.upsample.conv.weight"
            ]
            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
                f"decoder.up.{block_id}.upsample.conv.bias"
            ]

        paths = renew_vae_resnet_paths(resnets)
        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)

    # Decoder mid block: same resnet/attention structure as the encoder's.
    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
    num_mid_res_blocks = 2
    for i in range(1, num_mid_res_blocks + 1):
        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]

        paths = renew_vae_resnet_paths(resnets)
        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)

    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
    paths = renew_vae_attention_paths(mid_attentions)
    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
    conv_attn_to_linear(new_checkpoint)
    return new_checkpoint
585
+
586
+
587
# Mapping from original CLAP checkpoint key fragments to their transformers equivalents.
CLAP_KEYS_TO_MODIFY_MAPPING = {
    "text_branch": "text_model",
    "audio_branch": "audio_model.audio_encoder",
    "attn": "attention.self",
    "self.proj": "output.dense",
    "attention.self_mask": "attn_mask",
    "mlp.fc1": "intermediate.dense",
    "mlp.fc2": "output.dense",
    "norm1": "layernorm_before",
    "norm2": "layernorm_after",
    "bn0": "batch_norm",
}

# Key fragments with no counterpart in the transformers ClapModel; weights containing any of
# these substrings are dropped during conversion.
CLAP_KEYS_TO_IGNORE = [
    "text_transform",
    "audio_transform",
    "stft",
    "logmel_extractor",
    "tscam_conv",
    "head",
    "attn_mask",
]

# Keys that are legitimately absent from the original checkpoint when loading into ClapModel.
CLAP_EXPECTED_MISSING_KEYS = ["text_model.embeddings.token_type_ids"]


def convert_open_clap_checkpoint(checkpoint):
    """
    Takes a state dict and returns a converted CLAP checkpoint.

    Only the `cond_stage_model.model.` weights are kept. Keys are renamed to the transformers
    `ClapModel` layout, keys listed in `CLAP_KEYS_TO_IGNORE` are discarded, and fused qkv
    projections in the audio tower are split into separate query/key/value tensors.
    """
    # extract state dict for CLAP text embedding model, discarding the audio component
    model_state_dict = {}
    model_key = "cond_stage_model.model."
    keys = list(checkpoint.keys())
    for key in keys:
        if key.startswith(model_key):
            model_state_dict[key.replace(model_key, "")] = checkpoint.get(key)

    new_checkpoint = {}

    sequential_layers_pattern = r".*sequential.(\d+).*"
    text_projection_pattern = r".*_projection.(\d+).*"

    for key, value in model_state_dict.items():
        # check if key should be ignored in mapping - if so map it to a key name that we'll filter out at the end
        for key_to_ignore in CLAP_KEYS_TO_IGNORE:
            if key_to_ignore in key:
                key = "spectrogram"

        # check if any key needs to be modified
        for key_to_modify, new_key in CLAP_KEYS_TO_MODIFY_MAPPING.items():
            if key_to_modify in key:
                key = key.replace(key_to_modify, new_key)

        if re.match(sequential_layers_pattern, key):
            # replace sequential layers with list (three original entries collapse to one layer)
            sequential_layer = re.match(sequential_layers_pattern, key).group(1)

            key = key.replace(f"sequential.{sequential_layer}.", f"layers.{int(sequential_layer) // 3}.linear.")
        elif re.match(text_projection_pattern, key):
            projecton_layer = int(re.match(text_projection_pattern, key).group(1))

            # Because in CLAP they use `nn.Sequential`...
            transformers_projection_layer = 1 if projecton_layer == 0 else 2

            key = key.replace(f"_projection.{projecton_layer}.", f"_projection.linear{transformers_projection_layer}.")

        # Bug fix: the original condition was `if "audio" and "qkv" in key`, where the non-empty
        # literal "audio" is always truthy, so the audio-tower check was a no-op and ANY qkv key
        # would have been split. Test both substrings explicitly so only audio qkv weights split.
        if "audio" in key and "qkv" in key:
            # split qkv into query key and value
            mixed_qkv = value
            qkv_dim = mixed_qkv.size(0) // 3

            query_layer = mixed_qkv[:qkv_dim]
            key_layer = mixed_qkv[qkv_dim : qkv_dim * 2]
            value_layer = mixed_qkv[qkv_dim * 2 :]

            new_checkpoint[key.replace("qkv", "query")] = query_layer
            new_checkpoint[key.replace("qkv", "key")] = key_layer
            new_checkpoint[key.replace("qkv", "value")] = value_layer
        elif key != "spectrogram":
            new_checkpoint[key] = value

    return new_checkpoint
670
+
671
+
672
def create_transformers_vocoder_config(original_config):
    """
    Creates a config for transformers SpeechT5HifiGan based on the config of the vocoder model.
    """
    params = original_config["model"]["params"]["vocoder_config"]["params"]

    # Nested dilation sizes are copied list-by-list so the result is independent of the input.
    dilation_sizes = [list(dilations) for dilations in params["resblock_dilation_sizes"]]

    return {
        "model_in_dim": params["num_mels"],
        "sampling_rate": params["sampling_rate"],
        "upsample_initial_channel": params["upsample_initial_channel"],
        "upsample_rates": list(params["upsample_rates"]),
        "upsample_kernel_sizes": list(params["upsample_kernel_sizes"]),
        "resblock_kernel_sizes": list(params["resblock_kernel_sizes"]),
        "resblock_dilation_sizes": dilation_sizes,
        "normalize_before": False,
    }
692
+
693
+
694
def convert_hifigan_checkpoint(checkpoint, config):
    """
    Takes a state dict and config, and returns a converted HiFiGAN vocoder checkpoint.
    """
    # Keep only the vocoder weights, with the pipeline prefix stripped.
    prefix = "first_stage_model.vocoder."
    vocoder_state_dict = {
        name.replace(prefix, ""): checkpoint.get(name) for name in list(checkpoint.keys()) if name.startswith(prefix)
    }

    # Rename the upsampler convolutions; every other key already matches the transformers layout.
    for idx in range(len(config.upsample_rates)):
        vocoder_state_dict[f"upsampler.{idx}.weight"] = vocoder_state_dict.pop(f"ups.{idx}.weight")
        vocoder_state_dict[f"upsampler.{idx}.bias"] = vocoder_state_dict.pop(f"ups.{idx}.bias")

    if not config.normalize_before:
        # With normalize_before disabled these buffers are unused, so supply identity values.
        vocoder_state_dict["mean"] = torch.zeros(config.model_in_dim)
        vocoder_state_dict["scale"] = torch.ones(config.model_in_dim)

    return vocoder_state_dict
717
+
718
+
719
# Adapted from https://huggingface.co/spaces/haoheliu/MusicLDM-text-to-audio-generation/blob/84a0384742a22bd80c44e903e241f0623e874f1d/MusicLDM/utils.py#L72-L73
# Fallback original config used when no `--original_config_file` is supplied. Mirrors the
# MusicLDM training YAML: diffusion schedule, UNet, VAE, and HiFi-GAN vocoder hyper-parameters.
DEFAULT_CONFIG = {
    "model": {
        "params": {
            # Noise-schedule endpoints and number of training timesteps (consumed by the DDIM scheduler).
            "linear_start": 0.0015,
            "linear_end": 0.0195,
            "timesteps": 1000,
            "channels": 8,
            # When True, the VAE scaling factor is read from the checkpoint's `scale_factor` key.
            "scale_by_std": True,
            "unet_config": {
                "target": "MusicLDM.latent_diffusion.openaimodel.UNetModel",
                "params": {
                    # FiLM conditioning dimension (mapped to diffusers' class embedding).
                    "extra_film_condition_dim": 512,
                    "extra_film_use_concat": True,
                    "in_channels": 8,
                    "out_channels": 8,
                    "model_channels": 128,
                    "attention_resolutions": [8, 4, 2],
                    "num_res_blocks": 2,
                    "channel_mult": [1, 2, 3, 5],
                    "num_head_channels": 32,
                },
            },
            "first_stage_config": {
                "target": "MusicLDM.variational_autoencoder.autoencoder.AutoencoderKL",
                "params": {
                    "embed_dim": 8,
                    "ddconfig": {
                        "z_channels": 8,
                        "resolution": 256,
                        "in_channels": 1,
                        "out_ch": 1,
                        "ch": 128,
                        "ch_mult": [1, 2, 4],
                        "num_res_blocks": 2,
                    },
                },
            },
            "vocoder_config": {
                "target": "MusicLDM.first_stage_model.vocoder",
                "params": {
                    # HiFi-GAN upsampling stack (see create_transformers_vocoder_config).
                    "upsample_rates": [5, 4, 2, 2, 2],
                    "upsample_kernel_sizes": [16, 16, 8, 4, 4],
                    "upsample_initial_channel": 1024,
                    "resblock_kernel_sizes": [3, 7, 11],
                    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
                    "num_mels": 64,
                    "sampling_rate": 16000,
                },
            },
        },
    },
}
772
+
773
+
774
def load_pipeline_from_original_MusicLDM_ckpt(
    checkpoint_path: str,
    original_config_file: str = None,
    image_size: int = 1024,
    prediction_type: str = None,
    extract_ema: bool = False,
    scheduler_type: str = "ddim",
    num_in_channels: int = None,
    model_channels: int = None,
    num_head_channels: int = None,
    device: str = None,
    from_safetensors: bool = False,
) -> MusicLDMPipeline:
    """
    Load an MusicLDM pipeline object from a `.ckpt`/`.safetensors` file and (ideally) a `.yaml` config file.

    Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the
    global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is
    recommended that you override the default values and/or supply an `original_config_file` wherever possible.

    Args:
        checkpoint_path (`str`): Path to `.ckpt` file.
        original_config_file (`str`):
            Path to `.yaml` config file corresponding to the original architecture. If `None`, will be automatically
            set to the MusicLDM-s-full-v2 config.
        image_size (`int`, *optional*, defaults to 1024):
            The image size that the model was trained on.
        prediction_type (`str`, *optional*):
            The prediction type that the model was trained on. If `None`, will be automatically
            inferred by looking for a key in the config. For the default config, the prediction type is `'epsilon'`.
        num_in_channels (`int`, *optional*, defaults to None):
            The number of UNet input channels. If `None`, it will be automatically inferred from the config.
        model_channels (`int`, *optional*, defaults to None):
            The number of UNet model channels. If `None`, it will be automatically inferred from the config. Override
            to 128 for the small checkpoints, 192 for the medium checkpoints and 256 for the large.
        num_head_channels (`int`, *optional*, defaults to None):
            The number of UNet head channels. If `None`, it will be automatically inferred from the config. Override
            to 32 for the small and medium checkpoints, and 64 for the large.
        scheduler_type (`str`, *optional*, defaults to 'pndm'):
            Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm",
            "ddim"]`.
        extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for
            checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to
            `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for
            inference. Non-EMA weights are usually better to continue fine-tuning.
        device (`str`, *optional*, defaults to `None`):
            The device to use. Pass `None` to determine automatically.
        from_safetensors (`str`, *optional*, defaults to `False`):
            If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.
    return: An MusicLDMPipeline object representing the passed-in `.ckpt`/`.safetensors` file.
    """
    # --- Load the raw state dict, either via safetensors or torch.load ---
    if from_safetensors:
        from safetensors import safe_open

        checkpoint = {}
        with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
            for key in f.keys():
                checkpoint[key] = f.get_tensor(key)
    else:
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
            checkpoint = torch.load(checkpoint_path, map_location=device)
        else:
            checkpoint = torch.load(checkpoint_path, map_location=device)

    # Lightning checkpoints nest the weights under "state_dict".
    if "state_dict" in checkpoint:
        checkpoint = checkpoint["state_dict"]

    if original_config_file is None:
        original_config = DEFAULT_CONFIG
    else:
        original_config = yaml.safe_load(original_config_file)

    # --- Apply CLI overrides onto the original config before any conversion ---
    if num_in_channels is not None:
        original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels

    if model_channels is not None:
        original_config["model"]["params"]["unet_config"]["params"]["model_channels"] = model_channels

    if num_head_channels is not None:
        original_config["model"]["params"]["unet_config"]["params"]["num_head_channels"] = num_head_channels

    # v-prediction checkpoints mark themselves with `parameterization: v` in the config.
    if (
        "parameterization" in original_config["model"]["params"]
        and original_config["model"]["params"]["parameterization"] == "v"
    ):
        if prediction_type is None:
            prediction_type = "v_prediction"
    else:
        if prediction_type is None:
            prediction_type = "epsilon"

    if image_size is None:
        image_size = 512

    num_train_timesteps = original_config["model"]["params"]["timesteps"]
    beta_start = original_config["model"]["params"]["linear_start"]
    beta_end = original_config["model"]["params"]["linear_end"]

    # A DDIM scheduler is always built first; other scheduler types are derived from its config.
    scheduler = DDIMScheduler(
        beta_end=beta_end,
        beta_schedule="scaled_linear",
        beta_start=beta_start,
        num_train_timesteps=num_train_timesteps,
        steps_offset=1,
        clip_sample=False,
        set_alpha_to_one=False,
        prediction_type=prediction_type,
    )
    # make sure scheduler works correctly with DDIM
    scheduler.register_to_config(clip_sample=False)

    if scheduler_type == "pndm":
        config = dict(scheduler.config)
        config["skip_prk_steps"] = True
        scheduler = PNDMScheduler.from_config(config)
    elif scheduler_type == "lms":
        scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
    elif scheduler_type == "heun":
        scheduler = HeunDiscreteScheduler.from_config(scheduler.config)
    elif scheduler_type == "euler":
        scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
    elif scheduler_type == "euler-ancestral":
        scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
    elif scheduler_type == "dpm":
        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
    elif scheduler_type == "ddim":
        scheduler = scheduler
    else:
        raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")

    # Convert the UNet2DModel
    unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
    unet = UNet2DConditionModel(**unet_config)

    converted_unet_checkpoint = convert_ldm_unet_checkpoint(
        checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema
    )

    unet.load_state_dict(converted_unet_checkpoint)

    # Convert the VAE model
    vae_config = create_vae_diffusers_config(original_config, checkpoint=checkpoint, image_size=image_size)
    converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)

    vae = AutoencoderKL(**vae_config)
    vae.load_state_dict(converted_vae_checkpoint)

    # Convert the text model
    # MusicLDM uses the same tokenizer as the original CLAP model, but a slightly different configuration
    config = ClapConfig.from_pretrained("laion/clap-htsat-unfused")
    config.audio_config.update(
        {
            "patch_embeds_hidden_size": 128,
            "hidden_size": 1024,
            "depths": [2, 2, 12, 2],
        }
    )
    tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
    feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")

    converted_text_model = convert_open_clap_checkpoint(checkpoint)
    text_model = ClapModel(config)

    missing_keys, unexpected_keys = text_model.load_state_dict(converted_text_model, strict=False)
    # we expect not to have token_type_ids in our original state dict so let's ignore them
    missing_keys = list(set(missing_keys) - set(CLAP_EXPECTED_MISSING_KEYS))

    if len(unexpected_keys) > 0:
        raise ValueError(f"Unexpected keys when loading CLAP model: {unexpected_keys}")

    if len(missing_keys) > 0:
        raise ValueError(f"Missing keys when loading CLAP model: {missing_keys}")

    # Convert the vocoder model
    vocoder_config = create_transformers_vocoder_config(original_config)
    vocoder_config = SpeechT5HifiGanConfig(**vocoder_config)
    converted_vocoder_checkpoint = convert_hifigan_checkpoint(checkpoint, vocoder_config)

    vocoder = SpeechT5HifiGan(vocoder_config)
    vocoder.load_state_dict(converted_vocoder_checkpoint)

    # Instantiate the diffusers pipeline
    pipe = MusicLDMPipeline(
        vae=vae,
        text_encoder=text_model,
        tokenizer=tokenizer,
        unet=unet,
        scheduler=scheduler,
        vocoder=vocoder,
        feature_extractor=feature_extractor,
    )

    return pipe
968
+
969
+
970
+ if __name__ == "__main__":
971
+ parser = argparse.ArgumentParser()
972
+
973
+ parser.add_argument(
974
+ "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
975
+ )
976
+ parser.add_argument(
977
+ "--original_config_file",
978
+ default=None,
979
+ type=str,
980
+ help="The YAML config file corresponding to the original architecture.",
981
+ )
982
+ parser.add_argument(
983
+ "--num_in_channels",
984
+ default=None,
985
+ type=int,
986
+ help="The number of input channels. If `None` number of input channels will be automatically inferred.",
987
+ )
988
+ parser.add_argument(
989
+ "--model_channels",
990
+ default=None,
991
+ type=int,
992
+ help="The number of UNet model channels. If `None`, it will be automatically inferred from the config. Override"
993
+ " to 128 for the small checkpoints, 192 for the medium checkpoints and 256 for the large.",
994
+ )
995
+ parser.add_argument(
996
+ "--num_head_channels",
997
+ default=None,
998
+ type=int,
999
+ help="The number of UNet head channels. If `None`, it will be automatically inferred from the config. Override"
1000
+ " to 32 for the small and medium checkpoints, and 64 for the large.",
1001
+ )
1002
+ parser.add_argument(
1003
+ "--scheduler_type",
1004
+ default="ddim",
1005
+ type=str,
1006
+ help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancestral', 'dpm']",
1007
+ )
1008
+ parser.add_argument(
1009
+ "--image_size",
1010
+ default=None,
1011
+ type=int,
1012
+ help=("The image size that the model was trained on."),
1013
+ )
1014
+ parser.add_argument(
1015
+ "--prediction_type",
1016
+ default=None,
1017
+ type=str,
1018
+ help=("The prediction type that the model was trained on."),
1019
+ )
1020
+ parser.add_argument(
1021
+ "--extract_ema",
1022
+ action="store_true",
1023
+ help=(
1024
+ "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
1025
+ " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
1026
+ " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
1027
+ ),
1028
+ )
1029
+ parser.add_argument(
1030
+ "--from_safetensors",
1031
+ action="store_true",
1032
+ help="If `--checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.",
1033
+ )
1034
+ parser.add_argument(
1035
+ "--to_safetensors",
1036
+ action="store_true",
1037
+ help="Whether to store pipeline in safetensors format or not.",
1038
+ )
1039
+ parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
1040
+ parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")
1041
+ args = parser.parse_args()
1042
+
1043
+ pipe = load_pipeline_from_original_MusicLDM_ckpt(
1044
+ checkpoint_path=args.checkpoint_path,
1045
+ original_config_file=args.original_config_file,
1046
+ image_size=args.image_size,
1047
+ prediction_type=args.prediction_type,
1048
+ extract_ema=args.extract_ema,
1049
+ scheduler_type=args.scheduler_type,
1050
+ num_in_channels=args.num_in_channels,
1051
+ model_channels=args.model_channels,
1052
+ num_head_channels=args.num_head_channels,
1053
+ from_safetensors=args.from_safetensors,
1054
+ device=args.device,
1055
+ )
1056
+ pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)
diffusers/scripts/convert_pixart_sigma_to_diffusers.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+
4
+ import torch
5
+ from transformers import T5EncoderModel, T5Tokenizer
6
+
7
+ from diffusers import AutoencoderKL, DPMSolverMultistepScheduler, PixArtSigmaPipeline, Transformer2DModel
8
+
9
+
10
# Hub organization that hosts the reference PixArt-Sigma VAE/T5 components.
ckpt_id = "PixArt-alpha"
# Positional-embedding interpolation scale per training resolution, mirroring
# https://github.com/PixArt-alpha/PixArt-sigma/blob/dd087141864e30ec44f12cb7448dd654be065e88/scripts/inference.py#L158
interpolation_scale = {256: 0.5, 512: 1, 1024: 2, 2048: 4}
13
+
14
+
15
def main(args):
    """Convert an original PixArt-Sigma checkpoint into diffusers format.

    Remaps the original state-dict keys onto ``Transformer2DModel`` names and
    saves either the bare transformer (``--only_transformer``, the default) or
    a full ``PixArtSigmaPipeline`` with the reference VAE/T5 components.

    Args:
        args: parsed CLI namespace (``orig_ckpt_path``, ``image_size``,
            ``micro_condition``, ``qk_norm``, ``only_transformer``, ``dump_path``).
    """
    all_state_dict = torch.load(args.orig_ckpt_path)
    state_dict = all_state_dict.pop("state_dict")
    converted_state_dict = {}

    # Patch embeddings.
    converted_state_dict["pos_embed.proj.weight"] = state_dict.pop("x_embedder.proj.weight")
    converted_state_dict["pos_embed.proj.bias"] = state_dict.pop("x_embedder.proj.bias")

    # Caption projection.
    converted_state_dict["caption_projection.linear_1.weight"] = state_dict.pop("y_embedder.y_proj.fc1.weight")
    converted_state_dict["caption_projection.linear_1.bias"] = state_dict.pop("y_embedder.y_proj.fc1.bias")
    converted_state_dict["caption_projection.linear_2.weight"] = state_dict.pop("y_embedder.y_proj.fc2.weight")
    converted_state_dict["caption_projection.linear_2.bias"] = state_dict.pop("y_embedder.y_proj.fc2.bias")

    # AdaLN-single LN
    converted_state_dict["adaln_single.emb.timestep_embedder.linear_1.weight"] = state_dict.pop(
        "t_embedder.mlp.0.weight"
    )
    converted_state_dict["adaln_single.emb.timestep_embedder.linear_1.bias"] = state_dict.pop("t_embedder.mlp.0.bias")
    converted_state_dict["adaln_single.emb.timestep_embedder.linear_2.weight"] = state_dict.pop(
        "t_embedder.mlp.2.weight"
    )
    converted_state_dict["adaln_single.emb.timestep_embedder.linear_2.bias"] = state_dict.pop("t_embedder.mlp.2.bias")

    if args.micro_condition:
        # Resolution conditioning (only present in PixArtMS-style checkpoints).
        converted_state_dict["adaln_single.emb.resolution_embedder.linear_1.weight"] = state_dict.pop(
            "csize_embedder.mlp.0.weight"
        )
        converted_state_dict["adaln_single.emb.resolution_embedder.linear_1.bias"] = state_dict.pop(
            "csize_embedder.mlp.0.bias"
        )
        converted_state_dict["adaln_single.emb.resolution_embedder.linear_2.weight"] = state_dict.pop(
            "csize_embedder.mlp.2.weight"
        )
        converted_state_dict["adaln_single.emb.resolution_embedder.linear_2.bias"] = state_dict.pop(
            "csize_embedder.mlp.2.bias"
        )
        # Aspect ratio conditioning.
        converted_state_dict["adaln_single.emb.aspect_ratio_embedder.linear_1.weight"] = state_dict.pop(
            "ar_embedder.mlp.0.weight"
        )
        converted_state_dict["adaln_single.emb.aspect_ratio_embedder.linear_1.bias"] = state_dict.pop(
            "ar_embedder.mlp.0.bias"
        )
        converted_state_dict["adaln_single.emb.aspect_ratio_embedder.linear_2.weight"] = state_dict.pop(
            "ar_embedder.mlp.2.weight"
        )
        converted_state_dict["adaln_single.emb.aspect_ratio_embedder.linear_2.bias"] = state_dict.pop(
            "ar_embedder.mlp.2.bias"
        )
    # Shared norm.
    converted_state_dict["adaln_single.linear.weight"] = state_dict.pop("t_block.1.weight")
    converted_state_dict["adaln_single.linear.bias"] = state_dict.pop("t_block.1.bias")

    for depth in range(28):
        # Transformer blocks.
        converted_state_dict[f"transformer_blocks.{depth}.scale_shift_table"] = state_dict.pop(
            f"blocks.{depth}.scale_shift_table"
        )
        # Attention is all you need 🤘

        # Self attention: the original fuses q/k/v into one projection.
        q, k, v = torch.chunk(state_dict.pop(f"blocks.{depth}.attn.qkv.weight"), 3, dim=0)
        q_bias, k_bias, v_bias = torch.chunk(state_dict.pop(f"blocks.{depth}.attn.qkv.bias"), 3, dim=0)
        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_q.weight"] = q
        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_q.bias"] = q_bias
        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_k.weight"] = k
        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_k.bias"] = k_bias
        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_v.weight"] = v
        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_v.bias"] = v_bias
        # Projection.
        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_out.0.weight"] = state_dict.pop(
            f"blocks.{depth}.attn.proj.weight"
        )
        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_out.0.bias"] = state_dict.pop(
            f"blocks.{depth}.attn.proj.bias"
        )
        if args.qk_norm:
            converted_state_dict[f"transformer_blocks.{depth}.attn1.q_norm.weight"] = state_dict.pop(
                f"blocks.{depth}.attn.q_norm.weight"
            )
            converted_state_dict[f"transformer_blocks.{depth}.attn1.q_norm.bias"] = state_dict.pop(
                f"blocks.{depth}.attn.q_norm.bias"
            )
            converted_state_dict[f"transformer_blocks.{depth}.attn1.k_norm.weight"] = state_dict.pop(
                f"blocks.{depth}.attn.k_norm.weight"
            )
            converted_state_dict[f"transformer_blocks.{depth}.attn1.k_norm.bias"] = state_dict.pop(
                f"blocks.{depth}.attn.k_norm.bias"
            )

        # Feed-forward.
        converted_state_dict[f"transformer_blocks.{depth}.ff.net.0.proj.weight"] = state_dict.pop(
            f"blocks.{depth}.mlp.fc1.weight"
        )
        converted_state_dict[f"transformer_blocks.{depth}.ff.net.0.proj.bias"] = state_dict.pop(
            f"blocks.{depth}.mlp.fc1.bias"
        )
        converted_state_dict[f"transformer_blocks.{depth}.ff.net.2.weight"] = state_dict.pop(
            f"blocks.{depth}.mlp.fc2.weight"
        )
        converted_state_dict[f"transformer_blocks.{depth}.ff.net.2.bias"] = state_dict.pop(
            f"blocks.{depth}.mlp.fc2.bias"
        )

        # Cross-attention: q is separate, k/v share one fused projection.
        q = state_dict.pop(f"blocks.{depth}.cross_attn.q_linear.weight")
        q_bias = state_dict.pop(f"blocks.{depth}.cross_attn.q_linear.bias")
        k, v = torch.chunk(state_dict.pop(f"blocks.{depth}.cross_attn.kv_linear.weight"), 2, dim=0)
        k_bias, v_bias = torch.chunk(state_dict.pop(f"blocks.{depth}.cross_attn.kv_linear.bias"), 2, dim=0)

        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_q.weight"] = q
        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_q.bias"] = q_bias
        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_k.weight"] = k
        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_k.bias"] = k_bias
        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_v.weight"] = v
        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_v.bias"] = v_bias

        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_out.0.weight"] = state_dict.pop(
            f"blocks.{depth}.cross_attn.proj.weight"
        )
        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_out.0.bias"] = state_dict.pop(
            f"blocks.{depth}.cross_attn.proj.bias"
        )

    # Final block.
    converted_state_dict["proj_out.weight"] = state_dict.pop("final_layer.linear.weight")
    converted_state_dict["proj_out.bias"] = state_dict.pop("final_layer.linear.bias")
    converted_state_dict["scale_shift_table"] = state_dict.pop("final_layer.scale_shift_table")

    # PixArt XL/2 architecture (28 layers, 16 heads x 72 dims).
    transformer = Transformer2DModel(
        sample_size=args.image_size // 8,
        num_layers=28,
        attention_head_dim=72,
        in_channels=4,
        out_channels=8,
        patch_size=2,
        attention_bias=True,
        num_attention_heads=16,
        cross_attention_dim=1152,
        activation_fn="gelu-approximate",
        num_embeds_ada_norm=1000,
        norm_type="ada_norm_single",
        norm_elementwise_affine=False,
        norm_eps=1e-6,
        caption_channels=4096,
        interpolation_scale=interpolation_scale[args.image_size],
        use_additional_conditions=args.micro_condition,
    )
    transformer.load_state_dict(converted_state_dict, strict=True)

    assert transformer.pos_embed.pos_embed is not None
    # Keys with no diffusers counterpart. Pop each one individually so that a
    # missing key does not prevent removal of the others (the original
    # try/except stopped at the first KeyError, leaving later keys behind and
    # tripping the emptiness assert below).
    for unused_key in ("y_embedder.y_embedding", "pos_embed"):
        if state_dict.pop(unused_key, None) is None:
            print(f"Skipping missing key {unused_key!r}")
    assert len(state_dict) == 0, f"State dict is not empty, {state_dict.keys()}"

    num_model_params = sum(p.numel() for p in transformer.parameters())
    print(f"Total number of transformer parameters: {num_model_params}")

    if args.only_transformer:
        transformer.save_pretrained(os.path.join(args.dump_path, "transformer"))
    else:
        # pixart-Sigma vae link: https://huggingface.co/PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers/tree/main/vae
        vae = AutoencoderKL.from_pretrained(f"{ckpt_id}/pixart_sigma_sdxlvae_T5_diffusers", subfolder="vae")

        scheduler = DPMSolverMultistepScheduler()

        tokenizer = T5Tokenizer.from_pretrained(f"{ckpt_id}/pixart_sigma_sdxlvae_T5_diffusers", subfolder="tokenizer")
        text_encoder = T5EncoderModel.from_pretrained(
            f"{ckpt_id}/pixart_sigma_sdxlvae_T5_diffusers", subfolder="text_encoder"
        )

        pipeline = PixArtSigmaPipeline(
            tokenizer=tokenizer, text_encoder=text_encoder, transformer=transformer, vae=vae, scheduler=scheduler
        )

        pipeline.save_pretrained(args.dump_path)
199
+
200
+
201
+ if __name__ == "__main__":
202
+ parser = argparse.ArgumentParser()
203
+
204
+ parser.add_argument(
205
+ "--micro_condition", action="store_true", help="If use Micro-condition in PixArtMS structure during training."
206
+ )
207
+ parser.add_argument("--qk_norm", action="store_true", help="If use qk norm during training.")
208
+ parser.add_argument(
209
+ "--orig_ckpt_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
210
+ )
211
+ parser.add_argument(
212
+ "--image_size",
213
+ default=1024,
214
+ type=int,
215
+ choices=[256, 512, 1024, 2048],
216
+ required=False,
217
+ help="Image size of pretrained model, 256, 512, 1024, or 2048.",
218
+ )
219
+ parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output pipeline.")
220
+ parser.add_argument("--only_transformer", default=True, type=bool, required=True)
221
+
222
+ args = parser.parse_args()
223
+ main(args)
diffusers/scripts/convert_sana_to_diffusers.py ADDED
@@ -0,0 +1,456 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import os
6
+ from contextlib import nullcontext
7
+
8
+ import torch
9
+ from accelerate import init_empty_weights
10
+ from huggingface_hub import hf_hub_download, snapshot_download
11
+ from termcolor import colored
12
+ from transformers import AutoModelForCausalLM, AutoTokenizer
13
+
14
+ from diffusers import (
15
+ AutoencoderDC,
16
+ DPMSolverMultistepScheduler,
17
+ FlowMatchEulerDiscreteScheduler,
18
+ SanaPipeline,
19
+ SanaSprintPipeline,
20
+ SanaTransformer2DModel,
21
+ SCMScheduler,
22
+ )
23
+ from diffusers.models.modeling_utils import load_model_dict_into_meta
24
+ from diffusers.utils.import_utils import is_accelerate_available
25
+
26
+
27
# `init_empty_weights` lets us instantiate the transformer on the meta device
# (no real allocations); fall back to a no-op context manager when accelerate
# is not installed.
# NOTE: the availability check must be *called* — the bare function object is
# always truthy, which would wrongly select `init_empty_weights` even when
# accelerate is missing.
CTX = init_empty_weights if is_accelerate_available() else nullcontext

# Known Hub checkpoint paths ("org/repo/path/inside/repo"); the first entry is
# the default when no --orig_ckpt_path is given.
# NOTE: the first two entries were missing trailing commas, so Python's
# implicit string concatenation silently fused three paths into one broken
# entry (and made the default checkpoint unusable).
ckpt_ids = [
    "Efficient-Large-Model/Sana_Sprint_0.6B_1024px/checkpoints/Sana_Sprint_0.6B_1024px.pth",
    "Efficient-Large-Model/Sana_Sprint_1.6B_1024px/checkpoints/Sana_Sprint_1.6B_1024px.pth",
    "Efficient-Large-Model/SANA1.5_4.8B_1024px/checkpoints/SANA1.5_4.8B_1024px.pth",
    "Efficient-Large-Model/SANA1.5_1.6B_1024px/checkpoints/SANA1.5_1.6B_1024px.pth",
    "Efficient-Large-Model/Sana_1600M_4Kpx_BF16/checkpoints/Sana_1600M_4Kpx_BF16.pth",
    "Efficient-Large-Model/Sana_1600M_2Kpx_BF16/checkpoints/Sana_1600M_2Kpx_BF16.pth",
    "Efficient-Large-Model/Sana_1600M_1024px_MultiLing/checkpoints/Sana_1600M_1024px_MultiLing.pth",
    "Efficient-Large-Model/Sana_1600M_1024px_BF16/checkpoints/Sana_1600M_1024px_BF16.pth",
    "Efficient-Large-Model/Sana_1600M_512px_MultiLing/checkpoints/Sana_1600M_512px_MultiLing.pth",
    "Efficient-Large-Model/Sana_1600M_1024px/checkpoints/Sana_1600M_1024px.pth",
    "Efficient-Large-Model/Sana_1600M_512px/checkpoints/Sana_1600M_512px.pth",
    "Efficient-Large-Model/Sana_600M_1024px/checkpoints/Sana_600M_1024px_MultiLing.pth",
    "Efficient-Large-Model/Sana_600M_512px/checkpoints/Sana_600M_512px_MultiLing.pth",
]
# https://github.com/NVlabs/Sana/blob/main/scripts/inference.py
45
+
46
+
47
def main(args):
    """Convert an original Sana / Sana-Sprint checkpoint into diffusers format.

    Resolves the checkpoint (local path or Hub download), remaps the original
    state-dict keys onto ``SanaTransformer2DModel`` names, then saves either
    the bare transformer or a full ``SanaPipeline``/``SanaSprintPipeline``
    depending on ``--save_full_pipeline``.

    Relies on the module-level globals ``model_kwargs`` and ``weight_dtype``
    populated in the ``__main__`` section.
    """
    cache_dir_path = os.path.expanduser("~/.cache/huggingface/hub")

    if args.orig_ckpt_path is None or args.orig_ckpt_path in ckpt_ids:
        # Not a local file: fetch the (default) checkpoint from the Hub.
        ckpt_id = args.orig_ckpt_path or ckpt_ids[0]
        snapshot_download(
            repo_id=f"{'/'.join(ckpt_id.split('/')[:2])}",
            cache_dir=cache_dir_path,
            repo_type="model",
        )
        file_path = hf_hub_download(
            repo_id=f"{'/'.join(ckpt_id.split('/')[:2])}",
            filename=f"{'/'.join(ckpt_id.split('/')[2:])}",
            cache_dir=cache_dir_path,
            repo_type="model",
        )
    else:
        file_path = args.orig_ckpt_path

    print(colored(f"Loading checkpoint from {file_path}", "green", attrs=["bold"]))
    all_state_dict = torch.load(file_path, weights_only=True)
    state_dict = all_state_dict.pop("state_dict")
    converted_state_dict = {}

    # Patch embeddings.
    converted_state_dict["patch_embed.proj.weight"] = state_dict.pop("x_embedder.proj.weight")
    converted_state_dict["patch_embed.proj.bias"] = state_dict.pop("x_embedder.proj.bias")

    # Caption projection.
    converted_state_dict["caption_projection.linear_1.weight"] = state_dict.pop("y_embedder.y_proj.fc1.weight")
    converted_state_dict["caption_projection.linear_1.bias"] = state_dict.pop("y_embedder.y_proj.fc1.bias")
    converted_state_dict["caption_projection.linear_2.weight"] = state_dict.pop("y_embedder.y_proj.fc2.weight")
    converted_state_dict["caption_projection.linear_2.bias"] = state_dict.pop("y_embedder.y_proj.fc2.bias")

    # Handle different time embedding structure based on model type.
    if args.model_type in ["SanaSprint_1600M_P1_D20", "SanaSprint_600M_P1_D28"]:
        # Sana Sprint: flat time embedder plus a guidance (cfg) embedder.
        converted_state_dict["time_embed.timestep_embedder.linear_1.weight"] = state_dict.pop(
            "t_embedder.mlp.0.weight"
        )
        converted_state_dict["time_embed.timestep_embedder.linear_1.bias"] = state_dict.pop("t_embedder.mlp.0.bias")
        converted_state_dict["time_embed.timestep_embedder.linear_2.weight"] = state_dict.pop(
            "t_embedder.mlp.2.weight"
        )
        converted_state_dict["time_embed.timestep_embedder.linear_2.bias"] = state_dict.pop("t_embedder.mlp.2.bias")

        # Guidance embedder for Sana Sprint.
        converted_state_dict["time_embed.guidance_embedder.linear_1.weight"] = state_dict.pop(
            "cfg_embedder.mlp.0.weight"
        )
        converted_state_dict["time_embed.guidance_embedder.linear_1.bias"] = state_dict.pop("cfg_embedder.mlp.0.bias")
        converted_state_dict["time_embed.guidance_embedder.linear_2.weight"] = state_dict.pop(
            "cfg_embedder.mlp.2.weight"
        )
        converted_state_dict["time_embed.guidance_embedder.linear_2.bias"] = state_dict.pop("cfg_embedder.mlp.2.bias")
    else:
        # Original Sana: time embedder nested one level deeper (`.emb.`).
        converted_state_dict["time_embed.emb.timestep_embedder.linear_1.weight"] = state_dict.pop(
            "t_embedder.mlp.0.weight"
        )
        converted_state_dict["time_embed.emb.timestep_embedder.linear_1.bias"] = state_dict.pop(
            "t_embedder.mlp.0.bias"
        )
        converted_state_dict["time_embed.emb.timestep_embedder.linear_2.weight"] = state_dict.pop(
            "t_embedder.mlp.2.weight"
        )
        converted_state_dict["time_embed.emb.timestep_embedder.linear_2.bias"] = state_dict.pop(
            "t_embedder.mlp.2.bias"
        )

    # Shared norm.
    converted_state_dict["time_embed.linear.weight"] = state_dict.pop("t_block.1.weight")
    converted_state_dict["time_embed.linear.bias"] = state_dict.pop("t_block.1.bias")

    # y norm
    converted_state_dict["caption_norm.weight"] = state_dict.pop("attention_y_norm.weight")

    # Scheduler flow shift: the 4K model uses a larger shift.
    if args.image_size == 4096:
        flow_shift = 6.0
    else:
        flow_shift = 3.0

    # Model depth per variant.
    if args.model_type in ["SanaMS_1600M_P1_D20", "SanaSprint_1600M_P1_D20", "SanaMS1.5_1600M_P1_D20"]:
        layer_num = 20
    elif args.model_type in ["SanaMS_600M_P1_D28", "SanaSprint_600M_P1_D28"]:
        layer_num = 28
    elif args.model_type in ["SanaMS_4800M_P1_D60", "SanaMS1.5_4800M_P1_D60"]:
        # The CLI exposes the 4.8B variant as "SanaMS1.5_4800M_P1_D60"; the
        # original equality check against "SanaMS_4800M_P1_D60" alone never
        # matched it, so the 4.8B model fell through to the ValueError below.
        layer_num = 60
    else:
        raise ValueError(f"{args.model_type} is not supported.")
    # Positional embedding interpolation scale.
    interpolation_scale = {512: None, 1024: None, 2048: 1.0, 4096: 2.0}
    # Sana-1.5 and Sana-Sprint checkpoints carry Q/K RMS-norm weights.
    qk_norm = (
        "rms_norm_across_heads"
        if args.model_type
        in ["SanaMS1.5_1600M_P1_D20", "SanaMS1.5_4800M_P1_D60", "SanaSprint_600M_P1_D28", "SanaSprint_1600M_P1_D20"]
        else None
    )

    for depth in range(layer_num):
        # Transformer blocks.
        converted_state_dict[f"transformer_blocks.{depth}.scale_shift_table"] = state_dict.pop(
            f"blocks.{depth}.scale_shift_table"
        )

        # Linear Attention is all you need 🤘
        # Self attention (fused qkv, no bias).
        q, k, v = torch.chunk(state_dict.pop(f"blocks.{depth}.attn.qkv.weight"), 3, dim=0)
        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_q.weight"] = q
        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_k.weight"] = k
        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_v.weight"] = v
        if qk_norm is not None:
            # Q/K normalization for self-attention (attn1) - Sana-Sprint and Sana-1.5.
            converted_state_dict[f"transformer_blocks.{depth}.attn1.norm_q.weight"] = state_dict.pop(
                f"blocks.{depth}.attn.q_norm.weight"
            )
            converted_state_dict[f"transformer_blocks.{depth}.attn1.norm_k.weight"] = state_dict.pop(
                f"blocks.{depth}.attn.k_norm.weight"
            )
        # Projection.
        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_out.0.weight"] = state_dict.pop(
            f"blocks.{depth}.attn.proj.weight"
        )
        converted_state_dict[f"transformer_blocks.{depth}.attn1.to_out.0.bias"] = state_dict.pop(
            f"blocks.{depth}.attn.proj.bias"
        )

        # Feed-forward (GLU-style mix of inverted/depthwise/pointwise convs).
        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_inverted.weight"] = state_dict.pop(
            f"blocks.{depth}.mlp.inverted_conv.conv.weight"
        )
        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_inverted.bias"] = state_dict.pop(
            f"blocks.{depth}.mlp.inverted_conv.conv.bias"
        )
        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_depth.weight"] = state_dict.pop(
            f"blocks.{depth}.mlp.depth_conv.conv.weight"
        )
        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_depth.bias"] = state_dict.pop(
            f"blocks.{depth}.mlp.depth_conv.conv.bias"
        )
        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_point.weight"] = state_dict.pop(
            f"blocks.{depth}.mlp.point_conv.conv.weight"
        )

        # Cross-attention: q is separate, k/v share one fused projection.
        q = state_dict.pop(f"blocks.{depth}.cross_attn.q_linear.weight")
        q_bias = state_dict.pop(f"blocks.{depth}.cross_attn.q_linear.bias")
        k, v = torch.chunk(state_dict.pop(f"blocks.{depth}.cross_attn.kv_linear.weight"), 2, dim=0)
        k_bias, v_bias = torch.chunk(state_dict.pop(f"blocks.{depth}.cross_attn.kv_linear.bias"), 2, dim=0)

        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_q.weight"] = q
        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_q.bias"] = q_bias
        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_k.weight"] = k
        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_k.bias"] = k_bias
        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_v.weight"] = v
        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_v.bias"] = v_bias
        if qk_norm is not None:
            # Q/K normalization for cross-attention (attn2) - Sana-Sprint and Sana-1.5.
            converted_state_dict[f"transformer_blocks.{depth}.attn2.norm_q.weight"] = state_dict.pop(
                f"blocks.{depth}.cross_attn.q_norm.weight"
            )
            converted_state_dict[f"transformer_blocks.{depth}.attn2.norm_k.weight"] = state_dict.pop(
                f"blocks.{depth}.cross_attn.k_norm.weight"
            )

        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_out.0.weight"] = state_dict.pop(
            f"blocks.{depth}.cross_attn.proj.weight"
        )
        converted_state_dict[f"transformer_blocks.{depth}.attn2.to_out.0.bias"] = state_dict.pop(
            f"blocks.{depth}.cross_attn.proj.bias"
        )

    # Final block.
    converted_state_dict["proj_out.weight"] = state_dict.pop("final_layer.linear.weight")
    converted_state_dict["proj_out.bias"] = state_dict.pop("final_layer.linear.bias")
    converted_state_dict["scale_shift_table"] = state_dict.pop("final_layer.scale_shift_table")

    # Transformer: build on the meta device when accelerate is available.
    with CTX():
        transformer_kwargs = {
            "in_channels": 32,
            "out_channels": 32,
            "num_attention_heads": model_kwargs[args.model_type]["num_attention_heads"],
            "attention_head_dim": model_kwargs[args.model_type]["attention_head_dim"],
            "num_layers": model_kwargs[args.model_type]["num_layers"],
            "num_cross_attention_heads": model_kwargs[args.model_type]["num_cross_attention_heads"],
            "cross_attention_head_dim": model_kwargs[args.model_type]["cross_attention_head_dim"],
            "cross_attention_dim": model_kwargs[args.model_type]["cross_attention_dim"],
            "caption_channels": 2304,
            "mlp_ratio": 2.5,
            "attention_bias": False,
            "sample_size": args.image_size // 32,
            "patch_size": 1,
            "norm_elementwise_affine": False,
            "norm_eps": 1e-6,
            "interpolation_scale": interpolation_scale[args.image_size],
        }

        # qk_norm / guidance embeds for Sana-1.5 and Sana Sprint variants.
        if args.model_type in [
            "SanaMS1.5_1600M_P1_D20",
            "SanaMS1.5_4800M_P1_D60",
            "SanaSprint_600M_P1_D28",
            "SanaSprint_1600M_P1_D20",
        ]:
            transformer_kwargs["qk_norm"] = "rms_norm_across_heads"
        if args.model_type in ["SanaSprint_1600M_P1_D20", "SanaSprint_600M_P1_D28"]:
            transformer_kwargs["guidance_embeds"] = True

        transformer = SanaTransformer2DModel(**transformer_kwargs)

    if is_accelerate_available():
        load_model_dict_into_meta(transformer, converted_state_dict)
    else:
        transformer.load_state_dict(converted_state_dict, strict=True, assign=True)

    # Keys with no diffusers counterpart. Pop each one individually so that a
    # missing key does not prevent removal of the others (the original
    # try/except aborted at the first KeyError, leaving later keys behind and
    # tripping the emptiness assert below).
    for unused_key in ("y_embedder.y_embedding", "pos_embed", "logvar_linear.weight", "logvar_linear.bias"):
        if state_dict.pop(unused_key, None) is None:
            print(f"{unused_key} not found in the state_dict")

    assert len(state_dict) == 0, f"State dict is not empty, {state_dict.keys()}"

    num_model_params = sum(p.numel() for p in transformer.parameters())
    print(f"Total number of transformer parameters: {num_model_params}")

    transformer = transformer.to(weight_dtype)

    if not args.save_full_pipeline:
        print(
            colored(
                f"Only saving transformer model of {args.model_type}. "
                f"Set --save_full_pipeline to save the whole Pipeline",
                "green",
                attrs=["bold"],
            )
        )
        transformer.save_pretrained(
            os.path.join(args.dump_path, "transformer"), safe_serialization=True, max_shard_size="5GB"
        )
    else:
        print(colored(f"Saving the whole Pipeline containing {args.model_type}", "green", attrs=["bold"]))
        # VAE
        ae = AutoencoderDC.from_pretrained("mit-han-lab/dc-ae-f32c32-sana-1.1-diffusers", torch_dtype=torch.float32)

        # Text Encoder
        text_encoder_model_path = "Efficient-Large-Model/gemma-2-2b-it"
        tokenizer = AutoTokenizer.from_pretrained(text_encoder_model_path)
        tokenizer.padding_side = "right"
        text_encoder = AutoModelForCausalLM.from_pretrained(
            text_encoder_model_path, torch_dtype=torch.bfloat16
        ).get_decoder()

        # Choose the appropriate pipeline and scheduler based on model type.
        if args.model_type in ["SanaSprint_1600M_P1_D20", "SanaSprint_600M_P1_D28"]:
            # Force SCM Scheduler for Sana Sprint regardless of scheduler_type.
            if args.scheduler_type != "scm":
                print(
                    colored(
                        f"Warning: Overriding scheduler_type '{args.scheduler_type}' to 'scm' for SanaSprint model",
                        "yellow",
                        attrs=["bold"],
                    )
                )

            # SCM Scheduler for Sana Sprint
            scheduler_config = {
                "prediction_type": "trigflow",
                "sigma_data": 0.5,
            }
            scheduler = SCMScheduler(**scheduler_config)
            pipe = SanaSprintPipeline(
                tokenizer=tokenizer,
                text_encoder=text_encoder,
                transformer=transformer,
                vae=ae,
                scheduler=scheduler,
            )
        else:
            # Original Sana scheduler
            if args.scheduler_type == "flow-dpm_solver":
                scheduler = DPMSolverMultistepScheduler(
                    flow_shift=flow_shift,
                    use_flow_sigmas=True,
                    prediction_type="flow_prediction",
                )
            elif args.scheduler_type == "flow-euler":
                scheduler = FlowMatchEulerDiscreteScheduler(shift=flow_shift)
            else:
                raise ValueError(f"Scheduler type {args.scheduler_type} is not supported")

            pipe = SanaPipeline(
                tokenizer=tokenizer,
                text_encoder=text_encoder,
                transformer=transformer,
                vae=ae,
                scheduler=scheduler,
            )

        pipe.save_pretrained(args.dump_path, safe_serialization=True, max_shard_size="5GB")
353
+
354
+
355
# CLI --dtype flag -> torch dtype used for the exported transformer weights.
DTYPE_MAPPING = {
    "fp32": torch.float32,
    "fp16": torch.float16,
    "bf16": torch.bfloat16,
}


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--orig_ckpt_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
    )
    parser.add_argument(
        "--image_size",
        default=1024,
        type=int,
        choices=[512, 1024, 2048, 4096],
        required=False,
        help="Image size of pretrained model, 512, 1024, 2048 or 4096.",
    )
    parser.add_argument(
        "--model_type",
        default="SanaMS_1600M_P1_D20",
        type=str,
        choices=[
            "SanaMS_1600M_P1_D20",
            "SanaMS_600M_P1_D28",
            "SanaMS1.5_1600M_P1_D20",
            "SanaMS1.5_4800M_P1_D60",
            "SanaSprint_1600M_P1_D20",
            "SanaSprint_600M_P1_D28",
        ],
    )
    parser.add_argument(
        "--scheduler_type",
        default="flow-dpm_solver",
        type=str,
        choices=["flow-dpm_solver", "flow-euler", "scm"],
        help="Scheduler type to use. Use 'scm' for Sana Sprint models.",
    )
    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output pipeline.")
    parser.add_argument("--save_full_pipeline", action="store_true", help="save all the pipeline elements in one.")
    parser.add_argument("--dtype", default="fp32", type=str, choices=["fp32", "fp16", "bf16"], help="Weight dtype.")

    args = parser.parse_args()

    # Per-variant transformer hyper-parameters; `main` reads this module-level
    # global (as well as `weight_dtype` below).
    model_kwargs = {
        "SanaMS_1600M_P1_D20": {
            "num_attention_heads": 70,
            "attention_head_dim": 32,
            "num_cross_attention_heads": 20,
            "cross_attention_head_dim": 112,
            "cross_attention_dim": 2240,
            "num_layers": 20,
        },
        "SanaMS_600M_P1_D28": {
            "num_attention_heads": 36,
            "attention_head_dim": 32,
            "num_cross_attention_heads": 16,
            "cross_attention_head_dim": 72,
            "cross_attention_dim": 1152,
            "num_layers": 28,
        },
        "SanaMS1.5_1600M_P1_D20": {
            "num_attention_heads": 70,
            "attention_head_dim": 32,
            "num_cross_attention_heads": 20,
            "cross_attention_head_dim": 112,
            "cross_attention_dim": 2240,
            "num_layers": 20,
        },
        "SanaMS1.5_4800M_P1_D60": {
            "num_attention_heads": 70,
            "attention_head_dim": 32,
            "num_cross_attention_heads": 20,
            "cross_attention_head_dim": 112,
            "cross_attention_dim": 2240,
            "num_layers": 60,
        },
        "SanaSprint_600M_P1_D28": {
            "num_attention_heads": 36,
            "attention_head_dim": 32,
            "num_cross_attention_heads": 16,
            "cross_attention_head_dim": 72,
            "cross_attention_dim": 1152,
            "num_layers": 28,
        },
        "SanaSprint_1600M_P1_D20": {
            "num_attention_heads": 70,
            "attention_head_dim": 32,
            "num_cross_attention_heads": 20,
            "cross_attention_head_dim": 112,
            "cross_attention_dim": 2240,
            "num_layers": 20,
        },
    }

    # NOTE: the original also computed `device = "cuda" if ...`, but nothing
    # ever read it — removed as dead code.
    weight_dtype = DTYPE_MAPPING[args.dtype]

    main(args)
diffusers/scripts/convert_stable_cascade.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Run this script to convert the Stable Cascade model weights to a diffusers pipeline.
import argparse
from contextlib import nullcontext

import torch
from safetensors.torch import load_file
from transformers import (
    AutoTokenizer,
    CLIPConfig,
    CLIPImageProcessor,
    CLIPTextModelWithProjection,
    CLIPVisionModelWithProjection,
)

from diffusers import (
    DDPMWuerstchenScheduler,
    StableCascadeCombinedPipeline,
    StableCascadeDecoderPipeline,
    StableCascadePriorPipeline,
)
from diffusers.loaders.single_file_utils import convert_stable_cascade_unet_single_file_to_diffusers
from diffusers.models import StableCascadeUNet
from diffusers.models.modeling_utils import load_model_dict_into_meta
from diffusers.pipelines.wuerstchen import PaellaVQModel
from diffusers.utils import is_accelerate_available


if is_accelerate_available():
    from accelerate import init_empty_weights

parser = argparse.ArgumentParser(description="Convert Stable Cascade model weights to a diffusers pipeline")
parser.add_argument("--model_path", type=str, help="Location of Stable Cascade weights")
parser.add_argument("--stage_c_name", type=str, default="stage_c.safetensors", help="Name of stage c checkpoint file")
parser.add_argument("--stage_b_name", type=str, default="stage_b.safetensors", help="Name of stage b checkpoint file")
parser.add_argument("--skip_stage_c", action="store_true", help="Skip converting stage c")
parser.add_argument("--skip_stage_b", action="store_true", help="Skip converting stage b")
parser.add_argument("--use_safetensors", action="store_true", help="Use SafeTensors for conversion")
parser.add_argument(
    "--prior_output_path", default="stable-cascade-prior", type=str, help="Hub organization to save the pipelines to"
)
parser.add_argument(
    "--decoder_output_path",
    type=str,
    default="stable-cascade-decoder",
    help="Hub organization to save the pipelines to",
)
parser.add_argument(
    "--combined_output_path",
    type=str,
    default="stable-cascade-combined",
    help="Hub organization to save the pipelines to",
)
parser.add_argument("--save_combined", action="store_true")
parser.add_argument("--push_to_hub", action="store_true", help="Push to hub")
parser.add_argument("--variant", type=str, help="Set to bf16 to save bfloat16 weights")

args = parser.parse_args()

# Validate the stage-selection flags before any heavy downloading/loading.
if args.skip_stage_b and args.skip_stage_c:
    raise ValueError("At least one stage should be converted")
if (args.skip_stage_b or args.skip_stage_c) and args.save_combined:
    raise ValueError("Cannot skip stages when creating a combined pipeline")

model_path = args.model_path

device = "cpu"
dtype = torch.bfloat16 if args.variant == "bf16" else torch.float32

# Checkpoint locations for the two stages.
prior_checkpoint_path = f"{model_path}/{args.stage_c_name}"
decoder_checkpoint_path = f"{model_path}/{args.stage_b_name}"


def _read_checkpoint(path):
    # Load a raw state dict from disk, honoring the --use_safetensors flag.
    if args.use_safetensors:
        return load_file(path, device=device)
    return torch.load(path, map_location=device)


def _populate_weights(model, state_dict):
    # Copy converted weights into the model; use the meta-device loader when
    # accelerate is installed (the model was then built under init_empty_weights).
    if is_accelerate_available():
        load_model_dict_into_meta(model, state_dict)
    else:
        model.load_state_dict(state_dict)


# CLIP text encoder and tokenizer, shared by the prior and decoder pipelines.
config = CLIPConfig.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
config.text_config.projection_dim = config.projection_dim
text_encoder = CLIPTextModelWithProjection.from_pretrained(
    "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", config=config.text_config
)
tokenizer = AutoTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")

# Image processor and encoder used by the prior for image conditioning.
feature_extractor = CLIPImageProcessor()
image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")

# Scheduler shared by prior and decoder.
scheduler = DDPMWuerstchenScheduler()
ctx = init_empty_weights if is_accelerate_available() else nullcontext

if not args.skip_stage_c:
    # Stage C (prior).
    prior_state_dict = convert_stable_cascade_unet_single_file_to_diffusers(
        _read_checkpoint(prior_checkpoint_path)
    )

    with ctx():
        prior_model = StableCascadeUNet(
            in_channels=16,
            out_channels=16,
            timestep_ratio_embedding_dim=64,
            patch_size=1,
            conditioning_dim=2048,
            block_out_channels=[2048, 2048],
            num_attention_heads=[32, 32],
            down_num_layers_per_block=[8, 24],
            up_num_layers_per_block=[24, 8],
            down_blocks_repeat_mappers=[1, 1],
            up_blocks_repeat_mappers=[1, 1],
            block_types_per_layer=[
                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
            ],
            clip_text_in_channels=1280,
            clip_text_pooled_in_channels=1280,
            clip_image_in_channels=768,
            clip_seq=4,
            kernel_size=3,
            dropout=[0.1, 0.1],
            self_attn=True,
            timestep_conditioning_type=["sca", "crp"],
            switch_level=[False],
        )
    _populate_weights(prior_model, prior_state_dict)

    # Prior pipeline
    prior_pipeline = StableCascadePriorPipeline(
        prior=prior_model,
        tokenizer=tokenizer,
        text_encoder=text_encoder,
        image_encoder=image_encoder,
        scheduler=scheduler,
        feature_extractor=feature_extractor,
    )
    prior_pipeline.to(dtype).save_pretrained(
        args.prior_output_path, push_to_hub=args.push_to_hub, variant=args.variant
    )

if not args.skip_stage_b:
    # Stage B (decoder).
    decoder_state_dict = convert_stable_cascade_unet_single_file_to_diffusers(
        _read_checkpoint(decoder_checkpoint_path)
    )
    with ctx():
        decoder = StableCascadeUNet(
            in_channels=4,
            out_channels=4,
            timestep_ratio_embedding_dim=64,
            patch_size=2,
            conditioning_dim=1280,
            block_out_channels=[320, 640, 1280, 1280],
            down_num_layers_per_block=[2, 6, 28, 6],
            up_num_layers_per_block=[6, 28, 6, 2],
            down_blocks_repeat_mappers=[1, 1, 1, 1],
            up_blocks_repeat_mappers=[3, 3, 2, 2],
            num_attention_heads=[0, 0, 20, 20],
            block_types_per_layer=[
                ["SDCascadeResBlock", "SDCascadeTimestepBlock"],
                ["SDCascadeResBlock", "SDCascadeTimestepBlock"],
                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
                ["SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"],
            ],
            clip_text_pooled_in_channels=1280,
            clip_seq=4,
            effnet_in_channels=16,
            pixel_mapper_in_channels=3,
            kernel_size=3,
            dropout=[0, 0, 0.1, 0.1],
            self_attn=True,
            timestep_conditioning_type=["sca"],
        )
    _populate_weights(decoder, decoder_state_dict)

    # VQGAN reused from Wuerstchen-V2.
    vqmodel = PaellaVQModel.from_pretrained("warp-ai/wuerstchen", subfolder="vqgan")

    # Decoder pipeline
    decoder_pipeline = StableCascadeDecoderPipeline(
        decoder=decoder, text_encoder=text_encoder, tokenizer=tokenizer, vqgan=vqmodel, scheduler=scheduler
    )
    decoder_pipeline.to(dtype).save_pretrained(
        args.decoder_output_path, push_to_hub=args.push_to_hub, variant=args.variant
    )

if args.save_combined:
    # Stable Cascade combined pipeline (requires both stages converted above;
    # guaranteed by the flag validation at the top of the script).
    stable_cascade_pipeline = StableCascadeCombinedPipeline(
        # Decoder
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        decoder=decoder,
        scheduler=scheduler,
        vqgan=vqmodel,
        # Prior
        prior_text_encoder=text_encoder,
        prior_tokenizer=tokenizer,
        prior_prior=prior_model,
        prior_scheduler=scheduler,
        prior_image_encoder=image_encoder,
        prior_feature_extractor=feature_extractor,
    )
    stable_cascade_pipeline.to(dtype).save_pretrained(
        args.combined_output_path, push_to_hub=args.push_to_hub, variant=args.variant
    )
diffusers/scripts/convert_vae_pt_to_diffusers.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import io
3
+
4
+ import requests
5
+ import torch
6
+ import yaml
7
+
8
+ from diffusers import AutoencoderKL
9
+ from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
10
+ assign_to_checkpoint,
11
+ conv_attn_to_linear,
12
+ create_vae_diffusers_config,
13
+ renew_vae_attention_paths,
14
+ renew_vae_resnet_paths,
15
+ )
16
+ from diffusers.utils.constants import DIFFUSERS_REQUEST_TIMEOUT
17
+
18
+
19
def custom_convert_ldm_vae_checkpoint(checkpoint, config):
    """Remap an LDM-style VAE state dict to diffusers ``AutoencoderKL`` key names.

    Args:
        checkpoint: Original VAE state dict (LDM ``encoder.down.*`` /
            ``decoder.up.*`` naming). It is mutated in place: downsampler keys
            are popped, and ``assign_to_checkpoint`` consumes further entries.
        config: diffusers VAE config dict (as produced by
            ``create_vae_diffusers_config``), forwarded to ``assign_to_checkpoint``.

    Returns:
        A new state dict whose keys follow the diffusers layout
        (``down_blocks.*``, ``up_blocks.*``, ``mid_block.*``).
    """
    vae_state_dict = checkpoint

    new_checkpoint = {}

    # Top-level encoder convolutions map 1:1; only norm_out is renamed.
    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]

    # Same direct mapping for the decoder side.
    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]

    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]

    # Retrieves the keys for the encoder down blocks only
    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
    down_blocks = {
        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
    }

    # Retrieves the keys for the decoder up blocks only
    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
    up_blocks = {
        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
    }

    for i in range(num_down_blocks):
        # Resnet keys for this block: everything except downsamplers and attention.
        resnets = [
            key
            for key in down_blocks[i]
            if f"down.{i}" in key and f"down.{i}.downsample" not in key and "attn" not in key
        ]
        attentions = [key for key in down_blocks[i] in key] if False else [key for key in down_blocks[i] if f"down.{i}.attn" in key]

        # Downsampler keys are popped so assign_to_checkpoint won't see them again.
        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
                f"encoder.down.{i}.downsample.conv.weight"
            )
            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
                f"encoder.down.{i}.downsample.conv.bias"
            )

        paths = renew_vae_resnet_paths(resnets)
        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)

        paths = renew_vae_attention_paths(attentions)
        meta_path = {"old": f"down.{i}.attn", "new": f"down_blocks.{i}.attentions"}
        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)

    # Encoder mid block: two resnets (block_1 / block_2 in the original naming).
    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
    num_mid_res_blocks = 2
    for i in range(1, num_mid_res_blocks + 1):
        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]

        paths = renew_vae_resnet_paths(resnets)
        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)

    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
    paths = renew_vae_attention_paths(mid_attentions)
    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
    # Mid-block attention projections are stored as 1x1 convs; flatten to linear.
    conv_attn_to_linear(new_checkpoint)

    for i in range(num_up_blocks):
        # Decoder up blocks are stored in reverse order relative to diffusers.
        block_id = num_up_blocks - 1 - i
        resnets = [
            key
            for key in up_blocks[block_id]
            if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key and "attn" not in key
        ]
        attentions = [key for key in up_blocks[block_id] if f"up.{block_id}.attn" in key]

        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
                f"decoder.up.{block_id}.upsample.conv.weight"
            ]
            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
                f"decoder.up.{block_id}.upsample.conv.bias"
            ]

        paths = renew_vae_resnet_paths(resnets)
        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)

        paths = renew_vae_attention_paths(attentions)
        meta_path = {"old": f"up.{block_id}.attn", "new": f"up_blocks.{i}.attentions"}
        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)

    # Decoder mid block: same structure and renaming as the encoder mid block.
    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
    num_mid_res_blocks = 2
    for i in range(1, num_mid_res_blocks + 1):
        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]

        paths = renew_vae_resnet_paths(resnets)
        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)

    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
    paths = renew_vae_attention_paths(mid_attentions)
    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
    conv_attn_to_linear(new_checkpoint)
    return new_checkpoint
134
+
135
+
136
def vae_pt_to_vae_diffuser(
    checkpoint_path: str,
    output_path: str,
):
    """Convert a Stable Diffusion v1 VAE checkpoint to diffusers format.

    Downloads the reference v1 inference YAML config, remaps the checkpoint
    keys with ``custom_convert_ldm_vae_checkpoint``, and saves the resulting
    ``AutoencoderKL`` to ``output_path``.

    Args:
        checkpoint_path: Path to a ``.pt``/``.ckpt`` (expects a ``state_dict``
            entry) or a ``.safetensors`` VAE checkpoint.
        output_path: Directory where the converted diffusers VAE is saved.
    """
    # Only support V1
    # Fix: the URL previously had a stray leading space and only worked
    # because requests strips leading whitespace from URLs.
    r = requests.get(
        "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml",
        timeout=DIFFUSERS_REQUEST_TIMEOUT,
    )
    io_obj = io.BytesIO(r.content)

    original_config = yaml.safe_load(io_obj)
    image_size = 512
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if checkpoint_path.endswith("safetensors"):
        from safetensors import safe_open

        # safetensors files are read tensor-by-tensor into a plain dict.
        checkpoint = {}
        with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
            for key in f.keys():
                checkpoint[key] = f.get_tensor(key)
    else:
        checkpoint = torch.load(checkpoint_path, map_location=device)["state_dict"]

    # Convert the VAE model.
    vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
    converted_vae_checkpoint = custom_convert_ldm_vae_checkpoint(checkpoint, vae_config)

    vae = AutoencoderKL(**vae_config)
    vae.load_state_dict(converted_vae_checkpoint)
    vae.save_pretrained(output_path)
167
+
168
+
169
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("--vae_pt_path", default=None, type=str, required=True, help="Path to the VAE.pt to convert.")
    # Fix: help text was copy-pasted from --vae_pt_path; this flag is the output location.
    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to save the converted VAE to.")

    args = parser.parse_args()

    vae_pt_to_vae_diffuser(args.vae_pt_path, args.dump_path)
diffusers/scripts/convert_wuerstchen.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Run inside root directory of official source code: https://github.com/dome272/wuerstchen/
import os

import torch
from transformers import AutoTokenizer, CLIPTextModel
from vqgan import VQModel

from diffusers import (
    DDPMWuerstchenScheduler,
    WuerstchenCombinedPipeline,
    WuerstchenDecoderPipeline,
    WuerstchenPriorPipeline,
)
from diffusers.pipelines.wuerstchen import PaellaVQModel, WuerstchenDiffNeXt, WuerstchenPrior


def _convert_attn_keys(orig_state_dict):
    """Remap fused ``nn.MultiheadAttention`` projections to diffusers attention keys.

    The original checkpoints fuse q/k/v into ``attn.in_proj_weight``/``bias``;
    diffusers expects separate ``to_q``/``to_k``/``to_v`` tensors and
    ``to_out.0`` for the output projection. All other keys pass through
    unchanged. (Previously this loop was duplicated for stage B and stage C.)
    """
    state_dict = {}
    for key in orig_state_dict.keys():
        if key.endswith("in_proj_weight"):
            weights = orig_state_dict[key].chunk(3, 0)
            state_dict[key.replace("attn.in_proj_weight", "to_q.weight")] = weights[0]
            state_dict[key.replace("attn.in_proj_weight", "to_k.weight")] = weights[1]
            state_dict[key.replace("attn.in_proj_weight", "to_v.weight")] = weights[2]
        elif key.endswith("in_proj_bias"):
            weights = orig_state_dict[key].chunk(3, 0)
            state_dict[key.replace("attn.in_proj_bias", "to_q.bias")] = weights[0]
            state_dict[key.replace("attn.in_proj_bias", "to_k.bias")] = weights[1]
            state_dict[key.replace("attn.in_proj_bias", "to_v.bias")] = weights[2]
        elif key.endswith("out_proj.weight"):
            state_dict[key.replace("attn.out_proj.weight", "to_out.0.weight")] = orig_state_dict[key]
        elif key.endswith("out_proj.bias"):
            state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = orig_state_dict[key]
        else:
            state_dict[key] = orig_state_dict[key]
    return state_dict


model_path = "models/"
device = "cpu"

# VQGAN: load the original Paella VQModel first (original key names), then
# rename the codebook entry for the diffusers PaellaVQModel.
paella_vqmodel = VQModel()
state_dict = torch.load(os.path.join(model_path, "vqgan_f4_v1_500k.pt"), map_location=device)["state_dict"]
paella_vqmodel.load_state_dict(state_dict)

state_dict["vquantizer.embedding.weight"] = state_dict["vquantizer.codebook.weight"]
state_dict.pop("vquantizer.codebook.weight")
vqmodel = PaellaVQModel(num_vq_embeddings=paella_vqmodel.codebook_size, latent_channels=paella_vqmodel.c_latent)
vqmodel.load_state_dict(state_dict)

# Clip Text encoder and tokenizer (prior).
text_encoder = CLIPTextModel.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
tokenizer = AutoTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")

# Generator (decoder) text encoder and tokenizer.
gen_text_encoder = CLIPTextModel.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K").to("cpu")
gen_tokenizer = AutoTokenizer.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")

# Decoder (stage B).
orig_state_dict = torch.load(os.path.join(model_path, "model_v2_stage_b.pt"), map_location=device)["state_dict"]
decoder = WuerstchenDiffNeXt()
decoder.load_state_dict(_convert_attn_keys(orig_state_dict))

# Prior (stage C) — note: the EMA weights are used for this stage.
orig_state_dict = torch.load(os.path.join(model_path, "model_v3_stage_c.pt"), map_location=device)["ema_state_dict"]
prior_model = WuerstchenPrior(c_in=16, c=1536, c_cond=1280, c_r=64, depth=32, nhead=24).to(device)
prior_model.load_state_dict(_convert_attn_keys(orig_state_dict))

# scheduler
scheduler = DDPMWuerstchenScheduler()

# Prior pipeline
prior_pipeline = WuerstchenPriorPipeline(
    prior=prior_model, text_encoder=text_encoder, tokenizer=tokenizer, scheduler=scheduler
)

prior_pipeline.save_pretrained("warp-ai/wuerstchen-prior")

decoder_pipeline = WuerstchenDecoderPipeline(
    text_encoder=gen_text_encoder, tokenizer=gen_tokenizer, vqgan=vqmodel, decoder=decoder, scheduler=scheduler
)
decoder_pipeline.save_pretrained("warp-ai/wuerstchen")

# Wuerstchen pipeline
wuerstchen_pipeline = WuerstchenCombinedPipeline(
    # Decoder
    text_encoder=gen_text_encoder,
    tokenizer=gen_tokenizer,
    decoder=decoder,
    scheduler=scheduler,
    vqgan=vqmodel,
    # Prior
    prior_tokenizer=tokenizer,
    prior_text_encoder=text_encoder,
    prior=prior_model,
    prior_scheduler=scheduler,
)
wuerstchen_pipeline.save_pretrained("warp-ai/WuerstchenCombinedPipeline")
illustrious_generated/low_quality_images.json ADDED
The diff for this file is too large to render. See raw diff
 
illustrious_generated/natural_caption_generation_report.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ === Natural Caption Generation Report ===
3
+
4
+ Processing Statistics:
5
+ - Total images processed: 9618
6
+ - Successfully captioned: 9618
7
+ - Errors encountered: 0
8
+ - Success rate: 100.0%
9
+
10
+ Time Statistics:
11
+ - Total processing time: 533.1 minutes
12
+ - Average time per image: 3.33 seconds
13
+
14
+ Completion time: 2025-07-29 20:42:34
illustrious_generated/optimization_final_results.json ADDED
The diff for this file is too large to render. See raw diff
 
illustrious_generated/optimization_summary_report.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ === 图像质量优化总结报告 ===
3
+
4
+ 处理统计:
5
+ - 总图像数: 9618
6
+ - 检测到低质量图像: 181
7
+ - 重新生成处理: 100
8
+ - 成功改善质量: 17
9
+ - 改善成功率: 17.0%
10
+
11
+ 质量提升:
12
+ - 平均质量提升: 2.3 分
13
+ - 改善图像保存位置: /home/ubuntu/lyl/QwenIllustrious/illustrious_generated/improved
14
+
15
+ 详细结果文件:
16
+ - 低质量图像记录: low_quality_images.json
17
+ - 重新生成结果: regeneration_results.json
18
+ - 最终优化结果: optimization_final_results.json
19
+
20
+ 优化完成时间: 2025-07-29 08:24:41
illustrious_generated/regeneration_results.json ADDED
The diff for this file is too large to render. See raw diff
 
peft/.gitignore ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+ # VSCode
132
+ .vscode
133
+
134
+ # IntelliJ
135
+ .idea
136
+
137
+ # Mac .DS_Store
138
+ .DS_Store
139
+
140
+ # More test things
141
+ wandb
142
+
143
+ # method_comparison logs
144
+ method_comparison/MetaMathQA/cancelled_results/
145
+ method_comparison/MetaMathQA/temporary_results/
peft/.pre-commit-config.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.9.2
4
+ hooks:
5
+ - id: ruff
6
+ args:
7
+ - --fix
8
+ - id: ruff-format
9
+ - repo: https://github.com/pre-commit/pre-commit-hooks
10
+ rev: v4.6.0
11
+ hooks:
12
+ - id: check-merge-conflict
13
+ - id: check-yaml
peft/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
peft/Makefile ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: quality style test docs
2
+
3
+ check_dirs := src tests examples docs scripts docker
4
+
5
+ # Check that source code meets quality standards
6
+
7
+ # this target runs checks on all files
8
+ quality:
9
+ ruff check $(check_dirs)
10
+ ruff format --check $(check_dirs)
11
+ doc-builder style src/peft tests docs/source --max_len 119 --check_only
12
+
13
+ # Format source code automatically and check if there are any problems left that need manual fixing
14
+ style:
15
+ ruff check --fix $(check_dirs)
16
+ ruff format $(check_dirs)
17
+ doc-builder style src/peft tests docs/source --max_len 119
18
+
19
+ test:
20
+ python -m pytest -n 3 tests/ $(if $(IS_GITHUB_CI),--report-log "ci_tests.log",)
21
+
22
+ tests_examples_multi_gpu:
23
+ python -m pytest -m multi_gpu_tests tests/test_gpu_examples.py $(if $(IS_GITHUB_CI),--report-log "multi_gpu_examples.log",)
24
+
25
+ tests_examples_single_gpu:
26
+ python -m pytest -m single_gpu_tests tests/test_gpu_examples.py $(if $(IS_GITHUB_CI),--report-log "single_gpu_examples.log",)
27
+
28
+ tests_core_multi_gpu:
29
+ python -m pytest -m multi_gpu_tests tests/test_common_gpu.py $(if $(IS_GITHUB_CI),--report-log "core_multi_gpu.log",)
30
+
31
+ tests_core_single_gpu:
32
+ python -m pytest -m single_gpu_tests tests/test_common_gpu.py $(if $(IS_GITHUB_CI),--report-log "core_single_gpu.log",)
33
+
34
+ # exclude gemma tests, as generation fails with torch.compile, these failures
35
+ # trigger side effects that make other tests fail with 'RuntimeError: Offset
36
+ # increment outside graph capture encountered unexpectedly.'
37
+ # TODO re-enable gemma once/if it is fixed
38
+ tests_common_gpu:
39
+ python -m pytest tests/test_decoder_models.py -k "not gemma" $(if $(IS_GITHUB_CI),--report-log "common_decoder.log",)
40
+ python -m pytest tests/test_encoder_decoder_models.py $(if $(IS_GITHUB_CI),--report-log "common_encoder_decoder.log",)
41
+ python -m pytest tests/test_gptqmodel.py $(if $(IS_GITHUB_CI),--report-log "gptqmodel_gpu.log",)
42
+
43
+ tests_examples_multi_gpu_bnb:
44
+ python -m pytest -m "multi_gpu_tests and bitsandbytes" tests/test_gpu_examples.py $(if $(IS_GITHUB_CI),--report-log "multi_gpu_examples.log",)
45
+
46
+ tests_examples_single_gpu_bnb:
47
+ python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_gpu_examples.py $(if $(IS_GITHUB_CI),--report-log "single_gpu_examples.log",)
48
+
49
+ tests_core_multi_gpu_bnb:
50
+ python -m pytest -m "multi_gpu_tests and bitsandbytes" tests/test_common_gpu.py $(if $(IS_GITHUB_CI),--report-log "core_multi_gpu.log",)
51
+
52
+ tests_core_single_gpu_bnb:
53
+ python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_common_gpu.py $(if $(IS_GITHUB_CI),--report-log "core_single_gpu.log",)
54
+
55
+ tests_gpu_bnb_regression:
56
+ python -m pytest tests/bnb/test_bnb_regression.py $(if $(IS_GITHUB_CI),--report-log "bnb_regression_gpu.log",)
57
+
58
+ # For testing transformers tests for bnb runners
59
+ transformers_tests:
60
+ RUN_SLOW=1 python -m pytest transformers-clone/tests/quantization/bnb $(if $(IS_GITHUB_CI),--report-log "transformers_tests.log",)
61
+
62
+ tests_regression:
63
+ python -m pytest -s --regression tests/regression/ $(if $(IS_GITHUB_CI),--report-log "regression_tests.log",)
64
+
65
+ tests_torch_compile:
66
+ python -m pytest tests/test_torch_compile.py $(if $(IS_GITHUB_CI),--report-log "compile_tests.log",)
peft/README.md ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!---
2
+ Copyright 2023 The HuggingFace Team. All rights reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ -->
16
+
17
+ <h1 align="center"> <p>🤗 PEFT</p></h1>
18
+ <h3 align="center">
19
+ <p>State-of-the-art Parameter-Efficient Fine-Tuning (PEFT) methods</p>
20
+ </h3>
21
+
22
+ Fine-tuning large pretrained models is often prohibitively costly due to their scale. Parameter-Efficient Fine-Tuning (PEFT) methods enable efficient adaptation of large pretrained models to various downstream applications by only fine-tuning a small number of (extra) model parameters instead of all the model's parameters. This significantly decreases the computational and storage costs. Recent state-of-the-art PEFT techniques achieve performance comparable to fully fine-tuned models.
23
+
24
+ PEFT is integrated with Transformers for easy model training and inference, Diffusers for conveniently managing different adapters, and Accelerate for distributed training and inference for really big models.
25
+
26
+ > [!TIP]
27
+ > Visit the [PEFT](https://huggingface.co/PEFT) organization to read about the PEFT methods implemented in the library and to see notebooks demonstrating how to apply these methods to a variety of downstream tasks. Click the "Watch repos" button on the organization page to be notified of newly implemented methods and notebooks!
28
+
29
+ Check the PEFT Adapters API Reference section for a list of supported PEFT methods, and read the [Adapters](https://huggingface.co/docs/peft/en/conceptual_guides/adapter), [Soft prompts](https://huggingface.co/docs/peft/en/conceptual_guides/prompting), and [IA3](https://huggingface.co/docs/peft/en/conceptual_guides/ia3) conceptual guides to learn more about how these methods work.
30
+
31
+ ## Quickstart
32
+
33
+ Install PEFT from pip:
34
+
35
+ ```bash
36
+ pip install peft
37
+ ```
38
+
39
+ Prepare a model for training with a PEFT method such as LoRA by wrapping the base model and PEFT configuration with `get_peft_model`. For the bigscience/mt0-large model, you're only training 0.19% of the parameters!
40
+
41
+ ```python
42
+ from transformers import AutoModelForCausalLM
43
+ from peft import LoraConfig, TaskType, get_peft_model
44
+
45
+ device = "cuda"
46
+ model_id = "Qwen/Qwen2.5-3B-Instruct"
47
+ model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device)
48
+ peft_config = LoraConfig(
49
+ r=16,
50
+ lora_alpha=32,
51
+ task_type=TaskType.CAUSAL_LM,
52
+ # target_modules=["q_proj", "v_proj", ...] # optionally indicate target modules
53
+ )
54
+ model = get_peft_model(model, peft_config)
55
+ model.print_trainable_parameters()
56
+ # prints: trainable params: 3,686,400 || all params: 3,089,625,088 || trainable%: 0.1193
57
+
58
+ # now perform training on your dataset, e.g. using transformers Trainer, then save the model
59
+ model.save_pretrained("qwen2.5-3b-lora")
60
+ ```
61
+
62
+ To load a PEFT model for inference:
63
+
64
+ ```python
65
+ from transformers import AutoModelForCausalLM, AutoTokenizer
66
+ from peft import PeftModel
67
+
68
+ device = "cuda"
69
+ model_id = "Qwen/Qwen2.5-3B-Instruct"
70
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
71
+ model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device)
72
+ model = PeftModel.from_pretrained(model, "qwen2.5-3b-lora")
73
+
74
+ inputs = tokenizer("Preheat the oven to 350 degrees and place the cookie dough", return_tensors="pt")
75
+ outputs = model.generate(**inputs.to(device), max_new_tokens=50)
76
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
77
+
78
+ # prints something like: Preheat the oven to 350 degrees and place the cookie dough in a baking dish [...]
79
+ ```
80
+
81
+ ## Why you should use PEFT
82
+
83
+ There are many benefits of using PEFT but the main one is the huge savings in compute and storage, making PEFT applicable to many different use cases.
84
+
85
+ ### High performance on consumer hardware
86
+
87
+ Consider the memory requirements for training the following models on the [ought/raft/twitter_complaints](https://huggingface.co/datasets/ought/raft/viewer/twitter_complaints) dataset with an A100 80GB GPU with more than 64GB of CPU RAM.
88
+
89
+ | Model | Full Finetuning | PEFT-LoRA PyTorch | PEFT-LoRA DeepSpeed with CPU Offloading |
90
+ | --------- | ---- | ---- | ---- |
91
+ | bigscience/T0_3B (3B params) | 47.14GB GPU / 2.96GB CPU | 14.4GB GPU / 2.96GB CPU | 9.8GB GPU / 17.8GB CPU |
92
+ | bigscience/mt0-xxl (12B params) | OOM GPU | 56GB GPU / 3GB CPU | 22GB GPU / 52GB CPU |
93
+ | bigscience/bloomz-7b1 (7B params) | OOM GPU | 32GB GPU / 3.8GB CPU | 18.1GB GPU / 35GB CPU |
94
+
95
+ With LoRA you can fully finetune a 12B parameter model that would've otherwise run out of memory on the 80GB GPU, and comfortably fit and train a 3B parameter model. When you look at the 3B parameter model's performance, it is comparable to a fully finetuned model at a fraction of the GPU memory.
96
+
97
+ | Submission Name | Accuracy |
98
+ | --------- | ---- |
99
+ | Human baseline (crowdsourced) | 0.897 |
100
+ | Flan-T5 | 0.892 |
101
+ | lora-t0-3b | 0.863 |
102
+
103
+ > [!TIP]
104
+ > The bigscience/T0_3B model performance isn't optimized in the table above. You can squeeze even more performance out of it by playing around with the input instruction templates, LoRA hyperparameters, and other training related hyperparameters. The final checkpoint size of this model is just 19MB compared to 11GB of the full bigscience/T0_3B model. Learn more about the advantages of finetuning with PEFT in this [blog post](https://www.philschmid.de/fine-tune-flan-t5-peft).
105
+
106
+ ### Quantization
107
+
108
+ Quantization is another method for reducing the memory requirements of a model by representing the data in a lower precision. It can be combined with PEFT methods to make it even easier to train and load LLMs for inference.
109
+
110
+ * Learn how to finetune [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) with QLoRA and the [TRL](https://huggingface.co/docs/trl/index) library on a 16GB GPU in the [Finetune LLMs on your own consumer hardware using tools from PyTorch and Hugging Face ecosystem](https://pytorch.org/blog/finetune-llms/) blog post.
111
+ * Learn how to finetune a [openai/whisper-large-v2](https://huggingface.co/openai/whisper-large-v2) model for multilingual automatic speech recognition with LoRA and 8-bit quantization in this [notebook](https://colab.research.google.com/drive/1DOkD_5OUjFa0r5Ik3SgywJLJtEo2qLxO?usp=sharing) (see this [notebook](https://colab.research.google.com/drive/1vhF8yueFqha3Y3CpTHN6q9EVcII9EYzs?usp=sharing) instead for an example of streaming a dataset).
112
+
113
+ ### Save compute and storage
114
+
115
+ PEFT can help you save storage by avoiding full finetuning of models for each downstream task or dataset. In many cases, you're only finetuning a very small fraction of a model's parameters and each checkpoint is only a few MBs in size (instead of GBs). These smaller PEFT adapters demonstrate performance comparable to a fully finetuned model. If you have many datasets, you can save a lot of storage with a PEFT model and not have to worry about catastrophic forgetting or overfitting the backbone or base model.
116
+
117
+ ## PEFT integrations
118
+
119
+ PEFT is widely supported across the Hugging Face ecosystem because of the massive efficiency it brings to training and inference.
120
+
121
+ ### Diffusers
122
+
123
+ The iterative diffusion process consumes a lot of memory which can make it difficult to train. PEFT can help reduce the memory requirements and reduce the storage size of the final model checkpoint. For example, consider the memory required for training a Stable Diffusion model with LoRA on an A100 80GB GPU with more than 64GB of CPU RAM. The final model checkpoint size is only 8.8MB!
124
+
125
+ | Model | Full Finetuning | PEFT-LoRA | PEFT-LoRA with Gradient Checkpointing |
126
+ | --------- | ---- | ---- | ---- |
127
+ | CompVis/stable-diffusion-v1-4 | 27.5GB GPU / 3.97GB CPU | 15.5GB GPU / 3.84GB CPU | 8.12GB GPU / 3.77GB CPU |
128
+
129
+ > [!TIP]
130
+ > Take a look at the [examples/lora_dreambooth/train_dreambooth.py](examples/lora_dreambooth/train_dreambooth.py) training script to try training your own Stable Diffusion model with LoRA, and play around with the [smangrul/peft-lora-sd-dreambooth](https://huggingface.co/spaces/smangrul/peft-lora-sd-dreambooth) Space which is running on a T4 instance. Learn more about the PEFT integration in Diffusers in this [tutorial](https://huggingface.co/docs/peft/main/en/tutorial/peft_integrations#diffusers).
131
+
132
+ ### Transformers
133
+
134
+ PEFT is directly integrated with [Transformers](https://huggingface.co/docs/transformers/main/en/peft). After loading a model, call `add_adapter` to add a new PEFT adapter to the model:
135
+
136
+ ```python
137
+ from peft import LoraConfig
138
+ model = ... # transformers model
139
+ peft_config = LoraConfig(...)
140
+ model.add_adapter(peft_config, adapter_name="lora_1")
141
+ ```
142
+
143
+ To load a trained PEFT adapter, call `load_adapter`:
144
+
145
+ ```python
146
+ model = ... # transformers model
147
+ model.load_adapter(<path-to-adapter>, adapter_name="lora_1")
148
+ ```
149
+
150
+ And to switch between different adapters, call `set_adapter`:
151
+
152
+ ```python
153
+ model.set_adapter("lora_2")
154
+ ```
155
+
156
+ The Transformers integration doesn't include all the functionalities offered in PEFT, such as methods for merging the adapter into the base model.
157
+
158
+ ### Accelerate
159
+
160
+ [Accelerate](https://huggingface.co/docs/accelerate/index) is a library for distributed training and inference on various training setups and hardware (GPUs, TPUs, Apple Silicon, etc.). PEFT models work with Accelerate out of the box, making it really convenient to train really large models or use them for inference on consumer hardware with limited resources.
161
+
162
+ ### TRL
163
+
164
+ PEFT can also be applied to training LLMs with RLHF components such as the ranker and policy. Get started by reading:
165
+
166
+ * [Fine-tune a Mistral-7b model with Direct Preference Optimization](https://towardsdatascience.com/fine-tune-a-mistral-7b-model-with-direct-preference-optimization-708042745aac) with PEFT and the [TRL](https://huggingface.co/docs/trl/index) library to learn more about the Direct Preference Optimization (DPO) method and how to apply it to an LLM.
167
+ * [Fine-tuning 20B LLMs with RLHF on a 24GB consumer GPU](https://huggingface.co/blog/trl-peft) with PEFT and the [TRL](https://huggingface.co/docs/trl/index) library, and then try out the [gpt2-sentiment_peft.ipynb](https://github.com/huggingface/trl/blob/main/examples/notebooks/gpt2-sentiment.ipynb) notebook to optimize GPT2 to generate positive movie reviews.
168
+ * [StackLLaMA: A hands-on guide to train LLaMA with RLHF](https://huggingface.co/blog/stackllama) with PEFT, and then try out the [stack_llama/scripts](https://github.com/huggingface/trl/tree/main/examples/research_projects/stack_llama/scripts) for supervised finetuning, reward modeling, and RL finetuning.
169
+
170
+ ## Model support
171
+
172
+ Use this [Space](https://stevhliu-peft-methods.hf.space) or check out the [docs](https://huggingface.co/docs/peft/main/en/index) to find which models officially support a PEFT method out of the box. Even if you don't see a model listed below, you can manually configure the model config to enable PEFT for a model. Read the [New transformers architecture](https://huggingface.co/docs/peft/main/en/developer_guides/custom_models#new-transformers-architectures) guide to learn how.
173
+
174
+ ## Contribute
175
+
176
+ If you would like to contribute to PEFT, please check out our [contribution guide](https://huggingface.co/docs/peft/developer_guides/contributing).
177
+
178
+ ## Citing 🤗 PEFT
179
+
180
+ To use 🤗 PEFT in your publication, please cite it by using the following BibTeX entry.
181
+
182
+ ```bibtex
183
+ @Misc{peft,
184
+ title = {PEFT: State-of-the-art Parameter-Efficient Fine-Tuning methods},
185
+ author = {Sourab Mangrulkar and Sylvain Gugger and Lysandre Debut and Younes Belkada and Sayak Paul and Benjamin Bossan},
186
+ howpublished = {\url{https://github.com/huggingface/peft}},
187
+ year = {2022}
188
+ }
189
+ ```
peft/pyproject.toml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.black]
2
+ # Only used by `hf-doc-builder`.
3
+ line-length = 119
4
+ target-version = ['py38']
5
+
6
+ [tool.ruff]
7
+ target-version = "py39"
8
+ line-length = 119
9
+ extend-exclude = ["*.ipynb"]
10
+
11
+ [tool.ruff.lint]
12
+ preview = true
13
+ explicit-preview-rules = true
14
+ extend-select = [
15
+ "C", # Complexity
16
+ "E", # PEP8 errors
17
+ "F", # PEP8 formatting
18
+ "I", # Import sorting
19
+ "UP", # Pyupgrade upgrades
20
+ "W", # PEP8 warnings
21
+ "PT009", # Pytest assertions
22
+ "RUF022", # Sorting of __all__
23
+ ]
24
+ ignore = [
25
+ "C901", # Function too complex
26
+ "E501", # Line length (handled by ruff-format)
27
+ "F841", # unused variable
28
+ "UP007", # X | Y style Unions
29
+ "C420", # dict.fromkeys
30
+ ]
31
+
32
+ [tool.ruff.lint.isort]
33
+ lines-after-imports = 2
34
+ known-first-party = ["peft"]
35
+
36
+ [tool.pytest]
37
+ doctest_optionflags = [
38
+ "NORMALIZE_WHITESPACE",
39
+ "ELLIPSIS",
40
+ "NUMBER",
41
+ ]
42
+
43
+ [tool.pytest.ini_options]
44
+ addopts = "--cov=src/peft --cov-report=term-missing --durations=10"
45
+ markers = [
46
+ "single_gpu_tests: tests that run on a single GPU",
47
+ "multi_gpu_tests: tests that run on multiple GPUs",
48
+ "regression: whether to run regression suite test",
49
+ "bitsandbytes: select bitsandbytes integration tests"
50
+ ]
peft/requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate
2
+ torch
3
+ safetensors
4
+ bitsandbytes
5
+ scipy
6
+ peft
7
+ transformers
8
+ tqdm
9
+ packaging
10
+ pytest
11
+ numpy
12
+ pyyaml
13
+ datasets
14
+ psutil
15
+ setuptools
peft/setup.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from setuptools import find_packages, setup
16
+
17
+
18
+ VERSION = "0.16.1.dev0"
19
+
20
+ extras = {}
21
+ extras["quality"] = [
22
+ "black", # doc-builder has an implicit dependency on Black, see huggingface/doc-builder#434
23
+ "hf-doc-builder",
24
+ "ruff~=0.9.2",
25
+ ]
26
+ extras["docs_specific"] = [
27
+ "black", # doc-builder has an implicit dependency on Black, see huggingface/doc-builder#434
28
+ "hf-doc-builder",
29
+ ]
30
+ extras["dev"] = extras["quality"] + extras["docs_specific"]
31
+ extras["test"] = extras["dev"] + [
32
+ "pytest",
33
+ "pytest-cov",
34
+ "pytest-xdist",
35
+ "parameterized",
36
+ "datasets",
37
+ "diffusers",
38
+ "scipy",
39
+ "protobuf",
40
+ "sentencepiece",
41
+ ]
42
+
43
+ setup(
44
+ name="peft",
45
+ version=VERSION,
46
+ description="Parameter-Efficient Fine-Tuning (PEFT)",
47
+ license_files=["LICENSE"],
48
+ long_description=open("README.md", encoding="utf-8").read(),
49
+ long_description_content_type="text/markdown",
50
+ keywords="deep learning",
51
+ license="Apache",
52
+ author="The HuggingFace team",
53
+ author_email="benjamin@huggingface.co",
54
+ url="https://github.com/huggingface/peft",
55
+ package_dir={"": "src"},
56
+ packages=find_packages("src"),
57
+ package_data={"peft": ["py.typed", "tuners/boft/fbd/fbd_cuda.cpp", "tuners/boft/fbd/fbd_cuda_kernel.cu"]},
58
+ entry_points={},
59
+ python_requires=">=3.9.0",
60
+ install_requires=[
61
+ "numpy>=1.17",
62
+ "packaging>=20.0",
63
+ "psutil",
64
+ "pyyaml",
65
+ "torch>=1.13.0",
66
+ "transformers",
67
+ "tqdm",
68
+ "accelerate>=0.21.0",
69
+ "safetensors",
70
+ "huggingface_hub>=0.25.0",
71
+ ],
72
+ extras_require=extras,
73
+ classifiers=[
74
+ "Development Status :: 5 - Production/Stable",
75
+ "Intended Audience :: Developers",
76
+ "Intended Audience :: Education",
77
+ "Intended Audience :: Science/Research",
78
+ "License :: OSI Approved :: Apache Software License",
79
+ "Operating System :: OS Independent",
80
+ "Programming Language :: Python :: 3",
81
+ "Programming Language :: Python :: 3.9",
82
+ "Programming Language :: Python :: 3.10",
83
+ "Programming Language :: Python :: 3.11",
84
+ "Programming Language :: Python :: 3.12",
85
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
86
+ ],
87
+ )
88
+
89
+ # Release checklist
90
+ # 1. Change the version in __init__.py and setup.py to the release version, e.g. from "0.6.1.dev0" to "0.7.0"
91
+ # 2. Check if there are any deprecations that need to be addressed for this release by searching for "# TODO" in the code
92
+ # 3. Commit these changes with the message: "Release: VERSION", create a PR and merge it.
93
+ # 4. Add a tag in git to mark the release: "git tag -a v<VERSION> -m 'Adds tag <VERSION> for pypi' "
94
+ # Push the tag to git:
95
+ # git push --tags origin main
96
+ # It is necessary to work on the original repository, not on a fork.
97
+ # 5. Run the following commands in the top-level directory:
98
+ # python setup.py bdist_wheel
99
+ # python setup.py sdist
100
+ # Ensure that you are on the clean and up-to-date main branch (git status --untracked-files=no should not list any
101
+ # files and show the main branch)
102
+ # 6. Upload the package to the pypi test server first:
103
+ # twine upload dist/* -r pypitest
104
+ # 7. Check that you can install it in a virtualenv by running:
105
+ # pip install -i https://testpypi.python.org/pypi --extra-index-url https://pypi.org/simple peft
106
+ # 8. Upload the final version to actual pypi:
107
+ # twine upload dist/* -r pypi
108
+ # 9. Add release notes to the tag on https://github.com/huggingface/peft/releases once everything is looking hunky-dory.
109
+ # Check the notes here: https://docs.google.com/document/d/1k-sOIfykuKjWcOIALqjhFKz4amFEp-myeJUJEzNgjoU/edit?usp=sharing
110
+ # 10. Update the version in __init__.py, setup.py to the bumped patch version + ".dev0" (e.g. from "0.7.0" to "0.7.1.dev0")
sentence-transformers/.gitignore ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Distribution / packaging
2
+ .Python
3
+ build/
4
+ develop-eggs/
5
+ dist/
6
+ downloads/
7
+ eggs/
8
+ .eggs/
9
+ lib/
10
+ lib64/
11
+ parts/
12
+ sdist/
13
+ var/
14
+ wheels/
15
+ share/python-wheels/
16
+ *.egg-info/
17
+ .installed.cfg
18
+ *.egg
19
+ MANIFEST
20
+
21
+ # Docs
22
+ /docs/_build/
23
+ /docs/make.bat
24
+
25
+ # Editors
26
+ .idea
27
+ .vscode
28
+
29
+ # Coverage
30
+ htmlcov
31
+ .coverage*
32
+ coverage.xml
33
+
34
+ # Examples
35
+ /examples/**/output/*
36
+ /examples/datasets/
37
+ /examples/embeddings/
38
+ /examples/sentence_transformer/training/quora_duplicate_questions/quora-IR-dataset/
39
+ examples/datasets/*/
40
+
41
+
42
+ # Specific files and folders
43
+ /pretrained-models/
44
+ /cheatsheet.txt
45
+ /testsuite.txt
46
+ /TODO.txt
47
+
48
+ # Virtual environments
49
+ .env
50
+ .venv
51
+ env/
52
+ venv/
53
+
54
+ # Database
55
+ /qdrant_storage
56
+ /elastic-start-local
57
+
58
+ # Others
59
+ *.pyc
60
+ *.gz
61
+ *.tsv
62
+ tmp_*.py
63
+ nr_*/
64
+ wandb
65
+ checkpoints
66
+ tmp
67
+ .DS_Store
68
+ /runs
69
+ /tmp_trainer/