NZUONG commited on
Commit
1913ec5
·
verified ·
1 Parent(s): 7083e95

Upload 27 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/sample.wav filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,24 @@
1
- ---
2
- title: Mude
3
- emoji: 🐢
4
- colorFrom: pink
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.42.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+
3
+ title: AI Audio Editor
4
+
5
+ emoji: 🎵
6
+
7
+ colorFrom: blue
8
+
9
+ colorTo: purple
10
+
11
+ sdk: gradio
12
+
13
+ sdk_version: 4.0.0
14
+
15
+ app_file: app.py
16
+
17
+ pinned: false
18
+
19
+ license: mit
20
+
21
+ ---
22
+
23
+
24
+
app.py ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import torchaudio
4
+ import gradio as gr
5
+ import matplotlib.pyplot as plt
6
+ from tqdm import tqdm
7
+ from transformers import UMT5EncoderModel, AutoTokenizer
8
+ from huggingface_hub import hf_hub_download, snapshot_download
9
+ import json
10
+ import numpy as np
11
+ import tempfile
12
+ from io import BytesIO
13
+ import warnings
14
+ warnings.filterwarnings("ignore")
15
+
16
+ # Import model components
17
+ from model.ae.music_dcae import MusicDCAE
18
+ from model.ldm.editing_unet import EditingUNet
19
+ from model.ldm.dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
20
+
21
# Configuration
# Prefer the GPU when one is visible; otherwise run everything on CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# bfloat16 only when the hardware supports it — float32 is the safe fallback.
DTYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32

# Model repository - UPDATE THIS TO YOUR MODEL REPO
MODEL_REPO = "NZUONG/mude"  # Your uploaded model repository

# DDPM Parameters: linear beta schedule used to rebuild the training
# noise schedule at sampling time.
DDPM_NUM_TIMESTEPS = 1000
DDPM_BETA_START = 0.0001
DDPM_BETA_END = 0.02
32
+
33
class AttrDict(dict):
    """Dict whose keys are also accessible as attributes.

    Aliasing the instance ``__dict__`` to the dict itself makes
    ``d.key`` and ``d["key"]`` address the same storage.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.__dict__ = self
37
+
38
def download_models():
    """Fetch the model repository from the Hugging Face Hub.

    Downloads every file of ``MODEL_REPO`` into ``./checkpoints``.

    Returns:
        bool: True when the snapshot download succeeded, False otherwise.
    """
    print("🔄 Downloading models from Hugging Face Hub...")

    # Make sure the target directory exists before snapshot_download writes into it.
    os.makedirs("checkpoints", exist_ok=True)

    try:
        # One call pulls the entire repository snapshot.
        snapshot_path = snapshot_download(
            repo_id=MODEL_REPO,
            cache_dir="./cache",
            local_dir="./checkpoints",
            repo_type="model",
        )
        print(f"✅ Models downloaded to: {snapshot_path}")
        return True
    except Exception as e:
        # Best-effort: report and let the caller decide how to proceed.
        print(f"❌ Error downloading models: {e}")
        return False
58
+
59
class AudioEditor:
    """Instruction-guided audio editor.

    Holds the DCAE autoencoder, the UMT5 text encoder and the editing UNet,
    and runs DPM-Solver++ sampling to edit a 10-second audio clip according
    to a natural-language instruction. Models are loaded lazily on first use
    and shuttled between CPU and GPU during processing to limit peak VRAM.
    """

    def __init__(self):
        # All models are loaded lazily by load_models().
        self.dcae = None          # MusicDCAE: mel <-> latent <-> waveform
        self.tokenizer = None     # UMT5 tokenizer
        self.text_encoder = None  # UMT5EncoderModel
        self.model = None         # EditingUNet denoiser
        self.is_loaded = False

    def load_models(self):
        """Load all models once at startup.

        Downloads checkpoints from the Hub when they are missing locally.

        Returns:
            bool: True when every model loaded successfully.
        """
        if self.is_loaded:
            return True

        # Download models if not present
        if not os.path.exists("checkpoints/music_dcae_f8c8"):
            print("📥 Models not found locally, downloading...")
            if not download_models():
                return False

        print("🔄 Loading models...")

        try:
            # Model paths
            dcae_path = "checkpoints/music_dcae_f8c8"
            vocoder_path = "checkpoints/music_vocoder"
            t5_path = "checkpoints/umt5-base"
            unet_config_path = "model/ldm/exp_config.json"
            trained_model_path = "checkpoints/fm_checkpoint_epoch_9.pt"

            # Load DCAE (autoencoder + vocoder)
            self.dcae = MusicDCAE(
                dcae_checkpoint_path=dcae_path,
                vocoder_checkpoint_path=vocoder_path
            ).to(DEVICE).eval()

            # Load text encoder
            self.tokenizer = AutoTokenizer.from_pretrained(t5_path)
            self.text_encoder = UMT5EncoderModel.from_pretrained(t5_path).to(DEVICE, dtype=DTYPE).eval()

            # Load UNet config
            with open(unet_config_path, 'r') as f:
                unet_config = AttrDict(json.load(f)['model']['unet'])

            # UNet starts on CPU; it is moved to DEVICE only while sampling.
            self.model = EditingUNet(unet_config, use_flow_matching=False).to("cpu", dtype=DTYPE).eval()

            # Load checkpoint; strip torch.compile's `_orig_mod.` prefix if present.
            checkpoint = torch.load(trained_model_path, map_location="cpu")
            model_state_dict = checkpoint.get('model_state_dict', checkpoint)
            if any(key.startswith('_orig_mod.') for key in model_state_dict.keys()):
                model_state_dict = {key.replace('_orig_mod.', ''): value
                                    for key, value in model_state_dict.items()}
            self.model.load_state_dict(model_state_dict, strict=False)

            self.is_loaded = True
            print("✅ All models loaded successfully!")
            return True

        except Exception as e:
            print(f"❌ Error loading models: {e}")
            return False

    def dpm_solver_sampling(self, model, source_latent, instruction_embedding, uncond_embedding,
                            strength=1.0, steps=25, guidance_scale=7.5, seed=42):
        """Run DPM-Solver++ sampling from a partially-noised source latent.

        Args:
            model: EditingUNet noise-prediction network (DDPM objective).
            source_latent: encoded latent of the source audio.
            instruction_embedding: UMT5 embedding of the edit instruction.
            uncond_embedding: UMT5 embedding of the empty string (CFG null).
            strength: fraction of the noise schedule to traverse (1.0 = full).
            steps: number of solver steps.
            guidance_scale: classifier-free guidance weight.
            seed: RNG seed for the initial noise.

        Returns:
            torch.Tensor: the denoised (edited) latent.
        """
        print(f"🚀 Starting DPM-Solver++ sampling with {steps} steps...")

        # Setup noise schedule (linear betas, matching training).
        # BUG FIX: the original referenced the misspelled DDMP_NUM_TIMESTEPS,
        # which raised NameError at runtime; the constant is DDPM_NUM_TIMESTEPS.
        betas = torch.linspace(DDPM_BETA_START, DDPM_BETA_END, DDPM_NUM_TIMESTEPS, dtype=torch.float32)
        alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
        noise_schedule = NoiseScheduleVP(schedule='discrete', alphas_cumprod=alphas_cumprod)

        # Setup model wrapper with classifier-free guidance.
        model_fn = model_wrapper(
            model,
            noise_schedule,
            model_type="noise",  # DDPM objective only
            model_kwargs={
                "source_latent": source_latent,
            },
            guidance_type="classifier-free",
            condition=instruction_embedding,
            unconditional_condition=uncond_embedding,
            guidance_scale=guidance_scale,
        )

        # Initialize DPM-Solver++
        dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")

        # Calculate time range: strength controls how far back toward pure
        # noise we start (1.0 = the full schedule).
        t_end = noise_schedule.T / noise_schedule.total_N
        t_start = t_end + strength * (noise_schedule.T - t_end)

        # Add initial noise (seeded for reproducibility).
        torch.manual_seed(seed)
        noise = torch.randn_like(source_latent)
        latents = dpm_solver.add_noise(source_latent, torch.tensor([t_start], device=DEVICE), noise)
        latents = latents.to(DTYPE)

        # Run DPM solver sampling.
        # FIX: autocast now targets the actual device instead of hard-coded "cuda".
        with torch.amp.autocast(device_type=DEVICE, dtype=DTYPE, enabled=(DTYPE != torch.float32)):
            with torch.no_grad():
                final_latent, _ = dpm_solver.sample(
                    latents,
                    steps=steps,
                    t_start=t_start,
                    t_end=t_end,
                    order=2,
                    method="multistep",
                    skip_type="time_uniform",
                    lower_order_final=True,
                    return_intermediate=True,
                )

        return final_latent

    def process_audio(self, audio_file, instruction, guidance_scale, steps, strength, seed):
        """Edit an audio file according to a text instruction.

        The clip is resampled to 44.1 kHz, trimmed/padded to exactly 10 s,
        upmixed to stereo, encoded to a latent, edited via DPM-Solver++,
        and decoded back to a waveform.

        Returns:
            tuple: (output_wav_path | None, comparison_png_path | None, status message).
        """
        try:
            if not self.load_models():
                return None, None, "❌ Failed to load models. Please try again."

            # Load and preprocess audio
            print(f"🎵 Processing audio: {audio_file}")
            audio, sr = torchaudio.load(audio_file)
            TARGET_SR_DCAE = 44100
            TARGET_LEN_DCAE = TARGET_SR_DCAE * 10  # fixed 10-second window

            if sr != TARGET_SR_DCAE:
                audio = torchaudio.transforms.Resample(sr, TARGET_SR_DCAE)(audio)

            # Trim or zero-pad to exactly 10 s.
            if audio.shape[1] > TARGET_LEN_DCAE:
                audio = audio[:, :TARGET_LEN_DCAE]
            elif audio.shape[1] < TARGET_LEN_DCAE:
                audio = torch.nn.functional.pad(audio, (0, TARGET_LEN_DCAE - audio.shape[1]))

            # Mono -> stereo by channel duplication.
            if audio.shape[0] == 1:
                audio = audio.repeat(2, 1)

            # Encode audio to the normalised latent space.
            with torch.no_grad():
                source_latent_scaled, _ = self.dcae.encode(audio.to(DEVICE).unsqueeze(0))

            # Prepare text embeddings (conditional + unconditional for CFG).
            with torch.no_grad(), torch.amp.autocast(device_type=DEVICE, dtype=DTYPE, enabled=(DTYPE != torch.float32)):
                text_input = self.tokenizer([instruction], max_length=32, padding="max_length",
                                            truncation=True, return_tensors="pt")
                instruction_embedding = self.text_encoder(text_input.input_ids.to(DEVICE))[0]

                uncond_input = self.tokenizer([""], max_length=32, padding="max_length",
                                              truncation=True, return_tensors="pt")
                uncond_embedding = self.text_encoder(uncond_input.input_ids.to(DEVICE))[0]

            # Swap models: DCAE off the device, UNet on, to limit peak VRAM.
            self.dcae = self.dcae.cpu()
            torch.cuda.empty_cache()
            self.model = self.model.to(DEVICE, dtype=DTYPE)

            # Generate
            print("🎨 Generating edited audio...")
            with torch.amp.autocast(device_type=DEVICE, dtype=DTYPE, enabled=(DTYPE != torch.float32)):
                with torch.no_grad():
                    final_latent = self.dpm_solver_sampling(
                        model=self.model,
                        source_latent=source_latent_scaled,
                        instruction_embedding=instruction_embedding,
                        uncond_embedding=uncond_embedding,
                        strength=strength,
                        steps=int(steps),
                        guidance_scale=guidance_scale,
                        seed=int(seed)
                    )

            # Swap back for decoding.
            self.model = self.model.cpu()
            torch.cuda.empty_cache()
            self.dcae = self.dcae.to(DEVICE)

            # Undo latent normalisation for the mel-only decode path.
            final_latent_unscaled = (final_latent.float() / self.dcae.scale_factor) + self.dcae.shift_factor
            source_latent_raw = (source_latent_scaled / self.dcae.scale_factor) + self.dcae.shift_factor

            with torch.no_grad():
                source_mel = self.dcae.decode_to_mel(source_latent_raw)
                edited_mel = self.dcae.decode_to_mel(final_latent_unscaled)
                # decode() expects the *scaled* latent; it un-scales internally.
                _, pred_wavs = self.dcae.decode(latents=final_latent.float(), sr=44100)
                edited_audio = pred_wavs[0]

            # Create comparison plot
            comparison_plot = self.create_mel_comparison(source_mel, edited_mel, instruction)

            # Save output audio
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                torchaudio.save(tmp_file.name, edited_audio.cpu().float(), 44100)
                output_path = tmp_file.name

            # Cleanup
            self.dcae = self.dcae.cpu()
            torch.cuda.empty_cache()

            return output_path, comparison_plot, f"✅ Audio editing completed! Instruction: '{instruction}'"

        except Exception as e:
            import traceback
            error_msg = f"❌ Error: {str(e)}\n{traceback.format_exc()}"
            print(error_msg)
            return None, None, error_msg

    def create_mel_comparison(self, source_mel, edited_mel, instruction):
        """Render original vs. edited mel-spectrograms into a PNG.

        Args:
            source_mel: mel tensor of the original audio (batch, ch, mel, time).
            edited_mel: mel tensor of the edited audio, same layout.
            instruction: instruction text shown in the plot title.

        Returns:
            str | None: path to the saved PNG, or None on failure.
        """
        try:
            # Plot the first channel of the first batch item.
            source_mel_np = source_mel.squeeze(0)[0].cpu().float().numpy()
            edited_mel_np = edited_mel.squeeze(0)[0].cpu().float().numpy()

            fig, axs = plt.subplots(2, 1, figsize=(12, 8), sharex=True, sharey=True)
            fig.suptitle(f'Mel-Spectrogram Comparison', fontsize=14)

            # Plot source
            im1 = axs[0].imshow(source_mel_np, aspect='auto', origin='lower', cmap='viridis')
            axs[0].set_title('Original Audio')
            axs[0].set_ylabel('Mel Bins')
            plt.colorbar(im1, ax=axs[0])

            # Plot edited
            im2 = axs[1].imshow(edited_mel_np, aspect='auto', origin='lower', cmap='viridis')
            axs[1].set_title(f'Edited Audio: "{instruction}"')
            axs[1].set_ylabel('Mel Bins')
            axs[1].set_xlabel('Time Frames')
            plt.colorbar(im2, ax=axs[1])

            plt.tight_layout()

            # Save to temporary file for Gradio
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
                plt.savefig(tmp_file.name, dpi=100, bbox_inches='tight')
                plt.close()
                return tmp_file.name

        except Exception as e:
            print(f"Error creating plot: {e}")
            plt.close()
            return None
298
+
299
+ # Initialize the audio editor
300
+ audio_editor = AudioEditor()
301
+
302
def gradio_interface(audio_file, instruction, guidance_scale, steps, strength, seed):
    """Validate inputs, then delegate to the shared AudioEditor instance.

    Returns:
        tuple: (audio_path | None, plot_path | None, status message).
    """
    # Guard clauses: reject missing inputs before touching any model.
    if audio_file is None:
        return None, None, "Please upload an audio file"
    if not instruction.strip():
        return None, None, "Please provide an editing instruction"

    return audio_editor.process_audio(
        audio_file, instruction, guidance_scale, steps, strength, seed
    )
311
+
312
# ---------------------------------------------------------------------------
# Gradio UI definition + launch entry point
# ---------------------------------------------------------------------------
with gr.Blocks(title="🎵 AI Audio Editor", theme=gr.themes.Soft()) as demo:
    # Page header.
    gr.HTML("""
<div style="text-align: center; margin-bottom: 20px;">
<h1>🎵 AI Audio Editor</h1>
<p>Upload an audio file and provide instructions to edit it using AI.<br/>
The model uses DPM-Solver++ for fast, high-quality generation.</p>
</div>
""")

    with gr.Row():
        # Left column: inputs and generation controls.
        with gr.Column(scale=1):
            audio_input = gr.Audio(label="📁 Upload Audio File", type="filepath")

            instruction_input = gr.Textbox(
                label="✏️ Editing Instruction",
                placeholder="e.g., 'Add drums', 'Make it more energetic', 'Remove vocals'",
                lines=2,
            )

            # Sampling hyper-parameters, hidden behind an accordion.
            with gr.Accordion("🔧 Advanced Settings", open=False):
                guidance_scale = gr.Slider(
                    minimum=1.0, maximum=20.0, value=7.5, step=0.5,
                    label="Guidance Scale",
                    info="Higher values follow the instruction more closely",
                )
                steps = gr.Slider(
                    minimum=10, maximum=50, value=25, step=5,
                    label="Sampling Steps",
                    info="More steps = better quality, slower generation",
                )
                strength = gr.Slider(
                    minimum=0.1, maximum=1.0, value=1.0, step=0.1,
                    label="Denoising Strength",
                    info="1.0 = full denoising, lower = more conservative editing",
                )
                seed = gr.Number(
                    value=42, label="Seed",
                    info="For reproducible results",
                )

            generate_btn = gr.Button("🎨 Generate Edited Audio", variant="primary", size="lg")

        # Right column: status and results.
        with gr.Column(scale=1):
            status_output = gr.Textbox(label="📊 Status", interactive=False)
            audio_output = gr.Audio(label="🎵 Generated Audio")
            plot_output = gr.Image(label="📈 Mel-Spectrogram Comparison")

    # Usage notes footer.
    gr.HTML("""
<div style="margin-top: 20px; padding: 20px; background-color: #f0f0f0; border-radius: 10px;">
<h3>📝 Usage Tips:</h3>
<ul>
<li><b>Audio Length:</b> Files are automatically processed to 10 seconds</li>
<li><b>Instructions:</b> Be specific (e.g., "Add heavy drums" vs "Add drums")</li>
<li><b>Guidance Scale:</b> Start with 7.5, increase for stronger effects</li>
<li><b>Steps:</b> 25 steps provide good quality/speed balance</li>
</ul>
</div>
""")

    # Wire the button to the processing function.
    generate_btn.click(
        fn=gradio_interface,
        inputs=[audio_input, instruction_input, guidance_scale, steps, strength, seed],
        outputs=[audio_output, plot_output, status_output],
        show_progress=True,
    )

# Launch only when executed as a script (not when imported).
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )
examples/sample.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b94ed3260e322a90dc10b88a3fd1c4d1ad5da50a7f40d62d976d7a59a495eee9
3
+ size 3528078
model/__pycache__/scheduler.cpython-310.pyc ADDED
Binary file (4.22 kB). View file
 
model/ae/__pycache__/music_dcae.cpython-310.pyc ADDED
Binary file (4.71 kB). View file
 
model/ae/__pycache__/music_log_mel.cpython-310.pyc ADDED
Binary file (2.95 kB). View file
 
model/ae/__pycache__/music_vocoder.cpython-310.pyc ADDED
Binary file (15.7 kB). View file
 
model/ae/music_dcae.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from diffusers import AutoencoderDC
4
+ import torchaudio
5
+ import torchvision.transforms as transforms
6
+
7
+ from diffusers.models.modeling_utils import ModelMixin
8
+ from diffusers.loaders import FromOriginalModelMixin
9
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
10
+ from tqdm import tqdm
11
+
12
+ try:
13
+ from .music_vocoder import ADaMoSHiFiGANV1
14
+ except ImportError:
15
+ from music_vocoder import ADaMoSHiFiGANV1
16
+
17
# Default checkpoint locations, resolved relative to the package root so the
# module works regardless of the current working directory.
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
DEFAULT_PRETRAINED_PATH = os.path.join(root_dir, "checkpoints", "music_dcae_f8c8")
VOCODER_PRETRAINED_PATH = os.path.join(root_dir, "checkpoints", "music_vocoder")
# Removed: stray debug `print(DEFAULT_PRETRAINED_PATH)` — importing a module
# should not produce console output.
22
+
23
class MusicDCAE(ModelMixin, ConfigMixin, FromOriginalModelMixin):
    """Deep-compression autoencoder for music.

    Pipeline: waveform -> log-mel (via the vocoder's mel transform) -> DCAE
    latent, and back through the DCAE decoder + HiFi-GAN vocoder. Latents are
    affinely normalised with ``scale_factor``/``shift_factor`` so diffusion
    models downstream operate in a roughly unit-scale space.
    """

    @register_to_config
    def __init__(
        self,
        source_sample_rate=None,
        # BUG FIX: the defaults were machine-specific Windows paths
        # ("D:\\do an\\checkpoints\\...") written with invalid escape
        # sequences (\d, \c, \m) that break on any other machine. The
        # repo-relative constants (left commented beside them) were the
        # clear intent, and match the layout app.py downloads into.
        dcae_checkpoint_path=DEFAULT_PRETRAINED_PATH,
        vocoder_checkpoint_path=VOCODER_PRETRAINED_PATH,
    ):
        super(MusicDCAE, self).__init__()

        self.dcae = AutoencoderDC.from_pretrained(dcae_checkpoint_path)
        self.vocoder = ADaMoSHiFiGANV1.from_pretrained(vocoder_checkpoint_path)

        if source_sample_rate is None:
            source_sample_rate = 48000

        # Everything internal runs at 44.1 kHz.
        self.resampler = torchaudio.transforms.Resample(source_sample_rate, 44100)

        # Maps mels from [0, 1] to [-1, 1] ((x - 0.5) / 0.5).
        self.transform = transforms.Compose(
            [
                transforms.Normalize(0.5, 0.5),
            ]
        )
        # Empirical log-mel dynamic range used for min/max normalisation.
        self.min_mel_value = -11.0
        self.max_mel_value = 3.0
        # 1024 mel frames x 512 hop at 44.1 kHz, expressed in 48 kHz samples.
        self.audio_chunk_size = int(round((1024 * 512 / 44100 * 48000)))
        self.mel_chunk_size = 1024
        # The DCAE downsamples the mel time axis by 8x.
        self.time_dimention_multiple = 8
        self.latent_chunk_size = self.mel_chunk_size // self.time_dimention_multiple
        # Latent normalisation: latent_norm = (latent - shift) * scale.
        self.scale_factor = 0.1786
        self.shift_factor = -1.9091

    def load_audio(self, audio_path):
        """Load an audio file and upmix mono to stereo by duplication.

        Returns:
            (audio, sr): a (2, T) waveform tensor and its sample rate.
        """
        audio, sr = torchaudio.load(audio_path)
        if audio.shape[0] == 1:
            audio = audio.repeat(2, 1)
        return audio, sr

    def forward_mel(self, audios):
        """Compute the vocoder's log-mel spectrogram for each batch item."""
        mels = []
        for i in range(len(audios)):
            image = self.vocoder.mel_transform(audios[i])
            mels.append(image)
        mels = torch.stack(mels)
        return mels

    @torch.no_grad()
    def encode(self, audios, audio_lengths=None, sr=None):
        """Encode stereo waveforms into normalised DCAE latents.

        Args:
            audios: (N, 2, T) batch of stereo waveforms.
            audio_lengths: per-item sample counts; defaults to the full length.
            sr: input sample rate; defaults to 48000 (uses the cached resampler).

        Returns:
            (latents, latent_lengths): normalised latents and their valid
            frame counts per item.
        """
        if audio_lengths is None:
            audio_lengths = torch.tensor([audios.shape[2]] * audios.shape[0])
            audio_lengths = audio_lengths.to(audios.device)

        # audios: N x 2 x T, 48kHz
        device = audios.device
        dtype = audios.dtype

        if sr is None:
            sr = 48000
            resampler = self.resampler
        else:
            # Build an ad-hoc resampler for non-default input rates.
            resampler = torchaudio.transforms.Resample(sr, 44100).to(device).to(dtype)

        audio = resampler(audios)

        # Pad so the mel length divides evenly by the DCAE stride
        # (8 latent frames x 512-sample hop).
        max_audio_len = audio.shape[-1]
        if max_audio_len % (8 * 512) != 0:
            audio = torch.nn.functional.pad(
                audio, (0, 8 * 512 - max_audio_len % (8 * 512))
            )

        mels = self.forward_mel(audio)
        # Min/max-normalise to [0, 1], then to [-1, 1] via self.transform.
        mels = (mels - self.min_mel_value) / (self.max_mel_value - self.min_mel_value)
        mels = self.transform(mels)
        latents = []
        for mel in mels:
            latent = self.dcae.encoder(mel.unsqueeze(0))
            latents.append(latent)
        latents = torch.cat(latents, dim=0)
        latent_lengths = (
            audio_lengths / sr * 44100 / 512 / self.time_dimention_multiple
        ).long()
        # Normalise latents for downstream diffusion.
        latents = (latents - self.shift_factor) * self.scale_factor
        return latents, latent_lengths

    @torch.no_grad()
    def decode(self, latents, audio_lengths=None, sr=None):
        """Decode normalised latents to waveforms via mel + vocoder.

        Args:
            latents: normalised latents as produced by :meth:`encode`.
            audio_lengths: optional per-item sample counts used to trim output.
            sr: optional output sample rate; defaults to 44100 (no resampling).

        Returns:
            (sr, pred_wavs): the output sample rate and a list of (2, T)
            stereo waveform tensors on CPU.
        """
        # Undo latent normalisation.
        latents = latents / self.scale_factor + self.shift_factor

        pred_wavs = []

        for latent in latents:
            mels = self.dcae.decoder(latent.unsqueeze(0))
            # Map [-1, 1] -> [0, 1] -> original log-mel range.
            mels = mels * 0.5 + 0.5
            mels = mels * (self.max_mel_value - self.min_mel_value) + self.min_mel_value

            # Decode each channel separately to reduce VRAM footprint.
            wav_ch1 = self.vocoder.decode(mels[:, 0, :, :]).squeeze(1).cpu()
            wav_ch2 = self.vocoder.decode(mels[:, 1, :, :]).squeeze(1).cpu()
            wav = torch.cat([wav_ch1, wav_ch2], dim=0)

            if sr is not None:
                resampler = (
                    torchaudio.transforms.Resample(44100, sr)
                )
                wav = resampler(wav.cpu().float())
            else:
                sr = 44100
            pred_wavs.append(wav)

        if audio_lengths is not None:
            pred_wavs = [
                wav[:, :length].cpu() for wav, length in zip(pred_wavs, audio_lengths)
            ]
        return sr, pred_wavs

    @torch.no_grad()
    def decode_to_mel(self, latents):
        """Decode *raw* (un-normalised) latents into log-mel spectrograms.

        Unlike :meth:`decode`, this expects latents that have already been
        un-scaled by the caller (latent / scale + shift).

        Args:
            latents (torch.Tensor): batch of raw latent tensors.

        Returns:
            torch.Tensor: decoded mel-spectrograms, concatenated over batch.
        """
        # Convert to float32 to match the decoder parameters' dtype.
        latents = latents.float()

        # Process each latent individually, mirroring decode().
        mels_list = []
        for latent in latents:
            mel = self.dcae.decoder(latent.unsqueeze(0))
            mel = mel * 0.5 + 0.5
            mel = mel * (self.max_mel_value - self.min_mel_value) + self.min_mel_value
            mels_list.append(mel)

        if len(mels_list) == 1:
            return mels_list[0]
        else:
            return torch.cat(mels_list, dim=0)
169
+
model/ae/music_log_mel.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ACE-Step: A Step Towards Music Generation Foundation Model
3
+
4
+ https://github.com/ace-step/ACE-Step
5
+
6
+ Apache 2.0 License
7
+ """
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from torch import Tensor
12
+ from torchaudio.transforms import MelScale
13
+
14
+
15
class LinearSpectrogram(nn.Module):
    """Magnitude spectrogram via STFT with centered reflect padding.

    With ``mode="pow2_sqrt"`` the output is sqrt(|STFT|^2 + 1e-6), i.e. a
    numerically-stabilised magnitude; otherwise the raw real/imag pair from
    ``torch.view_as_real`` is returned.
    """

    def __init__(
        self,
        n_fft=2048,
        win_length=2048,
        hop_length=512,
        center=False,
        mode="pow2_sqrt",
    ):
        super().__init__()

        self.n_fft = n_fft
        self.win_length = win_length
        self.hop_length = hop_length
        self.center = center
        self.mode = mode

        # Hann window registered as a buffer so it moves with the module.
        self.register_buffer("window", torch.hann_window(win_length))

    def forward(self, y: Tensor) -> Tensor:
        # Accept (B, 1, T) by dropping the channel axis.
        if y.ndim == 3:
            y = y.squeeze(1)

        # Manual reflect padding so frames stay aligned with hop boundaries.
        left = (self.win_length - self.hop_length) // 2
        right = (self.win_length - self.hop_length + 1) // 2
        y = torch.nn.functional.pad(
            y.unsqueeze(1), (left, right), mode="reflect"
        ).squeeze(1)

        in_dtype = y.dtype
        # STFT in float32 for numerical stability, cast back afterwards.
        stft_out = torch.stft(
            y.float(),
            self.n_fft,
            hop_length=self.hop_length,
            win_length=self.win_length,
            window=self.window,
            center=self.center,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True,
        )
        spec = torch.view_as_real(stft_out)

        if self.mode == "pow2_sqrt":
            spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

        return spec.to(in_dtype)
65
+
66
+
67
class LogMelSpectrogram(nn.Module):
    """Log-compressed mel spectrogram built on :class:`LinearSpectrogram`.

    The linear magnitude spectrogram is projected onto a Slaney-normalised
    mel filterbank, then log-compressed with a 1e-5 floor.
    """

    def __init__(
        self,
        sample_rate=44100,
        n_fft=2048,
        win_length=2048,
        hop_length=512,
        n_mels=128,
        center=False,
        f_min=0.0,
        f_max=None,
    ):
        super().__init__()

        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.win_length = win_length
        self.hop_length = hop_length
        self.center = center
        self.n_mels = n_mels
        self.f_min = f_min
        # Default the upper band edge to Nyquist.
        self.f_max = f_max or sample_rate // 2

        self.spectrogram = LinearSpectrogram(n_fft, win_length, hop_length, center)
        self.mel_scale = MelScale(
            self.n_mels,
            self.sample_rate,
            self.f_min,
            self.f_max,
            self.n_fft // 2 + 1,
            "slaney",
            "slaney",
        )

    def compress(self, x: Tensor) -> Tensor:
        """Log-compress with a small floor to avoid log(0)."""
        return torch.log(torch.clamp(x, min=1e-5))

    def decompress(self, x: Tensor) -> Tensor:
        """Inverse of :meth:`compress` (ignoring the clamp)."""
        return torch.exp(x)

    def forward(self, x: Tensor, return_linear: bool = False) -> Tensor:
        linear = self.spectrogram(x)
        mel = self.compress(self.mel_scale(linear))
        if return_linear:
            # Also hand back the compressed linear spectrogram.
            return mel, self.compress(linear)
        return mel
model/ae/music_vocoder.py ADDED
@@ -0,0 +1,587 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ACE-Step: A Step Towards Music Generation Foundation Model
3
+
4
+ https://github.com/ace-step/ACE-Step
5
+
6
+ Apache 2.0 License
7
+ """
8
+
9
+ import librosa
10
+ import torch
11
+ from torch import nn
12
+
13
+ from functools import partial
14
+ from math import prod
15
+ from typing import Callable, Tuple, List
16
+
17
+ import numpy as np
18
+ import torch.nn.functional as F
19
+ from torch.nn import Conv1d
20
+ from torch.nn.utils import weight_norm
21
+ from torch.nn.utils.parametrize import remove_parametrizations as remove_weight_norm
22
+ from diffusers.models.modeling_utils import ModelMixin
23
+ from diffusers.loaders import FromOriginalModelMixin
24
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
25
+
26
+
27
+ try:
28
+ from music_log_mel import LogMelSpectrogram
29
+ except ImportError:
30
+ from .music_log_mel import LogMelSpectrogram
31
+
32
+
33
def drop_path(
    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
):
    """Drop paths (Stochastic Depth) per sample, for residual-block main paths.

    Each sample in the batch is independently zeroed with probability
    ``drop_prob``; surviving samples are rescaled by 1/keep_prob when
    ``scale_by_keep`` so the expected value is preserved. A no-op outside
    training or when ``drop_prob`` is zero.
    """
    if not training or drop_prob == 0.0:
        return x

    keep_prob = 1 - drop_prob
    # One Bernoulli draw per sample, broadcast over the remaining dims.
    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
    if keep_prob > 0.0 and scale_by_keep:
        mask.div_(keep_prob)
    return x * mask
56
+
57
+
58
class DropPath(nn.Module):
    """Module wrapper around :func:`drop_path` (per-sample stochastic depth)."""

    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        # self.training toggles the drop behaviour (train vs eval).
        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)

    def extra_repr(self):
        return f"drop_prob={round(self.drop_prob,3):0.3f}"
71
+
72
+
73
class LayerNorm(nn.Module):
    r"""LayerNorm supporting ``channels_last`` (default) and ``channels_first``.

    channels_last normalises the trailing dimension with ``F.layer_norm``;
    channels_first normalises dim 1 manually, with the affine parameters
    broadcast along the remaining axes.
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        if data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        if self.data_format == "channels_first":
            # Manual normalisation over the channel axis (dim 1).
            mean = x.mean(1, keepdim=True)
            var = (x - mean).pow(2).mean(1, keepdim=True)
            normed = (x - mean) / torch.sqrt(var + self.eps)
            return self.weight[:, None] * normed + self.bias[:, None]
        return F.layer_norm(
            x, self.normalized_shape, self.weight, self.bias, self.eps
        )
101
+
102
+
103
class ConvNeXtBlock(nn.Module):
    r"""1-D ConvNeXt block: depthwise conv -> LayerNorm -> MLP, with
    optional layer-scale (``gamma``) and stochastic depth.

    Implemented in channels-last form for the MLP (permute to (N, L, C),
    apply linear layers, permute back), which is slightly faster in PyTorch.

    Args:
        dim (int): Number of input channels.
        drop_path (float): Stochastic depth rate. Default: 0.0
        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
        kernel_size (int): Kernel size for depthwise conv. Default: 7.
        dilation (int): Dilation for depthwise conv. Default: 1.
    """

    def __init__(
        self,
        dim: int,
        drop_path: float = 0.0,
        layer_scale_init_value: float = 1e-6,
        mlp_ratio: float = 4.0,
        kernel_size: int = 7,
        dilation: int = 1,
    ):
        super().__init__()

        # Depthwise conv: mixes along time only, one group per channel.
        self.dwconv = nn.Conv1d(
            dim,
            dim,
            kernel_size=kernel_size,
            padding=int(dilation * (kernel_size - 1) / 2),
            groups=dim,
        )
        self.norm = LayerNorm(dim, eps=1e-6)
        # Pointwise (1x1) convs expressed as linear layers on (N, L, C).
        self.pwconv1 = nn.Linear(dim, int(mlp_ratio * dim))
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(int(mlp_ratio * dim), dim)
        # Optional per-channel layer-scale parameter.
        self.gamma = (
            nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
            if layer_scale_init_value > 0
            else None
        )
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

    def forward(self, x, apply_residual: bool = True):
        shortcut = x

        h = self.dwconv(x)
        h = h.permute(0, 2, 1)  # (N, C, L) -> (N, L, C)
        h = self.pwconv2(self.act(self.pwconv1(self.norm(h))))

        if self.gamma is not None:
            h = self.gamma * h

        h = h.permute(0, 2, 1)  # (N, L, C) -> (N, C, L)
        h = self.drop_path(h)

        return shortcut + h if apply_residual else h
169
+
170
+
171
class ParallelConvNeXtBlock(nn.Module):
    """Several ConvNeXt blocks with different kernel sizes, run in parallel.

    Each branch is evaluated without its internal residual; the branch outputs
    plus the input itself are summed, realising one shared residual connection
    across all parallel branches.
    """

    def __init__(self, kernel_sizes: List[int], *args, **kwargs):
        super().__init__()
        self.blocks = nn.ModuleList(
            [
                ConvNeXtBlock(kernel_size=size, *args, **kwargs)
                for size in kernel_sizes
            ]
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        branch_outputs = [branch(x, apply_residual=False) for branch in self.blocks]
        branch_outputs.append(x)
        return torch.stack(branch_outputs, dim=1).sum(dim=1)
186
+
187
+
188
class ConvNeXtEncoder(nn.Module):
    """1-D ConvNeXt feature encoder: a conv stem followed by ``len(depths)`` stages.

    :param input_channels: channels of the (N, C, L) input tensor.
    :param depths: number of ConvNeXt blocks in each stage.
    :param dims: channel width of each stage (same length as ``depths``).
    :param drop_path_rate: max stochastic-depth rate, ramped linearly over blocks.
    :param layer_scale_init_value: layer-scale init passed to each block.
    :param kernel_sizes: single size -> plain ConvNeXtBlock; several sizes ->
        ParallelConvNeXtBlock with one branch per kernel size.
    """

    def __init__(
        self,
        input_channels=3,
        depths=[3, 3, 9, 3],
        dims=[96, 192, 384, 768],
        drop_path_rate=0.0,
        layer_scale_init_value=1e-6,
        kernel_sizes: Tuple[int] = (7,),
    ):
        super().__init__()
        assert len(depths) == len(dims)

        # channel_layers[i] adapts the channel count before stage i:
        # conv stem first, then LayerNorm + 1x1 conv between stages.
        self.channel_layers = nn.ModuleList()
        stem = nn.Sequential(
            nn.Conv1d(
                input_channels,
                dims[0],
                kernel_size=7,
                padding=3,
                padding_mode="replicate",
            ),
            LayerNorm(dims[0], eps=1e-6, data_format="channels_first"),
        )
        self.channel_layers.append(stem)

        for i in range(len(depths) - 1):
            mid_layer = nn.Sequential(
                LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
                nn.Conv1d(dims[i], dims[i + 1], kernel_size=1),
            )
            self.channel_layers.append(mid_layer)

        block_fn = (
            partial(ConvNeXtBlock, kernel_size=kernel_sizes[0])
            if len(kernel_sizes) == 1
            else partial(ParallelConvNeXtBlock, kernel_sizes=kernel_sizes)
        )

        self.stages = nn.ModuleList()
        # linear stochastic-depth schedule across ALL blocks of the network
        drop_path_rates = [
            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
        ]

        cur = 0
        for i in range(len(depths)):
            stage = nn.Sequential(
                *[
                    block_fn(
                        dim=dims[i],
                        drop_path=drop_path_rates[cur + j],
                        layer_scale_init_value=layer_scale_init_value,
                    )
                    for j in range(depths[i])
                ]
            )
            self.stages.append(stage)
            cur += depths[i]

        self.norm = LayerNorm(dims[-1], eps=1e-6, data_format="channels_first")
        self.apply(self._init_weights)

    def _init_weights(self, m):
        # truncated-normal weights and zero bias for every conv / linear layer
        if isinstance(m, (nn.Conv1d, nn.Linear)):
            nn.init.trunc_normal_(m.weight, std=0.02)
            nn.init.constant_(m.bias, 0)

    def forward(
        self,
        x: torch.Tensor,
    ) -> torch.Tensor:
        """Map (N, input_channels, L) to normalised (N, dims[-1], L) features."""
        for channel_layer, stage in zip(self.channel_layers, self.stages):
            x = channel_layer(x)
            x = stage(x)

        return self.norm(x)
264
+
265
+
266
def init_weights(m, mean=0.0, std=0.01):
    """Initialise conv-layer weights in place from N(mean, std).

    Intended for ``module.apply``; modules whose class name does not
    contain "Conv" are left untouched.
    """
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)
270
+
271
+
272
def get_padding(kernel_size, dilation=1):
    """Padding that keeps a stride-1 dilated conv length-preserving (odd kernels)."""
    return dilation * (kernel_size - 1) // 2
274
+
275
+
276
class ResBlock1(torch.nn.Module):
    """HiFi-GAN residual block built from weight-normalised 1-D convolutions.

    For each entry in ``dilation``, one residual branch applies
    SiLU -> dilated conv -> SiLU -> undilated conv and adds the result
    back onto the running signal.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()

        def make_conv(d):
            # stride-1, length-preserving, weight-normalised conv
            return weight_norm(
                Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    1,
                    dilation=d,
                    padding=get_padding(kernel_size, d),
                )
            )

        self.convs1 = nn.ModuleList([make_conv(d) for d in dilation])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([make_conv(1) for _ in dilation])
        self.convs2.apply(init_weights)

    def forward(self, x):
        for dilated, plain in zip(self.convs1, self.convs2):
            residual = plain(F.silu(dilated(F.silu(x))))
            x = x + residual
        return x

    def remove_weight_norm(self):
        """Strip weight norm from every conv (call before inference export)."""
        for conv in [*self.convs1, *self.convs2]:
            remove_weight_norm(conv)
366
+
367
+
368
class HiFiGANGenerator(nn.Module):
    """HiFi-GAN generator: upsampling ConvTranspose stack with multi-receptive-
    field fusion resblocks, mapping features (N, num_mels, T) to a waveform
    (N, 1, T * hop_length).

    :param hop_length: total upsampling factor; must equal prod(upsample_rates).
    :param upsample_rates: per-layer upsampling factors.
    :param upsample_kernel_sizes: transposed-conv kernel sizes, one per layer.
    :param resblock_kernel_sizes: kernel sizes of the parallel ResBlock1 set.
    :param resblock_dilation_sizes: dilation tuples, one per resblock kernel.
    :param num_mels: number of input feature channels.
    :param upsample_initial_channel: channels before the first upsample
        (halved at every layer).
    :param use_template: if True, inject an external excitation template
        (downsampled by noise_convs) after every upsample layer.
    :param pre_conv_kernel_size: kernel of the input conv.
    :param post_conv_kernel_size: kernel of the output conv.
    :param post_activation: activation factory applied before the output conv.
    """

    def __init__(
        self,
        *,
        hop_length: int = 512,
        upsample_rates: Tuple[int] = (8, 8, 2, 2, 2),
        upsample_kernel_sizes: Tuple[int] = (16, 16, 8, 2, 2),
        resblock_kernel_sizes: Tuple[int] = (3, 7, 11),
        resblock_dilation_sizes: Tuple[Tuple[int]] = ((1, 3, 5), (1, 3, 5), (1, 3, 5)),
        num_mels: int = 128,
        upsample_initial_channel: int = 512,
        use_template: bool = True,
        pre_conv_kernel_size: int = 7,
        post_conv_kernel_size: int = 7,
        post_activation: Callable = partial(nn.SiLU, inplace=True),
    ):
        super().__init__()

        assert (
            prod(upsample_rates) == hop_length
        ), f"hop_length must be {prod(upsample_rates)}"

        self.conv_pre = weight_norm(
            nn.Conv1d(
                num_mels,
                upsample_initial_channel,
                pre_conv_kernel_size,
                1,
                padding=get_padding(pre_conv_kernel_size),
            )
        )

        self.num_upsamples = len(upsample_rates)
        self.num_kernels = len(resblock_kernel_sizes)

        self.noise_convs = nn.ModuleList()
        self.use_template = use_template
        self.ups = nn.ModuleList()

        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            c_cur = upsample_initial_channel // (2 ** (i + 1))
            self.ups.append(
                weight_norm(
                    nn.ConvTranspose1d(
                        upsample_initial_channel // (2**i),
                        upsample_initial_channel // (2 ** (i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )

            if not use_template:
                continue

            # Template-injection conv: strided so the template is brought down
            # to this layer's temporal resolution (product of remaining rates).
            if i + 1 < len(upsample_rates):
                stride_f0 = np.prod(upsample_rates[i + 1 :])
                self.noise_convs.append(
                    Conv1d(
                        1,
                        c_cur,
                        kernel_size=stride_f0 * 2,
                        stride=stride_f0,
                        padding=stride_f0 // 2,
                    )
                )
            else:
                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))

        # num_kernels parallel resblocks per upsample layer (MRF fusion)
        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(ResBlock1(ch, k, d))

        self.activation_post = post_activation()
        self.conv_post = weight_norm(
            nn.Conv1d(
                ch,
                1,
                post_conv_kernel_size,
                1,
                padding=get_padding(post_conv_kernel_size),
            )
        )
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)

    def forward(self, x, template=None):
        """Generate a waveform from features.

        :param x: input features (N, num_mels, T).
        :param template: excitation signal (N, 1, T * hop_length); required
            when ``use_template`` is True, ignored otherwise.
        :return: waveform (N, 1, T * hop_length) in [-1, 1].
        """
        x = self.conv_pre(x)

        for i in range(self.num_upsamples):
            x = F.silu(x, inplace=True)
            x = self.ups[i](x)

            if self.use_template:
                x = x + self.noise_convs[i](template)

            # average the parallel multi-kernel resblock outputs
            xs = None

            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)

            x = xs / self.num_kernels

        x = self.activation_post(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        """Strip weight norm from all convolutions (before inference export)."""
        for up in self.ups:
            remove_weight_norm(up)
        for block in self.resblocks:
            block.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
490
+
491
+
492
class ADaMoSHiFiGANV1(ModelMixin, ConfigMixin, FromOriginalModelMixin):
    """ConvNeXt-backbone + HiFi-GAN-head neural vocoder (diffusers-style model).

    ``encode`` turns a waveform into log-mel features; ``decode``/``forward``
    turn mel features back into a waveform. The constructor arguments are
    captured by ``register_to_config`` for serialisation.
    """

    @register_to_config
    def __init__(
        self,
        input_channels: int = 128,
        depths: List[int] = [3, 3, 9, 3],
        dims: List[int] = [128, 256, 384, 512],
        drop_path_rate: float = 0.0,
        kernel_sizes: Tuple[int] = (7,),
        upsample_rates: Tuple[int] = (4, 4, 2, 2, 2, 2, 2),
        upsample_kernel_sizes: Tuple[int] = (8, 8, 4, 4, 4, 4, 4),
        resblock_kernel_sizes: Tuple[int] = (3, 7, 11, 13),
        resblock_dilation_sizes: Tuple[Tuple[int]] = (
            (1, 3, 5),
            (1, 3, 5),
            (1, 3, 5),
            (1, 3, 5),
        ),
        num_mels: int = 512,
        upsample_initial_channel: int = 1024,
        use_template: bool = False,
        pre_conv_kernel_size: int = 13,
        post_conv_kernel_size: int = 13,
        sampling_rate: int = 44100,
        n_fft: int = 2048,
        win_length: int = 2048,
        hop_length: int = 512,
        f_min: int = 40,
        f_max: int = 16000,
        n_mels: int = 128,
    ):
        super().__init__()

        # mel features -> high-level representation
        self.backbone = ConvNeXtEncoder(
            input_channels=input_channels,
            depths=depths,
            dims=dims,
            drop_path_rate=drop_path_rate,
            kernel_sizes=kernel_sizes,
        )

        # representation -> waveform
        self.head = HiFiGANGenerator(
            hop_length=hop_length,
            upsample_rates=upsample_rates,
            upsample_kernel_sizes=upsample_kernel_sizes,
            resblock_kernel_sizes=resblock_kernel_sizes,
            resblock_dilation_sizes=resblock_dilation_sizes,
            num_mels=num_mels,
            upsample_initial_channel=upsample_initial_channel,
            use_template=use_template,
            pre_conv_kernel_size=pre_conv_kernel_size,
            post_conv_kernel_size=post_conv_kernel_size,
        )
        self.sampling_rate = sampling_rate
        # waveform -> log-mel features (used by `encode`); LogMelSpectrogram is
        # defined elsewhere in this file — presumably torchaudio-based, verify.
        self.mel_transform = LogMelSpectrogram(
            sample_rate=sampling_rate,
            n_fft=n_fft,
            win_length=win_length,
            hop_length=hop_length,
            f_min=f_min,
            f_max=f_max,
            n_mels=n_mels,
        )
        # inference-only by default
        self.eval()

    @torch.no_grad()
    def decode(self, mel):
        """Mel features (N, C, T) -> waveform, without gradients."""
        y = self.backbone(mel)
        y = self.head(y)
        return y

    @torch.no_grad()
    def encode(self, x):
        """Waveform -> log-mel features, without gradients."""
        return self.mel_transform(x)

    def forward(self, mel):
        """Differentiable mel -> waveform pass (same computation as decode)."""
        y = self.backbone(mel)
        y = self.head(y)
        return y
572
+
573
+
574
if __name__ == "__main__":
    # Smoke test: load the pretrained vocoder, round-trip a wav file through
    # mel encode -> decode, and write the reconstruction to disk.
    import soundfile as sf

    x = "test_audio.wav"
    model = ADaMoSHiFiGANV1.from_pretrained(
        "./checkpoints/music_vocoder", local_files_only=True
    )

    # NOTE(review): `librosa` is used here but is not imported in the visible
    # file header — confirm the module imports it.
    wav, sr = librosa.load(x, sr=44100, mono=True)
    wav = torch.from_numpy(wav).float()[None]
    mel = model.encode(wav)

    # decode returns (N, 1, T); [0].mT transposes to (T, 1) for soundfile
    wav = model.decode(mel)[0].mT
    sf.write("test_audio_vocoder_rec.wav", wav.cpu().numpy(), 44100)
model/ldm/__pycache__/attention.cpython-310.pyc ADDED
Binary file (10.9 kB). View file
 
model/ldm/__pycache__/audioldm.cpython-310.pyc ADDED
Binary file (23.7 kB). View file
 
model/ldm/__pycache__/customer_attention_processor.cpython-310.pyc ADDED
Binary file (9.44 kB). View file
 
model/ldm/__pycache__/dpm_solver_pytorch.cpython-310.pyc ADDED
Binary file (54.8 kB). View file
 
model/ldm/__pycache__/editing_unet.cpython-310.pyc ADDED
Binary file (1.59 kB). View file
 
model/ldm/__pycache__/linear_attention_block.cpython-310.pyc ADDED
Binary file (3.53 kB). View file
 
model/ldm/__pycache__/transformer.cpython-310.pyc ADDED
Binary file (4.21 kB). View file
 
model/ldm/attention.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from inspect import isfunction
7
+ import math
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from torch import nn, einsum
11
+ from einops import rearrange, repeat
12
+ from diffusers.models.attention import Attention as DiffusersAttention
13
+ from diffusers.models.attention_processor import AttnProcessor2_0
14
+ from .customer_attention_processor import CustomLiteLACrossAttnProcessor2_0, CustomLiteLAProcessor2_0
15
class CheckpointFunction(torch.autograd.Function):
    """Gradient checkpointing: run the wrapped function without storing
    activations in the forward pass, then recompute them during backward
    (trades extra compute for reduced memory)."""

    @staticmethod
    def forward(ctx, run_function, length, *args):
        # The first `length` args are the tensors fed to run_function; the
        # rest are parameters it closes over, kept so autograd can reach them.
        ctx.run_function = run_function
        ctx.input_tensors = list(args[:length])
        ctx.input_params = list(args[length:])

        with torch.no_grad():
            output_tensors = ctx.run_function(*ctx.input_tensors)
        return output_tensors

    @staticmethod
    def backward(ctx, *output_grads):
        # Re-run the function with grad enabled to rebuild the graph.
        ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
        with torch.enable_grad():
            # Fixes a bug where the first op in run_function modifies the
            # Tensor storage in place, which is not allowed for detach()'d
            # Tensors.
            shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
            output_tensors = ctx.run_function(*shallow_copies)
        input_grads = torch.autograd.grad(
            output_tensors,
            ctx.input_tensors + ctx.input_params,
            output_grads,
            allow_unused=True,
        )
        del ctx.input_tensors
        del ctx.input_params
        del output_tensors
        # (None, None) covers the non-tensor run_function and length args.
        return (None, None) + input_grads
45
+
46
+
47
def checkpoint(func, inputs, params, flag):
    """
    Evaluate a function without caching intermediate activations, allowing for
    reduced memory at the expense of extra compute in the backward pass.

    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
        explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing.
    """
    if not flag:
        return func(*inputs)
    all_args = tuple(inputs) + tuple(params)
    return CheckpointFunction.apply(func, len(inputs), *all_args)
62
+
63
+
64
def exists(val):
    """Return True when *val* is not None."""
    if val is None:
        return False
    return True
66
+
67
+
68
def uniq(arr):
    """Return a keys view of the unique elements of *arr* in first-seen order."""
    return dict.fromkeys(arr).keys()
70
+
71
+
72
def default(val, d):
    """Return *val* when it is not None, else the fallback *d*
    (calling it first when it is a plain function / lambda factory)."""
    if val is not None:
        return val
    return d() if isfunction(d) else d
76
+
77
+
78
def max_neg_value(t):
    """Most negative finite value representable in *t*'s dtype."""
    finfo = torch.finfo(t.dtype)
    return -finfo.max
80
+
81
+
82
def init_(tensor):
    """Uniformly fill *tensor* in place with +/- 1/sqrt(last_dim); return it."""
    bound = 1 / math.sqrt(tensor.shape[-1])
    tensor.uniform_(-bound, bound)
    return tensor
87
+
88
+
89
+ # feedforward
90
# feedforward
class GEGLU(nn.Module):
    """Gated GELU projection: maps to 2*dim_out and gates one half by the
    GELU of the other half."""

    def __init__(self, dim_in, dim_out):
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out * 2)

    def forward(self, x):
        projected = self.proj(x)
        value, gate = projected.chunk(2, dim=-1)
        return value * F.gelu(gate)
98
+
99
+
100
class FeedForward(nn.Module):
    """Transformer MLP: Linear+GELU (or GEGLU when ``glu``), dropout, then a
    linear projection back to ``dim_out`` (defaults to ``dim``)."""

    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
        super().__init__()
        inner_dim = int(dim * mult)
        # `default(dim_out, dim)` inlined
        if dim_out is None:
            dim_out = dim
        if glu:
            project_in = GEGLU(dim, inner_dim)
        else:
            project_in = nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU())

        self.net = nn.Sequential(
            project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
        )

    def forward(self, x):
        return self.net(x)
117
+
118
+
119
def zero_module(module):
    """
    Zero out the parameters of a module and return it.
    """
    with torch.no_grad():
        for p in module.parameters():
            p.zero_()
    return module
126
+
127
+
128
def Normalize(in_channels):
    """32-group GroupNorm with eps=1e-6 and learnable affine parameters."""
    return torch.nn.GroupNorm(
        num_channels=in_channels, num_groups=32, affine=True, eps=1e-6
    )
132
+
133
+
134
class LinearAttention(nn.Module):
    """Linear (kernelised) attention over 2-D feature maps.

    Keys are softmax-normalised over spatial positions and first aggregated
    with the values into a (d x d) context matrix, which is then applied to
    the queries — O(N·d²) instead of the O(N²·d) of full attention.
    """

    def __init__(self, dim, heads=4, dim_head=32):
        super().__init__()
        self.heads = heads
        hidden_dim = dim_head * heads
        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
        self.to_out = nn.Conv2d(hidden_dim, dim, 1)

    def forward(self, x):
        b, c, h, w = x.shape
        qkv = self.to_qkv(x)
        # split the stacked qkv projection and flatten the spatial dims
        q, k, v = rearrange(
            qkv, "b (qkv heads c) h w -> qkv b heads c (h w)", heads=self.heads, qkv=3
        )
        k = k.softmax(dim=-1)
        # context: per-head (d x d) key-value summary over all positions
        context = torch.einsum("bhdn,bhen->bhde", k, v)
        out = torch.einsum("bhde,bhdn->bhen", context, q)
        out = rearrange(
            out, "b heads c (h w) -> b (heads c) h w", heads=self.heads, h=h, w=w
        )
        return self.to_out(out)
155
+
156
+
157
class SpatialSelfAttention(nn.Module):
    """Full (quadratic) self-attention over a 2-D feature map using 1x1 convs
    for the q/k/v/out projections, with a residual connection to the input."""

    def __init__(self, in_channels):
        super().__init__()
        self.in_channels = in_channels

        self.norm = Normalize(in_channels)
        self.q = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )
        self.k = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )
        self.v = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )
        self.proj_out = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )

    def forward(self, x):
        h_ = x
        h_ = self.norm(h_)
        q = self.q(h_)
        k = self.k(h_)
        v = self.v(h_)

        # compute attention: (b, hw, c) x (b, c, hw) -> (b, hw, hw) logits
        b, c, h, w = q.shape
        q = rearrange(q, "b c h w -> b (h w) c")
        k = rearrange(k, "b c h w -> b c (h w)")
        w_ = torch.einsum("bij,bjk->bik", q, k)

        # 1/sqrt(d) scaling before the softmax over key positions
        w_ = w_ * (int(c) ** (-0.5))
        w_ = torch.nn.functional.softmax(w_, dim=2)

        # attend to values
        v = rearrange(v, "b c h w -> b c (h w)")
        w_ = rearrange(w_, "b i j -> b j i")
        h_ = torch.einsum("bij,bjk->bik", v, w_)
        h_ = rearrange(h_, "b c (h w) -> b c h w", h=h)
        h_ = self.proj_out(h_)

        # residual connection
        return x + h_
200
+
201
+
202
class CrossAttention(nn.Module):
    """Standard multi-head (cross-)attention.

    Attends to ``context`` when given; otherwise degenerates to
    self-attention over ``x``. An optional boolean ``mask`` (True = keep)
    blanks out padded context positions before the softmax.
    """

    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0):
        super().__init__()
        inner_dim = dim_head * heads
        context_dim = default(context_dim, query_dim)

        self.scale = dim_head**-0.5
        self.heads = heads

        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
        )

    def forward(self, x, context=None, mask=None):
        h = self.heads

        q = self.to_q(x)
        # self-attention when no context is provided
        context = default(context, x)
        k = self.to_k(context)
        v = self.to_v(context)

        # fold the head dimension into the batch dimension
        q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=h), (q, k, v))

        sim = einsum("b i d, b j d -> b i j", q, k) * self.scale

        if exists(mask):
            # set masked-out positions to the most negative finite value so
            # they vanish after the softmax
            mask = rearrange(mask, "b ... -> b (...)")
            max_neg_value = -torch.finfo(sim.dtype).max
            mask = repeat(mask, "b j -> (b h) () j", h=h)
            sim.masked_fill_(~mask, max_neg_value)

        # attention, what we cannot get enough of
        attn = sim.softmax(dim=-1)

        out = einsum("b i j, b j d -> b i d", attn, v)
        out = rearrange(out, "(b h) n d -> b n (h d)", h=h)
        return self.to_out(out)
243
+
244
+
245
class BasicTransformerBlock(nn.Module):
    """Transformer block: linear self-attention, standard cross-attention to
    the conditioning sequence, then a (optionally gated) feed-forward — each
    with pre-LayerNorm and a residual connection."""

    def __init__(
        self,
        dim,
        n_heads,
        d_head,
        dropout=0.0,
        context_dim=None,
        gated_ff=True,
        checkpoint=True,
    ):
        super().__init__()

        # UNet BasicTransformerBlock with Linear Attention for both Self and Cross attention

        # 1. Self-Attention with Linear Attention for efficiency
        self.attn1 = DiffusersAttention(
            query_dim=dim,
            heads=n_heads,
            dim_head=d_head,
            dropout=dropout,
            processor=CustomLiteLAProcessor2_0()  # Linear attention for self-attention
        )

        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)

        # 2. Cross-Attention with Standard Attention for optimal text conditioning
        # Using AttnProcessor2_0 for better text-audio alignment and conditioning quality
        self.attn2 = DiffusersAttention(
            query_dim=dim,
            cross_attention_dim=context_dim,
            heads=n_heads,
            dim_head=d_head,
            dropout=dropout,
            processor=AttnProcessor2_0()  # Standard attention for best cross-attention performance
        )

        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.norm3 = nn.LayerNorm(dim)
        self.checkpoint = checkpoint

    def forward(self, x, context=None):
        # Amphion's custom checkpoint function may not be fully compatible
        # here; use PyTorch's checkpoint if needed, but for simplicity
        # gradient checkpointing is skipped for now.
        # return checkpoint(self._forward, (x, context), self.parameters(), self.checkpoint)
        return self._forward(x, context)

    def _forward(self, x, context=None):
        # 1. Self-Attention
        # NOTE(review): this unpack assumes CustomLiteLAProcessor2_0 returns a
        # (tensor, extra) tuple, while attn2 below is used as returning a plain
        # tensor — confirm against the processor implementation.
        out1, _ = self.attn1(self.norm1(x))
        x = out1 + x

        # 2. Cross-Attention
        # out2, _ = self.attn2(self.norm2(x), encoder_hidden_states=context)
        x = self.attn2(self.norm2(x), encoder_hidden_states=context) + x

        # 3. Feed-forward
        x = self.ff(self.norm3(x)) + x
        return x
306
+
307
+
308
+
309
+
310
class SpatialTransformer(nn.Module):
    """
    Transformer block for image-like data.
    First, project the input (aka embedding)
    and reshape to b, t, d.
    Then apply standard transformer action.
    Finally, reshape to image
    """

    def __init__(
        self, in_channels, n_heads, d_head, depth=1, dropout=0.0, context_dim=None
    ):
        super().__init__()
        self.in_channels = in_channels
        inner_dim = n_heads * d_head
        self.norm = Normalize(in_channels)

        self.proj_in = nn.Conv2d(
            in_channels, inner_dim, kernel_size=1, stride=1, padding=0
        )

        self.transformer_blocks = nn.ModuleList(
            [
                BasicTransformerBlock(
                    inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim
                )
                for d in range(depth)
            ]
        )

        # zero-initialised output projection: the whole module is an identity
        # mapping at initialisation (residual below dominates)
        self.proj_out = zero_module(
            nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
        )

    def forward(self, x, context=None):
        # note: if no context is given, cross-attention defaults to self-attention
        b, c, h, w = x.shape
        x_in = x
        x = self.norm(x)
        x = self.proj_in(x)
        x = rearrange(x, "b c h w -> b (h w) c")  # image -> sequence form
        for block in self.transformer_blocks:
            x = block(x, context=context)
        x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w)  # sequence -> image
        x = self.proj_out(x)
        return x + x_in
model/ldm/audioldm.py ADDED
@@ -0,0 +1,946 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from abc import abstractmethod
7
+ from functools import partial
8
+ import math
9
+ from typing import Iterable
10
+
11
+ import os
12
+ import torch
13
+ import torch.nn as nn
14
+ import torch.nn.functional as F
15
+ import numpy as np
16
+ from einops import repeat
17
+ from torch.utils.checkpoint import checkpoint as pt_checkpoint
18
+
19
+ from .attention import SpatialTransformer
20
+
21
+ # from attention import SpatialTransformer
22
+
23
+
24
class CheckpointFunction(torch.autograd.Function):
    """Gradient checkpointing (duplicate of the implementation in
    attention.py): skip storing activations in forward and recompute them
    during backward, trading compute for memory."""

    @staticmethod
    def forward(ctx, run_function, length, *args):
        # First `length` args are the tensors fed to run_function; the rest
        # are parameters it closes over, kept so autograd can reach them.
        ctx.run_function = run_function
        ctx.input_tensors = list(args[:length])
        ctx.input_params = list(args[length:])

        with torch.no_grad():
            output_tensors = ctx.run_function(*ctx.input_tensors)
        return output_tensors

    @staticmethod
    def backward(ctx, *output_grads):
        # Re-run the function with grad enabled to rebuild the graph.
        ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
        with torch.enable_grad():
            # Fixes a bug where the first op in run_function modifies the
            # Tensor storage in place, which is not allowed for detach()'d
            # Tensors.
            shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
            output_tensors = ctx.run_function(*shallow_copies)
        input_grads = torch.autograd.grad(
            output_tensors,
            ctx.input_tensors + ctx.input_params,
            output_grads,
            allow_unused=True,
        )
        del ctx.input_tensors
        del ctx.input_params
        del output_tensors
        # (None, None) covers the non-tensor run_function and length args.
        return (None, None) + input_grads
54
+
55
+
56
def checkpoint(func, inputs, params, flag):
    """
    Evaluate a function without caching intermediate activations, allowing for
    reduced memory at the expense of extra compute in the backward pass.

    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
        explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing.
    """
    if not flag:
        return func(*inputs)
    all_args = tuple(inputs) + tuple(params)
    return CheckpointFunction.apply(func, len(inputs), *all_args)
71
+
72
+
73
def zero_module(module):
    """
    Zero out the parameters of a module and return it.
    """
    with torch.no_grad():
        for p in module.parameters():
            p.zero_()
    return module
80
+
81
+
82
def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
    """
    Create sinusoidal timestep embeddings.

    :param timesteps: a 1-D Tensor of N indices, one per batch element.
        These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :param repeat_only: if True, simply tile the raw timestep value.
    :return: an [N x dim] Tensor of positional embeddings.
    """
    if repeat_only:
        return repeat(timesteps, "b -> b d", d=dim)

    half = dim // 2
    # geometric frequency ladder from 1 down to ~1/max_period
    freqs = torch.exp(
        -math.log(max_period)
        * torch.arange(start=0, end=half, dtype=torch.float32)
        / half
    ).to(device=timesteps.device)
    args = timesteps[:, None].float() * freqs[None]
    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
    if dim % 2:
        # odd dim: pad with a zero column so the output is exactly [N, dim]
        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
    return embedding
107
+
108
+
109
class GroupNorm32(nn.GroupNorm):
    """GroupNorm that casts its input to the parameters' dtype first.

    The parameters' dtype is authoritative: casting the input to it keeps
    mixed-precision activations from tripping a dtype mismatch inside
    F.group_norm.
    """

    def forward(self, x):
        return F.group_norm(
            x.to(self.weight.dtype),
            self.num_groups,
            self.weight,
            self.bias,
            self.eps,
        )
123
+
124
+
125
def normalization(channels):
    """
    Make a standard normalization layer.
    :param channels: number of input channels.
    :return: an nn.Module for normalization.
    """
    # 32 groups is the conventional diffusion-UNet choice; GroupNorm32 also
    # casts inputs to the parameters' dtype for mixed-precision safety.
    return GroupNorm32(32, channels)
132
+
133
+
134
def count_flops_attn(model, _x, y):
    """
    A counter for the `thop` package to count the operations in an
    attention operation.
    Meant to be used like:
        macs, params = thop.profile(
            model,
            inputs=(inputs, timestamps),
            custom_ops={QKVAttention: QKVAttention.count_flops},
        )
    """
    b, c, *spatial = y[0].shape
    num_spatial = int(np.prod(spatial))
    # Two matmuls of identical cost: one forming the attention weight
    # matrix, one combining the value vectors.
    matmul_ops = 2 * b * (num_spatial**2) * c
    model.total_ops += torch.DoubleTensor([matmul_ops])
152
+
153
+
154
def conv_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D convolution module.
    """
    conv_classes = {1: nn.Conv1d, 2: nn.Conv2d, 3: nn.Conv3d}
    if dims not in conv_classes:
        raise ValueError(f"unsupported dimensions: {dims}")
    return conv_classes[dims](*args, **kwargs)
165
+
166
+
167
def avg_pool_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D average pooling module.
    """
    pool_classes = {1: nn.AvgPool1d, 2: nn.AvgPool2d, 3: nn.AvgPool3d}
    if dims not in pool_classes:
        raise ValueError(f"unsupported dimensions: {dims}")
    return pool_classes[dims](*args, **kwargs)
178
+
179
+
180
class QKVAttention(nn.Module):
    """
    QKV attention that splits q/k/v *before* splitting heads (the "new"
    split order, as opposed to QKVAttentionLegacy).
    """

    def __init__(self, n_heads):
        super().__init__()
        self.n_heads = n_heads

    def forward(self, qkv):
        """
        Apply QKV attention.
        :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
        """
        batch, width, seq_len = qkv.shape
        assert width % (3 * self.n_heads) == 0
        head_dim = width // (3 * self.n_heads)
        q, k, v = qkv.chunk(3, dim=1)  # each [N x (H * C) x T]
        # Scale q and k symmetrically by ch^(-1/4): more stable in fp16 than
        # dividing the logits afterwards.
        scale = 1 / math.sqrt(math.sqrt(head_dim))
        logits = torch.einsum(
            "bct,bcs->bts",
            (q * scale).view(batch * self.n_heads, head_dim, seq_len),
            (k * scale).view(batch * self.n_heads, head_dim, seq_len),
        )
        # Softmax in fp32, then cast back to the working dtype.
        probs = torch.softmax(logits.float(), dim=-1).type(logits.dtype)
        combined = torch.einsum(
            "bts,bcs->bct", probs, v.reshape(batch * self.n_heads, head_dim, seq_len)
        )
        return combined.reshape(batch, -1, seq_len)

    @staticmethod
    def count_flops(model, _x, y):
        return count_flops_attn(model, _x, y)
215
+
216
+
217
class QKVAttentionLegacy(nn.Module):
    """
    QKV attention matching the legacy head-first split order
    (heads are separated before q/k/v are split apart).
    """

    def __init__(self, n_heads):
        super().__init__()
        self.n_heads = n_heads

    def forward(self, qkv):
        """
        Apply QKV attention.
        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
        """
        batch, width, seq_len = qkv.shape
        assert width % (3 * self.n_heads) == 0
        head_dim = width // (3 * self.n_heads)
        # Fold heads into the batch dimension, then split q/k/v per head.
        q, k, v = qkv.reshape(batch * self.n_heads, head_dim * 3, seq_len).split(
            head_dim, dim=1
        )
        # Symmetric ch^(-1/4) scaling: more stable in fp16 than dividing logits.
        scale = 1 / math.sqrt(math.sqrt(head_dim))
        logits = torch.einsum("bct,bcs->bts", q * scale, k * scale)
        probs = torch.softmax(logits.float(), dim=-1).type(logits.dtype)
        combined = torch.einsum("bts,bcs->bct", probs, v)
        return combined.reshape(batch, -1, seq_len)

    @staticmethod
    def count_flops(model, _x, y):
        return count_flops_attn(model, _x, y)
247
+
248
+
249
class AttentionPool2d(nn.Module):
    """
    Attention-based global pooling.
    Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
    """

    def __init__(
        self,
        spacial_dim: int,
        embed_dim: int,
        num_heads_channels: int,
        output_dim: int = None,
    ):
        super().__init__()
        # One positional vector per spatial location, plus one for the
        # prepended mean token.
        self.positional_embedding = nn.Parameter(
            torch.randn(embed_dim, spacial_dim**2 + 1) / embed_dim**0.5
        )
        self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
        self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
        self.num_heads = embed_dim // num_heads_channels
        self.attention = QKVAttention(self.num_heads)

    def forward(self, x):
        batch, channels, *_ = x.shape
        flat = x.reshape(batch, channels, -1)  # N x C x (HW)
        # Prepend the spatial mean as a CLS-like pooling token: N x C x (HW+1)
        tokens = torch.cat([flat.mean(dim=-1, keepdim=True), flat], dim=-1)
        tokens = tokens + self.positional_embedding[None, :, :].to(tokens.dtype)
        tokens = self.attention(self.qkv_proj(tokens))
        tokens = self.c_proj(tokens)
        # Only the pooled (first) token is returned.
        return tokens[:, :, 0]
279
+
280
+
281
class TimestepBlock(nn.Module):
    """
    Any module where forward() takes timestep embeddings as a second argument.
    """

    # Marker base class: TimestepEmbedSequential dispatches on isinstance
    # checks against it to decide whether to pass `emb` along.
    @abstractmethod
    def forward(self, x, emb):
        """
        Apply the module to `x` given `emb` timestep embeddings.
        """
291
+
292
+
293
class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
    """
    A sequential container that forwards the timestep embedding (and, for
    spatial transformers, the cross-attention context) to the children
    that accept them; all other children receive the activations only.
    """

    def forward(self, x, emb, context=None):
        out = x
        for child in self:
            if isinstance(child, TimestepBlock):
                out = child(out, emb)
            elif isinstance(child, SpatialTransformer):
                out = child(out, context)
            else:
                out = child(out)
        return out
308
+
309
+
310
class Upsample(nn.Module):
    """
    An upsampling layer with an optional convolution.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
        upsampling occurs in the inner-two dimensions.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        if use_conv:
            self.conv = conv_nd(
                dims, self.channels, self.out_channels, 3, padding=padding
            )

    def forward(self, x):
        assert x.shape[1] == self.channels
        if self.dims == 3:
            # 3D signals keep the depth dimension; only the inner two
            # (spatial) dimensions are doubled.
            target_size = (x.shape[2], x.shape[3] * 2, x.shape[4] * 2)
            upsampled = F.interpolate(x, target_size, mode="nearest")
        else:
            upsampled = F.interpolate(x, scale_factor=2, mode="nearest")
        return self.conv(upsampled) if self.use_conv else upsampled
341
+
342
+
343
class TransposedUpsample(nn.Module):
    """Learned 2x upsampling without padding."""

    def __init__(self, channels, out_channels=None, ks=5):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        # A stride-2 transposed convolution learns the upsampling kernel.
        self.up = nn.ConvTranspose2d(
            self.channels, self.out_channels, kernel_size=ks, stride=2
        )

    def forward(self, x):
        return self.up(x)
357
+
358
+
359
class Downsample(nn.Module):
    """
    A downsampling layer with an optional convolution.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
        downsampling occurs in the inner-two dimensions.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        # 3D signals only stride the inner-two (spatial) dimensions.
        stride = (1, 2, 2) if dims == 3 else 2
        if use_conv:
            self.op = conv_nd(
                dims,
                self.channels,
                self.out_channels,
                3,
                stride=stride,
                padding=padding,
            )
        else:
            # Average pooling cannot change the channel count.
            assert self.channels == self.out_channels
            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)

    def forward(self, x):
        assert x.shape[1] == self.channels
        return self.op(x)
391
+
392
+
393
class ResBlock(TimestepBlock):
    """
    A residual block that can optionally change the number of channels.
    :param channels: the number of input channels.
    :param emb_channels: the number of timestep embedding channels.
    :param dropout: the rate of dropout.
    :param out_channels: if specified, the number of out channels.
    :param use_conv: if True and out_channels is specified, use a spatial
        convolution instead of a smaller 1x1 convolution to change the
        channels in the skip connection.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param use_checkpoint: if True, use gradient checkpointing on this module.
    :param up: if True, use this block for upsampling.
    :param down: if True, use this block for downsampling.
    """

    def __init__(
        self,
        channels,
        emb_channels,
        dropout,
        out_channels=None,
        use_conv=False,
        use_scale_shift_norm=False,
        dims=2,
        use_checkpoint=False,
        up=False,
        down=False,
    ):
        super().__init__()
        self.channels = channels
        self.emb_channels = emb_channels
        self.dropout = dropout
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.use_checkpoint = use_checkpoint
        self.use_scale_shift_norm = use_scale_shift_norm

        # norm -> SiLU -> conv; split apart in _forward when up/down so the
        # resampling can run between the activation and the convolution.
        self.in_layers = nn.Sequential(
            normalization(channels),
            nn.SiLU(),
            conv_nd(dims, channels, self.out_channels, 3, padding=1),
        )

        self.updown = up or down

        if up:
            self.h_upd = Upsample(channels, False, dims)
            self.x_upd = Upsample(channels, False, dims)
        elif down:
            self.h_upd = Downsample(channels, False, dims)
            self.x_upd = Downsample(channels, False, dims)
        else:
            self.h_upd = self.x_upd = nn.Identity()

        # Projects the timestep embedding to the block's width; twice the
        # width when it supplies a (scale, shift) pair for FiLM conditioning.
        self.emb_layers = nn.Sequential(
            nn.SiLU(),
            nn.Linear(
                emb_channels,
                2 * self.out_channels if use_scale_shift_norm else self.out_channels,
            ),
        )
        # Final conv is zero-initialized so the block starts as a no-op
        # residual (output == skip connection at init).
        self.out_layers = nn.Sequential(
            normalization(self.out_channels),
            nn.SiLU(),
            nn.Dropout(p=dropout),
            zero_module(
                conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)
            ),
        )

        if self.out_channels == channels:
            self.skip_connection = nn.Identity()
        elif use_conv:
            self.skip_connection = conv_nd(
                dims, channels, self.out_channels, 3, padding=1
            )
        else:
            self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)

    def forward(self, x, emb):
        """
        Apply the block to a Tensor, conditioned on a timestep embedding.
        :param x: an [N x C x ...] Tensor of features.
        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
        :return: an [N x C x ...] Tensor of outputs.
        """
        if self.use_checkpoint:
            # Use PyTorch's native checkpointing to trade compute for memory.
            return pt_checkpoint(self._forward, x, emb, use_reentrant=False)
        else:
            return self._forward(x, emb)


    def _forward(self, x, emb):
        if self.updown:
            # Resample between the activation and the conv so both the
            # residual branch and the skip path see the new resolution.
            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
            h = in_rest(x)
            h = self.h_upd(h)
            x = self.x_upd(x)
            h = in_conv(h)
        else:
            h = self.in_layers(x)
        emb_out = self.emb_layers(emb).type(h.dtype)
        # Broadcast the embedding over all spatial dimensions.
        while len(emb_out.shape) < len(h.shape):
            emb_out = emb_out[..., None]
        if self.use_scale_shift_norm:
            # FiLM-style conditioning: h = norm(h) * (1 + scale) + shift.
            out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
            scale, shift = torch.chunk(emb_out, 2, dim=1)
            h = out_norm(h) * (1 + scale) + shift
            h = out_rest(h)
        else:
            h = h + emb_out
            h = self.out_layers(h)
        return self.skip_connection(x) + h
508
+
509
+
510
class AttentionBlock(nn.Module):
    """
    An attention block that allows spatial positions to attend to each other.
    Originally ported from here, but adapted to the N-d case.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
    """

    def __init__(
        self,
        channels,
        num_heads=1,
        num_head_channels=-1,
        use_checkpoint=False,
        use_new_attention_order=False,
    ):
        super().__init__()
        self.channels = channels
        if num_head_channels == -1:
            self.num_heads = num_heads
        else:
            assert (
                channels % num_head_channels == 0
            ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
            self.num_heads = channels // num_head_channels
        self.use_checkpoint = use_checkpoint
        self.norm = normalization(channels)
        # A single 1x1 conv produces q, k, and v in one pass.
        self.qkv = conv_nd(1, channels, channels * 3, 1)
        if use_new_attention_order:
            # split qkv before split heads
            self.attention = QKVAttention(self.num_heads)
        else:
            # split heads before split qkv
            self.attention = QKVAttentionLegacy(self.num_heads)

        # Zero-initialized projection: the block starts as an identity residual.
        self.proj_out = zero_module(conv_nd(1, channels, channels, 1))

    def forward(self, x):
        if self.use_checkpoint:
            # Use PyTorch's native checkpointing to save activation memory.
            return pt_checkpoint(self._forward, x, use_reentrant=False)
        else:
            return self._forward(x)


    def _forward(self, x):
        # Flatten all spatial dims into one sequence axis for 1D attention,
        # then restore the original shape on the way out.
        b, c, *spatial = x.shape
        x = x.reshape(b, c, -1)
        qkv = self.qkv(self.norm(x))
        h = self.attention(qkv)
        h = self.proj_out(h)
        return (x + h).reshape(b, c, *spatial)
561
+
562
+
563
class UNetModel(nn.Module):
    """
    The full UNet model with attention and timestep embedding.
    :param image_size: spatial size of the input; stored for reference only
        (not read by forward here).
    :param in_channels: channels in the input Tensor.
    :param model_channels: base channel count for the model.
    :param out_channels: channels in the output Tensor.
    :param num_res_blocks: number of residual blocks per downsample.
    :param attention_resolutions: a collection of downsample rates at which
        attention will take place. May be a set, list, or tuple.
        For example, if this contains 4, then at 4x downsampling, attention
        will be used.
    :param dropout: the dropout probability.
    :param channel_mult: channel multiplier for each level of the UNet.
    :param conv_resample: if True, use learned convolutions for upsampling and
        downsampling.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param num_classes: if specified (as an int), then this model will be
        class-conditional with `num_classes` classes.
    :param use_checkpoint: use gradient checkpointing to reduce memory usage.
    :param num_heads: the number of attention heads in each attention layer.
    :param num_heads_channels: if specified, ignore num_heads and instead use
        a fixed channel width per attention head.
    :param num_heads_upsample: works with num_heads to set a different number
        of heads for upsampling. Deprecated.
    :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
    :param resblock_updown: use residual blocks for up/downsampling.
    :param use_new_attention_order: use a different attention pattern for potentially
        increased efficiency.
    :param use_spatial_transformer: if True, use SpatialTransformer
        (cross-attention) blocks instead of plain AttentionBlocks.
    :param context_dim: dimensionality of the cross-attention conditioning;
        required when use_spatial_transformer is True.
    :param n_embed: if set, predict discrete codebook ids instead of the
        usual continuous output (first-stage VQ support).
    :param legacy: if True, recompute dim_head with the legacy rule below.
    """

    def __init__(
        self,
        image_size,
        in_channels,
        model_channels,
        out_channels,
        num_res_blocks,
        attention_resolutions,
        dropout=0,
        channel_mult=(1, 2, 4, 8),
        conv_resample=True,
        dims=2,
        num_classes=None,
        use_checkpoint=False,
        use_fp16=True,
        num_heads=-1,
        num_head_channels=-1,
        num_heads_upsample=-1,
        use_scale_shift_norm=False,
        resblock_updown=False,
        use_new_attention_order=False,
        use_spatial_transformer=False,  # custom transformer support
        transformer_depth=1,  # custom transformer support
        context_dim=None,  # custom transformer support
        n_embed=None,  # custom support for prediction of discrete ids into codebook of first stage vq model
        legacy=True,
    ):
        super().__init__()
        if use_spatial_transformer:
            assert (
                context_dim is not None
            ), "Fool!! You forgot to include the dimension of your cross-attention conditioning..."

        if context_dim is not None:
            assert (
                use_spatial_transformer
            ), "Fool!! You forgot to use the spatial transformer for your cross-attention conditioning..."
            from omegaconf.listconfig import ListConfig

            if type(context_dim) == ListConfig:
                context_dim = list(context_dim)

        if num_heads_upsample == -1:
            num_heads_upsample = num_heads

        if num_heads == -1:
            assert (
                num_head_channels != -1
            ), "Either num_heads or num_head_channels has to be set"

        if num_head_channels == -1:
            assert (
                num_heads != -1
            ), "Either num_heads or num_head_channels has to be set"

        self.image_size = image_size
        self.in_channels = in_channels
        self.model_channels = model_channels
        self.out_channels = out_channels
        self.num_res_blocks = num_res_blocks
        self.attention_resolutions = attention_resolutions
        self.dropout = dropout
        self.channel_mult = channel_mult
        self.conv_resample = conv_resample
        self.num_classes = num_classes
        self.use_checkpoint = use_checkpoint
        # NOTE(review): use_fp16 is currently unused — the dtype cast below
        # was deliberately disabled; dtype handling now happens per-layer.
        #self.dtype = torch.float16 if use_fp16 else torch.float32
        self.num_heads = num_heads
        self.num_head_channels = num_head_channels
        self.num_heads_upsample = num_heads_upsample
        self.predict_codebook_ids = n_embed is not None

        # Timestep MLP: model_channels -> 4 * model_channels.
        time_embed_dim = model_channels * 4
        self.time_embed = nn.Sequential(
            nn.Linear(model_channels, time_embed_dim),
            nn.SiLU(),
            nn.Linear(time_embed_dim, time_embed_dim),
        )

        if self.num_classes is not None:
            self.label_emb = nn.Embedding(num_classes, time_embed_dim)

        # ----- Encoder (downsampling) path -----
        self.input_blocks = nn.ModuleList(
            [
                TimestepEmbedSequential(
                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
                )
            ]
        )
        self._feature_size = model_channels
        # input_block_chans records the channel count after every input block
        # so the decoder can pop matching skip-connection widths.
        input_block_chans = [model_channels]
        ch = model_channels
        ds = 1  # current downsampling rate
        for level, mult in enumerate(channel_mult):
            for _ in range(num_res_blocks):
                layers = [
                    ResBlock(
                        ch,
                        time_embed_dim,
                        dropout,
                        out_channels=mult * model_channels,
                        dims=dims,
                        use_checkpoint=use_checkpoint,
                        use_scale_shift_norm=use_scale_shift_norm,
                    )
                ]
                ch = mult * model_channels
                if ds in attention_resolutions:
                    if num_head_channels == -1:
                        dim_head = ch // num_heads
                    else:
                        num_heads = ch // num_head_channels
                        dim_head = num_head_channels
                    if legacy:
                        # num_heads = 1
                        dim_head = (
                            ch // num_heads
                            if use_spatial_transformer
                            else num_head_channels
                        )
                    layers.append(
                        AttentionBlock(
                            ch,
                            use_checkpoint=use_checkpoint,
                            num_heads=num_heads,
                            num_head_channels=dim_head,
                            use_new_attention_order=use_new_attention_order,
                        )
                        if not use_spatial_transformer
                        else SpatialTransformer(
                            ch,
                            num_heads,
                            dim_head,
                            depth=transformer_depth,
                            context_dim=context_dim,
                        )
                    )
                self.input_blocks.append(TimestepEmbedSequential(*layers))
                self._feature_size += ch
                input_block_chans.append(ch)
            # Downsample between levels (but not after the last one).
            if level != len(channel_mult) - 1:
                out_ch = ch
                self.input_blocks.append(
                    TimestepEmbedSequential(
                        ResBlock(
                            ch,
                            time_embed_dim,
                            dropout,
                            out_channels=out_ch,
                            dims=dims,
                            use_checkpoint=use_checkpoint,
                            use_scale_shift_norm=use_scale_shift_norm,
                            down=True,
                        )
                        if resblock_updown
                        else Downsample(
                            ch, conv_resample, dims=dims, out_channels=out_ch
                        )
                    )
                )
                ch = out_ch
                input_block_chans.append(ch)
                ds *= 2
                self._feature_size += ch

        # ----- Bottleneck -----
        if num_head_channels == -1:
            dim_head = ch // num_heads
        else:
            num_heads = ch // num_head_channels
            dim_head = num_head_channels
        if legacy:
            # num_heads = 1
            dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
        self.middle_block = TimestepEmbedSequential(
            ResBlock(
                ch,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
            ),
            (
                AttentionBlock(
                    ch,
                    use_checkpoint=use_checkpoint,
                    num_heads=num_heads,
                    num_head_channels=dim_head,
                    use_new_attention_order=use_new_attention_order,
                )
                if not use_spatial_transformer
                else SpatialTransformer(
                    ch,
                    num_heads,
                    dim_head,
                    depth=transformer_depth,
                    context_dim=context_dim,
                )
            ),
            ResBlock(
                ch,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
            ),
        )
        self._feature_size += ch

        # ----- Decoder (upsampling) path; mirrors the encoder in reverse -----
        self.output_blocks = nn.ModuleList([])
        for level, mult in list(enumerate(channel_mult))[::-1]:
            for i in range(num_res_blocks + 1):
                # Pop the matching skip-connection width recorded above.
                ich = input_block_chans.pop()
                layers = [
                    ResBlock(
                        ch + ich,
                        time_embed_dim,
                        dropout,
                        out_channels=model_channels * mult,
                        dims=dims,
                        use_checkpoint=use_checkpoint,
                        use_scale_shift_norm=use_scale_shift_norm,
                    )
                ]
                ch = model_channels * mult
                if ds in attention_resolutions:
                    if num_head_channels == -1:
                        dim_head = ch // num_heads
                    else:
                        num_heads = ch // num_head_channels
                        dim_head = num_head_channels
                    if legacy:
                        # num_heads = 1
                        dim_head = (
                            ch // num_heads
                            if use_spatial_transformer
                            else num_head_channels
                        )
                    layers.append(
                        AttentionBlock(
                            ch,
                            use_checkpoint=use_checkpoint,
                            num_heads=num_heads_upsample,
                            num_head_channels=dim_head,
                            use_new_attention_order=use_new_attention_order,
                        )
                        if not use_spatial_transformer
                        else SpatialTransformer(
                            ch,
                            num_heads,
                            dim_head,
                            depth=transformer_depth,
                            context_dim=context_dim,
                        )
                    )
                # Upsample on the last block of each level (except the top).
                if level and i == num_res_blocks:
                    out_ch = ch
                    layers.append(
                        ResBlock(
                            ch,
                            time_embed_dim,
                            dropout,
                            out_channels=out_ch,
                            dims=dims,
                            use_checkpoint=use_checkpoint,
                            use_scale_shift_norm=use_scale_shift_norm,
                            up=True,
                        )
                        if resblock_updown
                        else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
                    )
                    ds //= 2
                self.output_blocks.append(TimestepEmbedSequential(*layers))
                self._feature_size += ch

        self.out = nn.Sequential(
            normalization(ch),
            nn.SiLU(),
            zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
        )
        if self.predict_codebook_ids:
            self.id_predictor = nn.Sequential(
                normalization(ch),
                conv_nd(dims, model_channels, n_embed, 1),
                # nn.LogSoftmax(dim=1) # change to cross_entropy and produce non-normalized logits
            )

    def forward(self, x, timesteps=None, context=None, y=None, **kwargs):
        """
        Apply the model to an input batch.
        :param x: an [N x C x ...] Tensor of inputs.
        :param timesteps: a 1-D batch of timesteps.
        :param context: conditioning plugged in via crossattn
        :param y: an [N] Tensor of labels, if class-conditional.
        :return: an [N x C x ...] Tensor of outputs.
        """
        assert (y is not None) == (
            self.num_classes is not None
        ), "must specify y if and only if the model is class-conditional"
        hs = []
        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
        # Ensure t_emb matches the dtype of time_embed layer weights
        emb = self.time_embed(t_emb.to(self.time_embed[0].weight.dtype))

        if self.num_classes is not None:
            assert y.shape == (x.shape[0],)
            emb = emb + self.label_emb(y)

        h = x#.type(self.dtype)
        # Encoder: collect activations for the decoder's skip connections.
        for module in self.input_blocks:
            h = module(h, emb, context)
            hs.append(h)
        h = self.middle_block(h, emb, context)
        # Decoder: concatenate each skip tensor before its output block.
        for module in self.output_blocks:
            # print(h.shape, hs[-1].shape)
            # Crop h down to the skip tensor's spatial size: odd input sizes
            # can make the upsampled h one element larger than the stored skip.
            if h.shape != hs[-1].shape:
                if h.shape[-1] > hs[-1].shape[-1]:
                    h = h[:, :, :, : hs[-1].shape[-1]]
                if h.shape[-2] > hs[-1].shape[-2]:
                    h = h[:, :, : hs[-1].shape[-2], :]
            h = torch.cat([h, hs.pop()], dim=1)
            h = module(h, emb, context)
        # print(h.shape)
        #h = h.type(x.dtype)
        if self.predict_codebook_ids:
            return self.id_predictor(h)
        else:
            return self.out(h)
922
+
923
+
924
class AudioLDM(nn.Module):
    """Thin wrapper that builds the diffusion UNet from a config object.

    `cfg` is expected to expose the attributes referenced below
    (image_size, in_channels, ..., legacy); see UNetModel for their meaning.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        unet_config = dict(
            image_size=cfg.image_size,
            in_channels=cfg.in_channels,
            out_channels=cfg.out_channels,
            model_channels=cfg.model_channels,
            attention_resolutions=cfg.attention_resolutions,
            num_res_blocks=cfg.num_res_blocks,
            channel_mult=cfg.channel_mult,
            num_heads=cfg.num_heads,
            use_spatial_transformer=cfg.use_spatial_transformer,
            transformer_depth=cfg.transformer_depth,
            context_dim=cfg.context_dim,
            use_checkpoint=cfg.use_checkpoint,
            legacy=cfg.legacy,
        )
        self.unet = UNetModel(**unet_config)

    def forward(self, x, timesteps=None, context=None, y=None):
        # Delegate directly to the UNet; see UNetModel.forward for semantics.
        return self.unet(x=x, timesteps=timesteps, context=context, y=y)
model/ldm/customer_attention_processor.py ADDED
@@ -0,0 +1,507 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Optional, Union, Tuple
15
+
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from torch import nn
19
+
20
+ from diffusers.utils import logging
21
+ from diffusers.models.attention_processor import Attention
22
+
23
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
24
+
25
+ # ADD THIS NEW CLASS to the end of customer_attention_processor.py
26
+
27
class CustomLiteLACrossAttnProcessor2_0:
    """
    Attention processor for LINEAR CROSS-ATTENTION.

    Queries are projected from `hidden_states`; keys and values are projected
    from `encoder_hidden_states` (falling back to self-attention when no
    encoder states are given). Uses a ReLU-kernel linear-attention
    formulation instead of softmax attention.
    """

    def __init__(self):
        # ReLU kernel feature map used by the linear-attention formulation.
        self.kernel_func = nn.ReLU(inplace=False)
        # Numerical floor for the normalizer denominator.
        self.eps = 1e-15
        # Constant padded into V's extra row to compute the normalizer.
        self.pad_val = 1.0

    def apply_rotary_emb(self, x, freqs_cis):
        """Apply rotary position embeddings to a [B, H, S, D] tensor.

        :param x: query or key tensor, laid out [B, H, S, D].
        :param freqs_cis: (cos, sin) tensors, each [S, D].
        :return: tensor of the same shape and dtype with RoPE applied.
        """
        cos, sin = freqs_cis
        cos, sin = cos[None, None].to(x.device), sin[None, None].to(x.device)
        x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
        x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
        return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        rotary_freqs_cis: Optional[Union[torch.Tensor, Tuple[torch.Tensor]]] = None,
        # Extra args accepted for interface compatibility with other processors
        **kwargs,
    ) -> torch.FloatTensor:

        input_ndim = hidden_states.ndim
        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size = hidden_states.shape[0]
        # Remember the activation dtype: the linear-attention math below is
        # promoted to fp32 for stability, and we must cast back before the
        # output projection (whose weights are in the original dtype).
        dtype = hidden_states.dtype

        # Q is from the query stream; K and V come from the conditioning.
        query = attn.to_q(hidden_states)

        # Use encoder_hidden_states for K and V
        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states  # Fallback to self-attention

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        # [B, S, H*D] -> [B, H, D, S]; K additionally transposed to [B, H, S, D].
        query = query.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1)
        key = key.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1).transpose(-1, -2)
        value = value.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1)

        # RoPE operates on [B, H, S, D]; round-trip the query layout.
        query = query.permute(0, 1, 3, 2)

        # Apply RoPE if needed
        if rotary_freqs_cis is not None:
            query = self.apply_rotary_emb(query, rotary_freqs_cis)
            # Cross-attention may carry separate frequencies for the key
            # stream; fall back to the query frequencies if none were given.
            key_freqs = kwargs.get("rotary_freqs_cis_cross", rotary_freqs_cis)
            key = self.apply_rotary_emb(key, key_freqs)

        # Reshape query back
        query = query.permute(0, 1, 3, 2)

        # Linear attention: out = (V_pad @ K) @ Q with a ReLU kernel, where V
        # is padded with a constant row so one matmul also yields the
        # normalizer (recovered from the last row below).
        query = self.kernel_func(query)
        key = self.kernel_func(key)

        query, key, value = query.float(), key.float(), value.float()
        value = F.pad(value, (0, 0, 0, 1), mode="constant", value=self.pad_val)
        vk = torch.matmul(value, key)
        hidden_states = torch.matmul(vk, query)

        hidden_states = hidden_states[:, :, :-1] / (hidden_states[:, :, -1:] + self.eps)
        hidden_states = hidden_states.view(batch_size, attn.heads * head_dim, -1).permute(0, 2, 1)

        # BUG FIX: the original cast to `query.dtype`, but `query` had already
        # been promoted to float32 above, so the cast was a no-op and
        # half-precision models crashed at the output projection. Restore the
        # dtype captured before promotion (matching CustomLiteLAProcessor2_0).
        hidden_states = hidden_states.to(dtype)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        return hidden_states
119
class CustomLiteLAProcessor2_0:
    """Linear-attention ("LiteLA") processor for SD3-like joint self-attention.

    Adds optional RMS norm for query/key and applies RoPE, then computes
    linear attention with a ReLU kernel: instead of softmax, value is padded
    with an extra "ones" row so a single matmul accumulates both numerator and
    normalizer, which are divided at the end.
    """

    def __init__(self):
        # ReLU feature map used as the linear-attention kernel function.
        self.kernel_func = nn.ReLU(inplace=False)
        # Numerical floor added to the normalizer to avoid division by zero.
        self.eps = 1e-15
        # Constant appended as an extra row of `value` (the "ones" trick):
        # after value @ key, the last row holds the per-position normalizer.
        self.pad_val = 1.0

    def apply_rotary_emb(
        self,
        x: torch.Tensor,
        freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
        to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
        reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
        tensors contain rotary embeddings and are returned as real tensors.

        Args:
            x (`torch.Tensor`):
                Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
            freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
        """
        cos, sin = freqs_cis  # [S, D]
        # Add broadcast dims for batch and heads: [S, D] -> [1, 1, S, D].
        cos = cos[None, None]
        sin = sin[None, None]
        cos, sin = cos.to(x.device), sin.to(x.device)

        # Treat consecutive pairs of the last dim as (real, imag) components.
        x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
        x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
        # Rotation is done in float32 for stability, then cast back.
        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)

        return out

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
        rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
        *args,
        **kwargs,
    ) -> torch.FloatTensor:
        # Remember the sample-stream length so the joint sequence can be split
        # back into (sample, context) after attention.
        hidden_states_len = hidden_states.shape[1]

        # Flatten spatial 4-D inputs [B, C, H, W] -> [B, H*W, C].
        input_ndim = hidden_states.ndim
        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(
                batch_size, channel, height * width
            ).transpose(1, 2)
        if encoder_hidden_states is not None:
            context_input_ndim = encoder_hidden_states.ndim
            if context_input_ndim == 4:
                batch_size, channel, height, width = encoder_hidden_states.shape
                encoder_hidden_states = encoder_hidden_states.view(
                    batch_size, channel, height * width
                ).transpose(1, 2)

        batch_size = hidden_states.shape[0]

        # `sample` projections.
        dtype = hidden_states.dtype
        query = attn.to_q(hidden_states)
        key = attn.to_k(hidden_states)
        value = attn.to_v(hidden_states)

        # `context` projections (only present on joint-attention modules).
        has_encoder_hidden_state_proj = (
            hasattr(attn, "add_q_proj")
            and hasattr(attn, "add_k_proj")
            and hasattr(attn, "add_v_proj")
        )
        if encoder_hidden_states is not None and has_encoder_hidden_state_proj:
            encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
            encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
            encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)

            # attention
            if not attn.is_cross_attention:
                # Joint self-attention: concatenate sample and context streams
                # along the sequence dimension.
                query = torch.cat([query, encoder_hidden_states_query_proj], dim=1)
                key = torch.cat([key, encoder_hidden_states_key_proj], dim=1)
                value = torch.cat([value, encoder_hidden_states_value_proj], dim=1)
            else:
                # NOTE(review): this branch discards ALL projections computed
                # above (to_q/to_k/to_v and add_*_proj) and uses the raw
                # hidden states directly — confirm this is intentional.
                query = hidden_states
                key = encoder_hidden_states
                value = encoder_hidden_states

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        # Reshape to multi-head layout. query/value end up as [B, H, D, S];
        # key gets an extra transpose so it is [B, H, S, D].
        query = query.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1)
        key = (
            key.transpose(-1, -2)
            .reshape(batch_size, attn.heads, head_dim, -1)
            .transpose(-1, -2)
        )
        value = value.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1)

        # RoPE expects [B, H, S, D] input.
        # query is currently [B, H, D, S]; convert to [B, H, S, D] first.
        query = query.permute(0, 1, 3, 2)  # [B, H, S, D] (from [B, H, D, S])

        # Apply query and key normalization if needed
        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        # Apply RoPE if needed
        if rotary_freqs_cis is not None:
            query = self.apply_rotary_emb(query, rotary_freqs_cis)
            if not attn.is_cross_attention:
                key = self.apply_rotary_emb(key, rotary_freqs_cis)
            elif rotary_freqs_cis_cross is not None and has_encoder_hidden_state_proj:
                # Cross-attention keys come from the context stream, which has
                # its own rotary frequencies.
                key = self.apply_rotary_emb(key, rotary_freqs_cis_cross)

        # query is now [B, H, S, D]; restore the [B, H, D, S] layout used by
        # the linear-attention matmuls below.
        query = query.permute(0, 1, 3, 2)  # [B, H, D, S]

        if attention_mask is not None:
            # attention_mask: [B, S] -> [B, 1, S, 1]
            attention_mask = attention_mask[:, None, :, None].to(
                key.dtype
            )  # [B, 1, S, 1]
            # Zero out padded positions in query (mask permuted to match the
            # [B, H, D, S] layout).
            query = query * attention_mask.permute(
                0, 1, 3, 2
            )  # [B, H, S, D] * [B, 1, S, 1]
            if not attn.is_cross_attention:
                key = (
                    key * attention_mask
                )  # key: [B, h, S, D] multiplied by mask [B, 1, S, 1]
                value = value * attention_mask.permute(
                    0, 1, 3, 2
                )  # value is [B, h, D, S], so the mask is permuted to match S

        if (
            attn.is_cross_attention
            and encoder_attention_mask is not None
            and has_encoder_hidden_state_proj
        ):
            encoder_attention_mask = encoder_attention_mask[:, None, :, None].to(
                key.dtype
            )  # [B, 1, S_enc, 1]
            # Here key: [B, h, S_enc, D], value: [B, h, D, S_enc]
            key = key * encoder_attention_mask  # [B, h, S_enc, D] * [B, 1, S_enc, 1]
            value = value * encoder_attention_mask.permute(
                0, 1, 3, 2
            )  # [B, h, D, S_enc] * [B, 1, 1, S_enc]

        # ReLU feature map replaces the softmax of standard attention.
        query = self.kernel_func(query)
        key = self.kernel_func(key)

        # Linear attention is accumulated in float32 for numerical stability.
        query, key, value = query.float(), key.float(), value.float()

        # Append a row of pad_val so value@key also accumulates the normalizer.
        value = F.pad(value, (0, 0, 0, 1), mode="constant", value=self.pad_val)

        vk = torch.matmul(value, key)

        hidden_states = torch.matmul(vk, query)

        if hidden_states.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.float()

        # Divide the numerator (all rows but the last) by the accumulated
        # normalizer (last row).
        hidden_states = hidden_states[:, :, :-1] / (hidden_states[:, :, -1:] + self.eps)

        # Merge heads: [B, H*D, S] -> [B, S, H*D].
        hidden_states = hidden_states.view(
            batch_size, attn.heads * head_dim, -1
        ).permute(0, 2, 1)

        hidden_states = hidden_states.to(dtype)
        if encoder_hidden_states is not None:
            encoder_hidden_states = encoder_hidden_states.to(dtype)

        # Split the attention outputs.
        if (
            encoder_hidden_states is not None
            and not attn.is_cross_attention
            and has_encoder_hidden_state_proj
        ):
            hidden_states, encoder_hidden_states = (
                hidden_states[:, :hidden_states_len],
                hidden_states[:, hidden_states_len:],
            )

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)
        # NOTE(review): the context stream is intentionally NOT passed through
        # attn.to_add_out here (the call was removed in this variant):
        # if (
        #     encoder_hidden_states is not None
        #     and not attn.context_pre_only
        #     and not attn.is_cross_attention
        #     and hasattr(attn, "to_add_out")
        # ):
        #     encoder_hidden_states = attn.to_add_out(encoder_hidden_states)

        # Restore spatial 4-D shape if the input was 4-D.
        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(
                batch_size, channel, height, width
            )
        if encoder_hidden_states is not None and context_input_ndim == 4:
            encoder_hidden_states = encoder_hidden_states.transpose(-1, -2).reshape(
                batch_size, channel, height, width
            )

        # Clamp to the fp16 representable range when running under fp16
        # autocast to avoid inf/NaN overflow.
        if torch.get_autocast_gpu_dtype() == torch.float16:
            hidden_states = hidden_states.clip(-65504, 65504)
            if encoder_hidden_states is not None:
                encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)

        return hidden_states, encoder_hidden_states
338
+
339
+
340
class CustomerAttnProcessor2_0:
    r"""
    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).

    Adds optional q/k normalization and RoPE on top of the standard diffusers
    AttnProcessor2_0, and supports a combined (self + encoder) attention mask
    for cross-attention.
    """

    def __init__(self):
        # F.scaled_dot_product_attention only exists on PyTorch >= 2.0.
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError(
                "AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
            )

    def apply_rotary_emb(
        self,
        x: torch.Tensor,
        freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
        to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
        reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
        tensors contain rotary embeddings and are returned as real tensors.

        Args:
            x (`torch.Tensor`):
                Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
            freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
        """
        cos, sin = freqs_cis  # [S, D]
        # Add broadcast dims for batch and heads: [S, D] -> [1, 1, S, D].
        cos = cos[None, None]
        sin = sin[None, None]
        cos, sin = cos.to(x.device), sin.to(x.device)

        # Treat consecutive pairs of the last dim as (real, imag) components.
        x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
        x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
        # Rotation is done in float32 for stability, then cast back.
        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)

        return out

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
        rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
        *args,
        **kwargs,
    ) -> torch.Tensor:

        residual = hidden_states
        input_ndim = hidden_states.ndim

        # Flatten spatial 4-D inputs [B, C, H, W] -> [B, H*W, C].
        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(
                batch_size, channel, height * width
            ).transpose(1, 2)

        # sequence_length is the KEY sequence length (context length for
        # cross-attention), as expected by prepare_attention_mask below.
        batch_size, sequence_length, _ = (
            hidden_states.shape
            if encoder_hidden_states is None
            else encoder_hidden_states.shape
        )

        has_encoder_hidden_state_proj = (
            hasattr(attn, "add_q_proj")
            and hasattr(attn, "add_k_proj")
            and hasattr(attn, "add_v_proj")
        )

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
                1, 2
            )

        query = attn.to_q(hidden_states)

        # Self-attention falls back to the sample stream as key/value source.
        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(
                encoder_hidden_states
            )

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        # Split heads: [B, S, H*D] -> [B, H, S, D].
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        # Apply RoPE if needed
        if rotary_freqs_cis is not None:
            query = self.apply_rotary_emb(query, rotary_freqs_cis)
            if not attn.is_cross_attention:
                key = self.apply_rotary_emb(key, rotary_freqs_cis)
            elif rotary_freqs_cis_cross is not None and has_encoder_hidden_state_proj:
                # Cross-attention keys use the context stream's frequencies.
                key = self.apply_rotary_emb(key, rotary_freqs_cis_cross)

        if (
            attn.is_cross_attention
            and encoder_attention_mask is not None
            and has_encoder_hidden_state_proj
        ):
            # attention_mask: N x S1
            # encoder_attention_mask: N x S2
            # For cross-attention, combine attention_mask and
            # encoder_attention_mask into a single additive mask.
            # NOTE(review): this branch indexes attention_mask, so it assumes
            # attention_mask is not None whenever encoder_attention_mask is
            # provided — confirm against callers.
            combined_mask = (
                attention_mask[:, :, None] * encoder_attention_mask[:, None, :]
            )
            # Allowed pairs get 0 bias; disallowed pairs get -inf.
            attention_mask = torch.where(combined_mask == 1, 0.0, -torch.inf)
            attention_mask = (
                attention_mask[:, None, :, :]
                .expand(-1, attn.heads, -1, -1)
                .to(query.dtype)
            )

        elif not attn.is_cross_attention and attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(
                attention_mask, sequence_length, batch_size
            )
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(
                batch_size, attn.heads, -1, attention_mask.shape[-1]
            )

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        # TODO: add support for attn.scale when we move to Torch 2.1
        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        # Merge heads back: [B, H, S, D] -> [B, S, H*D].
        hidden_states = hidden_states.transpose(1, 2).reshape(
            batch_size, -1, attn.heads * head_dim
        )
        hidden_states = hidden_states.to(query.dtype)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        # Restore spatial 4-D shape if the input was 4-D.
        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(
                batch_size, channel, height, width
            )

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states
model/ldm/dpm_solver_pytorch.py ADDED
@@ -0,0 +1,1307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import math
4
+
5
+
6
class NoiseScheduleVP:
    """Wrapper for the forward VP-type SDE noise schedule.

    Supports two parameterizations:

    * ``schedule='discrete'`` — a discrete-time DPM described by either
      ``betas`` or ``alphas_cumprod`` (the DDPM ``alpha_bar`` array). Discrete
      steps n = 0..N-1 are mapped to continuous times t_n = (n + 1) / N, and
      log(alpha_t) is obtained by piecewise-linear interpolation.
    * ``schedule='linear'`` — the continuous-time linear VPSDE with endpoints
      ``continuous_beta_0`` and ``continuous_beta_1``.

    Exposed quantities, all as functions of continuous time t in [0, T]:
    ``marginal_log_mean_coeff`` (log alpha_t), ``marginal_alpha`` (alpha_t),
    ``marginal_std`` (sigma_t), ``marginal_lambda`` (the half-logSNR
    lambda_t = log alpha_t - log sigma_t) and its inverse ``inverse_lambda``.

    Example:
        >>> ns = NoiseScheduleVP('discrete', betas=betas)
        >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod)
        >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.)

    Raises:
        ValueError: if ``schedule`` is neither 'discrete' nor 'linear'.
    """

    def __init__(
        self,
        schedule='discrete',
        betas=None,
        alphas_cumprod=None,
        continuous_beta_0=0.1,
        continuous_beta_1=20.,
        dtype=torch.float32,
    ):
        if schedule not in ['discrete', 'linear']:
            raise ValueError("Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear'".format(schedule))

        self.schedule = schedule
        self.T = 1.
        if schedule == 'discrete':
            # DDPM notation: alpha_bar_n = prod(1 - beta_i), and
            # log(alpha_{t_n}) = 0.5 * log(alpha_bar_n).
            if betas is not None:
                log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0)
            else:
                assert alphas_cumprod is not None
                log_alphas = 0.5 * torch.log(alphas_cumprod)
            # Clip the near-t=T tail where the logSNR is numerically unstable.
            clipped = self.numerical_clip_alpha(log_alphas)
            self.log_alpha_array = clipped.reshape((1, -1,)).to(dtype=dtype)
            self.total_N = self.log_alpha_array.shape[1]
            # Discrete step n maps to continuous time t_n = (n + 1) / N.
            self.t_array = torch.linspace(0., 1., self.total_N + 1)[1:].reshape((1, -1)).to(dtype=dtype)
        else:
            self.total_N = 1000
            self.beta_0 = continuous_beta_0
            self.beta_1 = continuous_beta_1

    def numerical_clip_alpha(self, log_alphas, clipped_lambda=-5.1):
        """Drop trailing entries whose half-logSNR falls below ``clipped_lambda``.

        Some beta schedules (e.g. cosine, as used by i-DDPM, guided-diffusion
        and GLIDE) have numerically unstable logSNR near t = T; clipping at
        -5.1 keeps the interpolation stable.
        """
        log_sigmas = 0.5 * torch.log(1. - torch.exp(2. * log_alphas))
        half_logsnr = log_alphas - log_sigmas
        # half_logsnr is decreasing in t, so flip it to search ascending.
        idx = torch.searchsorted(torch.flip(half_logsnr, [0]), clipped_lambda)
        if idx > 0:
            log_alphas = log_alphas[:-idx]
        return log_alphas

    def marginal_log_mean_coeff(self, t):
        """Return log(alpha_t) for continuous time t in [0, T]."""
        if self.schedule == 'linear':
            return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
        # 'discrete': piecewise-linear interpolation of the log-alpha table.
        return interpolate_fn(
            t.reshape((-1, 1)),
            self.t_array.to(t.device),
            self.log_alpha_array.to(t.device),
        ).reshape((-1))

    def marginal_alpha(self, t):
        """Return alpha_t for continuous time t in [0, T]."""
        return torch.exp(self.marginal_log_mean_coeff(t))

    def marginal_std(self, t):
        """Return sigma_t = sqrt(1 - alpha_t^2) for continuous time t in [0, T]."""
        return torch.sqrt(1. - torch.exp(2. * self.marginal_log_mean_coeff(t)))

    def marginal_lambda(self, t):
        """Return the half-logSNR lambda_t = log(alpha_t) - log(sigma_t)."""
        log_alpha_t = self.marginal_log_mean_coeff(t)
        log_sigma_t = 0.5 * torch.log(1. - torch.exp(2. * log_alpha_t))
        return log_alpha_t - log_sigma_t

    def inverse_lambda(self, lamb):
        """Return the time t in [0, T] whose half-logSNR equals ``lamb``."""
        if self.schedule == 'linear':
            # Solve the quadratic in t given by marginal_lambda(t) = lamb.
            tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
            Delta = self.beta_0**2 + tmp
            return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
        # 'discrete': invert via interpolation of the flipped tables.
        log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb)
        t = interpolate_fn(
            log_alpha.reshape((-1, 1)),
            torch.flip(self.log_alpha_array.to(lamb.device), [1]),
            torch.flip(self.t_array.to(lamb.device), [1]),
        )
        return t.reshape((-1,))
168
+
169
+
170
def model_wrapper(
    model,
    noise_schedule,
    model_type="noise",
    model_kwargs=None,
    guidance_type="uncond",
    condition=None,
    unconditional_condition=None,
    guidance_scale=1.,
    classifier_fn=None,
    classifier_kwargs=None,
):
    """Wrap ``model`` into a continuous-time noise prediction function for DPM-Solver.

    DPM-Solver solves continuous-time diffusion ODEs, so models trained on
    discrete time labels are wrapped to accept continuous time in [1/N, 1]
    (mapped internally to [0, 1000 * (N - 1) / N]); continuous-time models
    receive the time unchanged.

    Args:
        model: the diffusion model, called as
            ``model(noisy_target_latent=x, timesteps=t, [context=cond,] **model_kwargs)``
            and returning noise / x_start / v / score per ``model_type``.
        noise_schedule: a noise schedule object such as ``NoiseScheduleVP``.
        model_type: parameterization of ``model``:
            "noise" (epsilon prediction), "x_start" (data prediction),
            "v" (velocity prediction, Salimans & Ho 2022), or
            "score" (marginal score; noise = -sigma_t * score).
        model_kwargs: extra keyword arguments forwarded to ``model``
            (default: no extras).
        guidance_type: "uncond", "classifier" (Dhariwal & Nichol 2021, needs
            ``classifier_fn``), or "classifier-free" (Ho & Salimans 2022).
        condition: conditioning tensor for guided sampling.
        unconditional_condition: negative/unconditional conditioning for
            classifier-free guidance.
        guidance_scale: guidance strength; 1. disables classifier-free mixing.
        classifier_fn: classifier called as
            ``classifier_fn(x, t_input, condition, **classifier_kwargs)``;
            only used for "classifier" guidance.
        classifier_kwargs: extra keyword arguments for ``classifier_fn``.

    Returns:
        ``model_fn(x, t_continuous) -> noise`` suitable for ``DPM_Solver``.

    Raises:
        AssertionError: if ``model_type`` or ``guidance_type`` is invalid.
    """
    # Validate configuration before building the closures so misuse fails at
    # wrap time with a clear location (the original checked after the defs).
    assert model_type in ["noise", "x_start", "v", "score"]
    assert guidance_type in ["uncond", "classifier", "classifier-free"]

    # Avoid mutable default arguments: a shared module-level dict would leak
    # state between calls to model_wrapper.
    model_kwargs = {} if model_kwargs is None else model_kwargs
    classifier_kwargs = {} if classifier_kwargs is None else classifier_kwargs

    def get_model_input_time(t_continuous):
        """
        Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time.
        For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N].
        For continuous-time DPMs, we just use `t_continuous`.
        """
        if noise_schedule.schedule == 'discrete':
            return (t_continuous - 1. / noise_schedule.total_N) * 1000.
        else:
            return t_continuous

    def noise_pred_fn(x, t_continuous, cond=None):
        """Call the model and convert its output to a noise prediction."""
        t_input = get_model_input_time(t_continuous)
        if cond is None:
            # For EditingUNet: (noisy_target_latent, source_latent, context, timesteps)
            output = model(noisy_target_latent=x, timesteps=t_input, **model_kwargs)
        else:
            # For EditingUNet with condition: (noisy_target_latent, source_latent, context, timesteps)
            output = model(noisy_target_latent=x, context=cond, timesteps=t_input, **model_kwargs)
        if model_type == "noise":
            return output
        elif model_type == "x_start":
            # epsilon = (x - alpha_t * x0) / sigma_t
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            return (x - expand_dims(alpha_t, x.dim()) * output) / expand_dims(sigma_t, x.dim())
        elif model_type == "v":
            # epsilon = alpha_t * v + sigma_t * x
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            return expand_dims(alpha_t, x.dim()) * output + expand_dims(sigma_t, x.dim()) * x
        elif model_type == "score":
            # epsilon = -sigma_t * score
            sigma_t = noise_schedule.marginal_std(t_continuous)
            return -expand_dims(sigma_t, x.dim()) * output

    def cond_grad_fn(x, t_input):
        """
        Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t).
        """
        with torch.enable_grad():
            x_in = x.detach().requires_grad_(True)
            log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs)
            return torch.autograd.grad(log_prob.sum(), x_in)[0]

    def model_fn(x, t_continuous):
        """
        The noise prediction model function that is used for DPM-Solver.
        """
        if guidance_type == "uncond":
            return noise_pred_fn(x, t_continuous)
        elif guidance_type == "classifier":
            assert classifier_fn is not None
            t_input = get_model_input_time(t_continuous)
            cond_grad = cond_grad_fn(x, t_input)
            sigma_t = noise_schedule.marginal_std(t_continuous)
            noise = noise_pred_fn(x, t_continuous)
            return noise - guidance_scale * expand_dims(sigma_t, x.dim()) * cond_grad
        elif guidance_type == "classifier-free":
            if guidance_scale == 1. or unconditional_condition is None:
                return noise_pred_fn(x, t_continuous, cond=condition)
            else:
                # Batch the unconditional and conditional passes together,
                # then mix: eps_uncond + s * (eps_cond - eps_uncond).
                x_in = torch.cat([x] * 2)
                t_in = torch.cat([t_continuous] * 2)
                c_in = torch.cat([unconditional_condition, condition])
                noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2)
                return noise_uncond + guidance_scale * (noise - noise_uncond)

    return model_fn
337
+
338
+
339
+ class DPM_Solver:
340
+ def __init__(
341
+ self,
342
+ model_fn,
343
+ noise_schedule,
344
+ algorithm_type="dpmsolver++",
345
+ correcting_x0_fn=None,
346
+ correcting_xt_fn=None,
347
+ thresholding_max_val=1.,
348
+ dynamic_thresholding_ratio=0.995,
349
+ ):
350
+ """Construct a DPM-Solver.
351
+
352
+ We support both DPM-Solver (`algorithm_type="dpmsolver"`) and DPM-Solver++ (`algorithm_type="dpmsolver++"`).
353
+
354
+ We also support the "dynamic thresholding" method in Imagen[1]. For pixel-space diffusion models, you
355
+ can set both `algorithm_type="dpmsolver++"` and `correcting_x0_fn="dynamic_thresholding"` to use the
356
+ dynamic thresholding. The "dynamic thresholding" can greatly improve the sample quality for pixel-space
357
+ DPMs with large guidance scales. Note that the thresholding method is **unsuitable** for latent-space
358
+ DPMs (such as stable-diffusion).
359
+
360
+ To support advanced algorithms in image-to-image applications, we also support corrector functions for
361
+ both x0 and xt.
362
+
363
+ Args:
364
+ model_fn: A noise prediction model function which accepts the continuous-time input (t in [epsilon, T]):
365
+ ``
366
+ def model_fn(x, t_continuous):
367
+ return noise
368
+ ``
369
+ The shape of `x` is `(batch_size, **shape)`, and the shape of `t_continuous` is `(batch_size,)`.
370
+ noise_schedule: A noise schedule object, such as NoiseScheduleVP.
371
+ algorithm_type: A `str`. Either "dpmsolver" or "dpmsolver++".
372
+ correcting_x0_fn: A `str` or a function with the following format:
373
+ ```
374
+ def correcting_x0_fn(x0, t):
375
+ x0_new = ...
376
+ return x0_new
377
+ ```
378
+ This function is to correct the outputs of the data prediction model at each sampling step. e.g.,
379
+ ```
380
+ x0_pred = data_pred_model(xt, t)
381
+ if correcting_x0_fn is not None:
382
+ x0_pred = correcting_x0_fn(x0_pred, t)
383
+ xt_1 = update(x0_pred, xt, t)
384
+ ```
385
+ If `correcting_x0_fn="dynamic_thresholding"`, we use the dynamic thresholding proposed in Imagen[1].
386
+ correcting_xt_fn: A function with the following format:
387
+ ```
388
+ def correcting_xt_fn(xt, t, step):
389
+ x_new = ...
390
+ return x_new
391
+ ```
392
+ This function is to correct the intermediate samples xt at each sampling step. e.g.,
393
+ ```
394
+ xt = ...
395
+ xt = correcting_xt_fn(xt, t, step)
396
+ ```
397
+ thresholding_max_val: A `float`. The max value for thresholding.
398
+ Valid only when use `dpmsolver++` and `correcting_x0_fn="dynamic_thresholding"`.
399
+ dynamic_thresholding_ratio: A `float`. The ratio for dynamic thresholding (see Imagen[1] for details).
400
+ Valid only when use `dpmsolver++` and `correcting_x0_fn="dynamic_thresholding"`.
401
+
402
+ [1] Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour,
403
+ Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al. Photorealistic text-to-image diffusion models
404
+ with deep language understanding. arXiv preprint arXiv:2205.11487, 2022b.
405
+ """
406
+ self.model = lambda x, t: model_fn(x, t.expand((x.shape[0])))
407
+ self.noise_schedule = noise_schedule
408
+ assert algorithm_type in ["dpmsolver", "dpmsolver++"]
409
+ self.algorithm_type = algorithm_type
410
+ if correcting_x0_fn == "dynamic_thresholding":
411
+ self.correcting_x0_fn = self.dynamic_thresholding_fn
412
+ else:
413
+ self.correcting_x0_fn = correcting_x0_fn
414
+ self.correcting_xt_fn = correcting_xt_fn
415
+ self.dynamic_thresholding_ratio = dynamic_thresholding_ratio
416
+ self.thresholding_max_val = thresholding_max_val
417
+
418
+ def dynamic_thresholding_fn(self, x0, t):
419
+ """
420
+ The dynamic thresholding method.
421
+ """
422
+ dims = x0.dim()
423
+ p = self.dynamic_thresholding_ratio
424
+ s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1)
425
+ s = expand_dims(torch.maximum(s, self.thresholding_max_val * torch.ones_like(s).to(s.device)), dims)
426
+ x0 = torch.clamp(x0, -s, s) / s
427
+ return x0
428
+
429
+ def noise_prediction_fn(self, x, t):
430
+ """
431
+ Return the noise prediction model.
432
+ """
433
+ return self.model(x, t)
434
+
435
+ def data_prediction_fn(self, x, t):
436
+ """
437
+ Return the data prediction model (with corrector).
438
+ """
439
+ noise = self.noise_prediction_fn(x, t)
440
+ alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
441
+ x0 = (x - sigma_t * noise) / alpha_t
442
+ if self.correcting_x0_fn is not None:
443
+ x0 = self.correcting_x0_fn(x0, t)
444
+ return x0
445
+
446
+ def model_fn(self, x, t):
447
+ """
448
+ Convert the model to the noise prediction model or the data prediction model.
449
+ """
450
+ if self.algorithm_type == "dpmsolver++":
451
+ return self.data_prediction_fn(x, t)
452
+ else:
453
+ return self.noise_prediction_fn(x, t)
454
+
455
+ def get_time_steps(self, skip_type, t_T, t_0, N, device):
456
+ """Compute the intermediate time steps for sampling.
457
+
458
+ Args:
459
+ skip_type: A `str`. The type for the spacing of the time steps. We support three types:
460
+ - 'logSNR': uniform logSNR for the time steps.
461
+ - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolutional data**.)
462
+ - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.)
463
+ t_T: A `float`. The starting time of the sampling (default is T).
464
+ t_0: A `float`. The ending time of the sampling (default is epsilon).
465
+ N: A `int`. The total number of the spacing of the time steps.
466
+ device: A torch device.
467
+ Returns:
468
+ A pytorch tensor of the time steps, with the shape (N + 1,).
469
+ """
470
+ if skip_type == 'logSNR':
471
+ lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device))
472
+ lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device))
473
+ logSNR_steps = torch.linspace(lambda_T.cpu().item(), lambda_0.cpu().item(), N + 1).to(device)
474
+ return self.noise_schedule.inverse_lambda(logSNR_steps)
475
+ elif skip_type == 'time_uniform':
476
+ return torch.linspace(t_T, t_0, N + 1).to(device)
477
+ elif skip_type == 'time_quadratic':
478
+ t_order = 2
479
+ t = torch.linspace(t_T**(1. / t_order), t_0**(1. / t_order), N + 1).pow(t_order).to(device)
480
+ return t
481
+ else:
482
+ raise ValueError("Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type))
483
+
484
+ def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device):
485
+ """
486
+ Get the order of each step for sampling by the singlestep DPM-Solver.
487
+
488
+ We combine both DPM-Solver-1,2,3 to use all the function evaluations, which is named as "DPM-Solver-fast".
489
+ Given a fixed number of function evaluations by `steps`, the sampling procedure by DPM-Solver-fast is:
490
+ - If order == 1:
491
+ We take `steps` of DPM-Solver-1 (i.e. DDIM).
492
+ - If order == 2:
493
+ - Denote K = (steps // 2). We take K or (K + 1) intermediate time steps for sampling.
494
+ - If steps % 2 == 0, we use K steps of DPM-Solver-2.
495
+ - If steps % 2 == 1, we use K steps of DPM-Solver-2 and 1 step of DPM-Solver-1.
496
+ - If order == 3:
497
+ - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling.
498
+ - If steps % 3 == 0, we use (K - 2) steps of DPM-Solver-3, and 1 step of DPM-Solver-2 and 1 step of DPM-Solver-1.
499
+ - If steps % 3 == 1, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-1.
500
+ - If steps % 3 == 2, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-2.
501
+
502
+ ============================================
503
+ Args:
504
+ order: A `int`. The max order for the solver (2 or 3).
505
+ steps: A `int`. The total number of function evaluations (NFE).
506
+ skip_type: A `str`. The type for the spacing of the time steps. We support three types:
507
+ - 'logSNR': uniform logSNR for the time steps.
508
+ - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolutional data**.)
509
+ - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.)
510
+ t_T: A `float`. The starting time of the sampling (default is T).
511
+ t_0: A `float`. The ending time of the sampling (default is epsilon).
512
+ device: A torch device.
513
+ Returns:
514
+ orders: A list of the solver order of each step.
515
+ """
516
+ if order == 3:
517
+ K = steps // 3 + 1
518
+ if steps % 3 == 0:
519
+ orders = [3,] * (K - 2) + [2, 1]
520
+ elif steps % 3 == 1:
521
+ orders = [3,] * (K - 1) + [1]
522
+ else:
523
+ orders = [3,] * (K - 1) + [2]
524
+ elif order == 2:
525
+ if steps % 2 == 0:
526
+ K = steps // 2
527
+ orders = [2,] * K
528
+ else:
529
+ K = steps // 2 + 1
530
+ orders = [2,] * (K - 1) + [1]
531
+ elif order == 1:
532
+ K = steps
533
+ orders = [1,] * steps
534
+ else:
535
+ raise ValueError("'order' must be '1' or '2' or '3'.")
536
+ if skip_type == 'logSNR':
537
+ # To reproduce the results in DPM-Solver paper
538
+ timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device)
539
+ else:
540
+ timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[torch.cumsum(torch.tensor([0,] + orders), 0).to(device)]
541
+ return timesteps_outer, orders
542
+
543
+ def denoise_to_zero_fn(self, x, s):
544
+ """
545
+ Denoise at the final step, which is equivalent to solve the ODE from lambda_s to infty by first-order discretization.
546
+ """
547
+ return self.data_prediction_fn(x, s)
548
+
549
+ def dpm_solver_first_update(self, x, s, t, model_s=None, return_intermediate=False):
550
+ """
551
+ DPM-Solver-1 (equivalent to DDIM) from time `s` to time `t`.
552
+
553
+ Args:
554
+ x: A pytorch tensor. The initial value at time `s`.
555
+ s: A pytorch tensor. The starting time, with the shape (1,).
556
+ t: A pytorch tensor. The ending time, with the shape (1,).
557
+ model_s: A pytorch tensor. The model function evaluated at time `s`.
558
+ If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
559
+ return_intermediate: A `bool`. If true, also return the model value at time `s`.
560
+ Returns:
561
+ x_t: A pytorch tensor. The approximated solution at time `t`.
562
+ """
563
+ ns = self.noise_schedule
564
+ dims = x.dim()
565
+ lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
566
+ h = lambda_t - lambda_s
567
+ log_alpha_s, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(t)
568
+ sigma_s, sigma_t = ns.marginal_std(s), ns.marginal_std(t)
569
+ alpha_t = torch.exp(log_alpha_t)
570
+
571
+ if self.algorithm_type == "dpmsolver++":
572
+ phi_1 = torch.expm1(-h)
573
+ if model_s is None:
574
+ model_s = self.model_fn(x, s)
575
+ x_t = (
576
+ sigma_t / sigma_s * x
577
+ - alpha_t * phi_1 * model_s
578
+ )
579
+ if return_intermediate:
580
+ return x_t, {'model_s': model_s}
581
+ else:
582
+ return x_t
583
+ else:
584
+ phi_1 = torch.expm1(h)
585
+ if model_s is None:
586
+ model_s = self.model_fn(x, s)
587
+ x_t = (
588
+ torch.exp(log_alpha_t - log_alpha_s) * x
589
+ - (sigma_t * phi_1) * model_s
590
+ )
591
+ if return_intermediate:
592
+ return x_t, {'model_s': model_s}
593
+ else:
594
+ return x_t
595
+
596
+ def singlestep_dpm_solver_second_update(self, x, s, t, r1=0.5, model_s=None, return_intermediate=False, solver_type='dpmsolver'):
597
+ """
598
+ Singlestep solver DPM-Solver-2 from time `s` to time `t`.
599
+
600
+ Args:
601
+ x: A pytorch tensor. The initial value at time `s`.
602
+ s: A pytorch tensor. The starting time, with the shape (1,).
603
+ t: A pytorch tensor. The ending time, with the shape (1,).
604
+ r1: A `float`. The hyperparameter of the second-order solver.
605
+ model_s: A pytorch tensor. The model function evaluated at time `s`.
606
+ If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
607
+ return_intermediate: A `bool`. If true, also return the model value at time `s` and `s1` (the intermediate time).
608
+ solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
609
+ The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
610
+ Returns:
611
+ x_t: A pytorch tensor. The approximated solution at time `t`.
612
+ """
613
+ if solver_type not in ['dpmsolver', 'taylor']:
614
+ raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type))
615
+ if r1 is None:
616
+ r1 = 0.5
617
+ ns = self.noise_schedule
618
+ lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
619
+ h = lambda_t - lambda_s
620
+ lambda_s1 = lambda_s + r1 * h
621
+ s1 = ns.inverse_lambda(lambda_s1)
622
+ log_alpha_s, log_alpha_s1, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(s1), ns.marginal_log_mean_coeff(t)
623
+ sigma_s, sigma_s1, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(t)
624
+ alpha_s1, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_t)
625
+
626
+ if self.algorithm_type == "dpmsolver++":
627
+ phi_11 = torch.expm1(-r1 * h)
628
+ phi_1 = torch.expm1(-h)
629
+
630
+ if model_s is None:
631
+ model_s = self.model_fn(x, s)
632
+ x_s1 = (
633
+ (sigma_s1 / sigma_s) * x
634
+ - (alpha_s1 * phi_11) * model_s
635
+ )
636
+ model_s1 = self.model_fn(x_s1, s1)
637
+ if solver_type == 'dpmsolver':
638
+ x_t = (
639
+ (sigma_t / sigma_s) * x
640
+ - (alpha_t * phi_1) * model_s
641
+ - (0.5 / r1) * (alpha_t * phi_1) * (model_s1 - model_s)
642
+ )
643
+ elif solver_type == 'taylor':
644
+ x_t = (
645
+ (sigma_t / sigma_s) * x
646
+ - (alpha_t * phi_1) * model_s
647
+ + (1. / r1) * (alpha_t * (phi_1 / h + 1.)) * (model_s1 - model_s)
648
+ )
649
+ else:
650
+ phi_11 = torch.expm1(r1 * h)
651
+ phi_1 = torch.expm1(h)
652
+
653
+ if model_s is None:
654
+ model_s = self.model_fn(x, s)
655
+ x_s1 = (
656
+ torch.exp(log_alpha_s1 - log_alpha_s) * x
657
+ - (sigma_s1 * phi_11) * model_s
658
+ )
659
+ model_s1 = self.model_fn(x_s1, s1)
660
+ if solver_type == 'dpmsolver':
661
+ x_t = (
662
+ torch.exp(log_alpha_t - log_alpha_s) * x
663
+ - (sigma_t * phi_1) * model_s
664
+ - (0.5 / r1) * (sigma_t * phi_1) * (model_s1 - model_s)
665
+ )
666
+ elif solver_type == 'taylor':
667
+ x_t = (
668
+ torch.exp(log_alpha_t - log_alpha_s) * x
669
+ - (sigma_t * phi_1) * model_s
670
+ - (1. / r1) * (sigma_t * (phi_1 / h - 1.)) * (model_s1 - model_s)
671
+ )
672
+ if return_intermediate:
673
+ return x_t, {'model_s': model_s, 'model_s1': model_s1}
674
+ else:
675
+ return x_t
676
+
677
+ def singlestep_dpm_solver_third_update(self, x, s, t, r1=1./3., r2=2./3., model_s=None, model_s1=None, return_intermediate=False, solver_type='dpmsolver'):
678
+ """
679
+ Singlestep solver DPM-Solver-3 from time `s` to time `t`.
680
+
681
+ Args:
682
+ x: A pytorch tensor. The initial value at time `s`.
683
+ s: A pytorch tensor. The starting time, with the shape (1,).
684
+ t: A pytorch tensor. The ending time, with the shape (1,).
685
+ r1: A `float`. The hyperparameter of the third-order solver.
686
+ r2: A `float`. The hyperparameter of the third-order solver.
687
+ model_s: A pytorch tensor. The model function evaluated at time `s`.
688
+ If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
689
+ model_s1: A pytorch tensor. The model function evaluated at time `s1` (the intermediate time given by `r1`).
690
+ If `model_s1` is None, we evaluate the model at `s1`; otherwise we directly use it.
691
+ return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times).
692
+ solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
693
+ The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
694
+ Returns:
695
+ x_t: A pytorch tensor. The approximated solution at time `t`.
696
+ """
697
+ if solver_type not in ['dpmsolver', 'taylor']:
698
+ raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type))
699
+ if r1 is None:
700
+ r1 = 1. / 3.
701
+ if r2 is None:
702
+ r2 = 2. / 3.
703
+ ns = self.noise_schedule
704
+ lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
705
+ h = lambda_t - lambda_s
706
+ lambda_s1 = lambda_s + r1 * h
707
+ lambda_s2 = lambda_s + r2 * h
708
+ s1 = ns.inverse_lambda(lambda_s1)
709
+ s2 = ns.inverse_lambda(lambda_s2)
710
+ log_alpha_s, log_alpha_s1, log_alpha_s2, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(s1), ns.marginal_log_mean_coeff(s2), ns.marginal_log_mean_coeff(t)
711
+ sigma_s, sigma_s1, sigma_s2, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(s2), ns.marginal_std(t)
712
+ alpha_s1, alpha_s2, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_s2), torch.exp(log_alpha_t)
713
+
714
+ if self.algorithm_type == "dpmsolver++":
715
+ phi_11 = torch.expm1(-r1 * h)
716
+ phi_12 = torch.expm1(-r2 * h)
717
+ phi_1 = torch.expm1(-h)
718
+ phi_22 = torch.expm1(-r2 * h) / (r2 * h) + 1.
719
+ phi_2 = phi_1 / h + 1.
720
+ phi_3 = phi_2 / h - 0.5
721
+
722
+ if model_s is None:
723
+ model_s = self.model_fn(x, s)
724
+ if model_s1 is None:
725
+ x_s1 = (
726
+ (sigma_s1 / sigma_s) * x
727
+ - (alpha_s1 * phi_11) * model_s
728
+ )
729
+ model_s1 = self.model_fn(x_s1, s1)
730
+ x_s2 = (
731
+ (sigma_s2 / sigma_s) * x
732
+ - (alpha_s2 * phi_12) * model_s
733
+ + r2 / r1 * (alpha_s2 * phi_22) * (model_s1 - model_s)
734
+ )
735
+ model_s2 = self.model_fn(x_s2, s2)
736
+ if solver_type == 'dpmsolver':
737
+ x_t = (
738
+ (sigma_t / sigma_s) * x
739
+ - (alpha_t * phi_1) * model_s
740
+ + (1. / r2) * (alpha_t * phi_2) * (model_s2 - model_s)
741
+ )
742
+ elif solver_type == 'taylor':
743
+ D1_0 = (1. / r1) * (model_s1 - model_s)
744
+ D1_1 = (1. / r2) * (model_s2 - model_s)
745
+ D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
746
+ D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
747
+ x_t = (
748
+ (sigma_t / sigma_s) * x
749
+ - (alpha_t * phi_1) * model_s
750
+ + (alpha_t * phi_2) * D1
751
+ - (alpha_t * phi_3) * D2
752
+ )
753
+ else:
754
+ phi_11 = torch.expm1(r1 * h)
755
+ phi_12 = torch.expm1(r2 * h)
756
+ phi_1 = torch.expm1(h)
757
+ phi_22 = torch.expm1(r2 * h) / (r2 * h) - 1.
758
+ phi_2 = phi_1 / h - 1.
759
+ phi_3 = phi_2 / h - 0.5
760
+
761
+ if model_s is None:
762
+ model_s = self.model_fn(x, s)
763
+ if model_s1 is None:
764
+ x_s1 = (
765
+ (torch.exp(log_alpha_s1 - log_alpha_s)) * x
766
+ - (sigma_s1 * phi_11) * model_s
767
+ )
768
+ model_s1 = self.model_fn(x_s1, s1)
769
+ x_s2 = (
770
+ (torch.exp(log_alpha_s2 - log_alpha_s)) * x
771
+ - (sigma_s2 * phi_12) * model_s
772
+ - r2 / r1 * (sigma_s2 * phi_22) * (model_s1 - model_s)
773
+ )
774
+ model_s2 = self.model_fn(x_s2, s2)
775
+ if solver_type == 'dpmsolver':
776
+ x_t = (
777
+ (torch.exp(log_alpha_t - log_alpha_s)) * x
778
+ - (sigma_t * phi_1) * model_s
779
+ - (1. / r2) * (sigma_t * phi_2) * (model_s2 - model_s)
780
+ )
781
+ elif solver_type == 'taylor':
782
+ D1_0 = (1. / r1) * (model_s1 - model_s)
783
+ D1_1 = (1. / r2) * (model_s2 - model_s)
784
+ D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
785
+ D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
786
+ x_t = (
787
+ (torch.exp(log_alpha_t - log_alpha_s)) * x
788
+ - (sigma_t * phi_1) * model_s
789
+ - (sigma_t * phi_2) * D1
790
+ - (sigma_t * phi_3) * D2
791
+ )
792
+
793
+ if return_intermediate:
794
+ return x_t, {'model_s': model_s, 'model_s1': model_s1, 'model_s2': model_s2}
795
+ else:
796
+ return x_t
797
+
798
+ def multistep_dpm_solver_second_update(self, x, model_prev_list, t_prev_list, t, solver_type="dpmsolver"):
799
+ """
800
+ Multistep solver DPM-Solver-2 from time `t_prev_list[-1]` to time `t`.
801
+
802
+ Args:
803
+ x: A pytorch tensor. The initial value at time `s`.
804
+ model_prev_list: A list of pytorch tensor. The previous computed model values.
805
+ t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
806
+ t: A pytorch tensor. The ending time, with the shape (1,).
807
+ solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
808
+ The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
809
+ Returns:
810
+ x_t: A pytorch tensor. The approximated solution at time `t`.
811
+ """
812
+ if solver_type not in ['dpmsolver', 'taylor']:
813
+ raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type))
814
+ ns = self.noise_schedule
815
+ model_prev_1, model_prev_0 = model_prev_list[-2], model_prev_list[-1]
816
+ t_prev_1, t_prev_0 = t_prev_list[-2], t_prev_list[-1]
817
+ lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_1), ns.marginal_lambda(t_prev_0), ns.marginal_lambda(t)
818
+ log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
819
+ sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
820
+ alpha_t = torch.exp(log_alpha_t)
821
+
822
+ h_0 = lambda_prev_0 - lambda_prev_1
823
+ h = lambda_t - lambda_prev_0
824
+ r0 = h_0 / h
825
+ D1_0 = (1. / r0) * (model_prev_0 - model_prev_1)
826
+ if self.algorithm_type == "dpmsolver++":
827
+ phi_1 = torch.expm1(-h)
828
+ if solver_type == 'dpmsolver':
829
+ x_t = (
830
+ (sigma_t / sigma_prev_0) * x
831
+ - (alpha_t * phi_1) * model_prev_0
832
+ - 0.5 * (alpha_t * phi_1) * D1_0
833
+ )
834
+ elif solver_type == 'taylor':
835
+ x_t = (
836
+ (sigma_t / sigma_prev_0) * x
837
+ - (alpha_t * phi_1) * model_prev_0
838
+ + (alpha_t * (phi_1 / h + 1.)) * D1_0
839
+ )
840
+ else:
841
+ phi_1 = torch.expm1(h)
842
+ if solver_type == 'dpmsolver':
843
+ x_t = (
844
+ (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
845
+ - (sigma_t * phi_1) * model_prev_0
846
+ - 0.5 * (sigma_t * phi_1) * D1_0
847
+ )
848
+ elif solver_type == 'taylor':
849
+ x_t = (
850
+ (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
851
+ - (sigma_t * phi_1) * model_prev_0
852
+ - (sigma_t * (phi_1 / h - 1.)) * D1_0
853
+ )
854
+ return x_t
855
+
856
+ def multistep_dpm_solver_third_update(self, x, model_prev_list, t_prev_list, t, solver_type='dpmsolver'):
857
+ """
858
+ Multistep solver DPM-Solver-3 from time `t_prev_list[-1]` to time `t`.
859
+
860
+ Args:
861
+ x: A pytorch tensor. The initial value at time `s`.
862
+ model_prev_list: A list of pytorch tensor. The previous computed model values.
863
+ t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
864
+ t: A pytorch tensor. The ending time, with the shape (1,).
865
+ solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
866
+ The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
867
+ Returns:
868
+ x_t: A pytorch tensor. The approximated solution at time `t`.
869
+ """
870
+ ns = self.noise_schedule
871
+ model_prev_2, model_prev_1, model_prev_0 = model_prev_list
872
+ t_prev_2, t_prev_1, t_prev_0 = t_prev_list
873
+ lambda_prev_2, lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_2), ns.marginal_lambda(t_prev_1), ns.marginal_lambda(t_prev_0), ns.marginal_lambda(t)
874
+ log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
875
+ sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
876
+ alpha_t = torch.exp(log_alpha_t)
877
+
878
+ h_1 = lambda_prev_1 - lambda_prev_2
879
+ h_0 = lambda_prev_0 - lambda_prev_1
880
+ h = lambda_t - lambda_prev_0
881
+ r0, r1 = h_0 / h, h_1 / h
882
+ D1_0 = (1. / r0) * (model_prev_0 - model_prev_1)
883
+ D1_1 = (1. / r1) * (model_prev_1 - model_prev_2)
884
+ D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
885
+ D2 = (1. / (r0 + r1)) * (D1_0 - D1_1)
886
+ if self.algorithm_type == "dpmsolver++":
887
+ phi_1 = torch.expm1(-h)
888
+ phi_2 = phi_1 / h + 1.
889
+ phi_3 = phi_2 / h - 0.5
890
+ x_t = (
891
+ (sigma_t / sigma_prev_0) * x
892
+ - (alpha_t * phi_1) * model_prev_0
893
+ + (alpha_t * phi_2) * D1
894
+ - (alpha_t * phi_3) * D2
895
+ )
896
+ else:
897
+ phi_1 = torch.expm1(h)
898
+ phi_2 = phi_1 / h - 1.
899
+ phi_3 = phi_2 / h - 0.5
900
+ x_t = (
901
+ (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
902
+ - (sigma_t * phi_1) * model_prev_0
903
+ - (sigma_t * phi_2) * D1
904
+ - (sigma_t * phi_3) * D2
905
+ )
906
+ return x_t
907
+
908
+ def singlestep_dpm_solver_update(self, x, s, t, order, return_intermediate=False, solver_type='dpmsolver', r1=None, r2=None):
909
+ """
910
+ Singlestep DPM-Solver with the order `order` from time `s` to time `t`.
911
+
912
+ Args:
913
+ x: A pytorch tensor. The initial value at time `s`.
914
+ s: A pytorch tensor. The starting time, with the shape (1,).
915
+ t: A pytorch tensor. The ending time, with the shape (1,).
916
+ order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
917
+ return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times).
918
+ solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
919
+ The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
920
+ r1: A `float`. The hyperparameter of the second-order or third-order solver.
921
+ r2: A `float`. The hyperparameter of the third-order solver.
922
+ Returns:
923
+ x_t: A pytorch tensor. The approximated solution at time `t`.
924
+ """
925
+ if order == 1:
926
+ return self.dpm_solver_first_update(x, s, t, return_intermediate=return_intermediate)
927
+ elif order == 2:
928
+ return self.singlestep_dpm_solver_second_update(x, s, t, return_intermediate=return_intermediate, solver_type=solver_type, r1=r1)
929
+ elif order == 3:
930
+ return self.singlestep_dpm_solver_third_update(x, s, t, return_intermediate=return_intermediate, solver_type=solver_type, r1=r1, r2=r2)
931
+ else:
932
+ raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))
933
+
934
+ def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, solver_type='dpmsolver'):
935
+ """
936
+ Multistep DPM-Solver with the order `order` from time `t_prev_list[-1]` to time `t`.
937
+
938
+ Args:
939
+ x: A pytorch tensor. The initial value at time `s`.
940
+ model_prev_list: A list of pytorch tensor. The previous computed model values.
941
+ t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
942
+ t: A pytorch tensor. The ending time, with the shape (1,).
943
+ order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
944
+ solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
945
+ The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
946
+ Returns:
947
+ x_t: A pytorch tensor. The approximated solution at time `t`.
948
+ """
949
+ if order == 1:
950
+ return self.dpm_solver_first_update(x, t_prev_list[-1], t, model_s=model_prev_list[-1])
951
+ elif order == 2:
952
+ return self.multistep_dpm_solver_second_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type)
953
+ elif order == 3:
954
+ return self.multistep_dpm_solver_third_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type)
955
+ else:
956
+ raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))
957
+
958
+ def dpm_solver_adaptive(self, x, order, t_T, t_0, h_init=0.05, atol=0.0078, rtol=0.05, theta=0.9, t_err=1e-5, solver_type='dpmsolver'):
959
+ """
960
+ The adaptive step size solver based on singlestep DPM-Solver.
961
+
962
+ Args:
963
+ x: A pytorch tensor. The initial value at time `t_T`.
964
+ order: A `int`. The (higher) order of the solver. We only support order == 2 or 3.
965
+ t_T: A `float`. The starting time of the sampling (default is T).
966
+ t_0: A `float`. The ending time of the sampling (default is epsilon).
967
+ h_init: A `float`. The initial step size (for logSNR).
968
+ atol: A `float`. The absolute tolerance of the solver. For image data, the default setting is 0.0078, followed [1].
969
+ rtol: A `float`. The relative tolerance of the solver. The default setting is 0.05.
970
+ theta: A `float`. The safety hyperparameter for adapting the step size. The default setting is 0.9, followed [1].
971
+ t_err: A `float`. The tolerance for the time. We solve the diffusion ODE until the absolute error between the
972
+ current time and `t_0` is less than `t_err`. The default setting is 1e-5.
973
+ solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
974
+ The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
975
+ Returns:
976
+ x_0: A pytorch tensor. The approximated solution at time `t_0`.
977
+
978
+ [1] A. Jolicoeur-Martineau, K. Li, R. Piché-Taillefer, T. Kachman, and I. Mitliagkas, "Gotta go fast when generating data with score-based models," arXiv preprint arXiv:2105.14080, 2021.
979
+ """
980
+ ns = self.noise_schedule
981
+ s = t_T * torch.ones((1,)).to(x)
982
+ lambda_s = ns.marginal_lambda(s)
983
+ lambda_0 = ns.marginal_lambda(t_0 * torch.ones_like(s).to(x))
984
+ h = h_init * torch.ones_like(s).to(x)
985
+ x_prev = x
986
+ nfe = 0
987
+ if order == 2:
988
+ r1 = 0.5
989
+ lower_update = lambda x, s, t: self.dpm_solver_first_update(x, s, t, return_intermediate=True)
990
+ higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, solver_type=solver_type, **kwargs)
991
+ elif order == 3:
992
+ r1, r2 = 1. / 3., 2. / 3.
993
+ lower_update = lambda x, s, t: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, return_intermediate=True, solver_type=solver_type)
994
+ higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs)
995
+ else:
996
+ raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order))
997
+ while torch.abs((s - t_0)).mean() > t_err:
998
+ t = ns.inverse_lambda(lambda_s + h)
999
+ x_lower, lower_noise_kwargs = lower_update(x, s, t)
1000
+ x_higher = higher_update(x, s, t, **lower_noise_kwargs)
1001
+ delta = torch.max(torch.ones_like(x).to(x) * atol, rtol * torch.max(torch.abs(x_lower), torch.abs(x_prev)))
1002
+ norm_fn = lambda v: torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True))
1003
+ E = norm_fn((x_higher - x_lower) / delta).max()
1004
+ if torch.all(E <= 1.):
1005
+ x = x_higher
1006
+ s = t
1007
+ x_prev = x_lower
1008
+ lambda_s = ns.marginal_lambda(s)
1009
+ h = torch.min(theta * h * torch.float_power(E, -1. / order).float(), lambda_0 - lambda_s)
1010
+ nfe += order
1011
+ print('adaptive solver nfe', nfe)
1012
+ return x
1013
+
1014
+ def add_noise(self, x, t, noise=None):
1015
+ """
1016
+ Compute the noised input xt = alpha_t * x + sigma_t * noise.
1017
+
1018
+ Args:
1019
+ x: A `torch.Tensor` with shape `(batch_size, *shape)`.
1020
+ t: A `torch.Tensor` with shape `(t_size,)`.
1021
+ Returns:
1022
+ xt with shape `(t_size, batch_size, *shape)`.
1023
+ """
1024
+ alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
1025
+ if noise is None:
1026
+ noise = torch.randn((t.shape[0], *x.shape), device=x.device)
1027
+ x = x.reshape((-1, *x.shape))
1028
+ xt = expand_dims(alpha_t, x.dim()) * x + expand_dims(sigma_t, x.dim()) * noise
1029
+ if t.shape[0] == 1:
1030
+ return xt.squeeze(0)
1031
+ else:
1032
+ return xt
1033
+
1034
+ def inverse(self, x, steps=20, t_start=None, t_end=None, order=2, skip_type='time_uniform',
1035
+ method='multistep', lower_order_final=True, denoise_to_zero=False, solver_type='dpmsolver',
1036
+ atol=0.0078, rtol=0.05, return_intermediate=False,
1037
+ ):
1038
+ """
1039
+ Inverse the sample `x` from time `t_start` to `t_end` by DPM-Solver.
1040
+ For discrete-time DPMs, we use `t_start=1/N`, where `N` is the total time steps during training.
1041
+ """
1042
+ t_0 = 1. / self.noise_schedule.total_N if t_start is None else t_start
1043
+ t_T = self.noise_schedule.T if t_end is None else t_end
1044
+ assert t_0 > 0 and t_T > 0, "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
1045
+ return self.sample(x, steps=steps, t_start=t_0, t_end=t_T, order=order, skip_type=skip_type,
1046
+ method=method, lower_order_final=lower_order_final, denoise_to_zero=denoise_to_zero, solver_type=solver_type,
1047
+ atol=atol, rtol=rtol, return_intermediate=return_intermediate)
1048
+
1049
def sample(self, x, steps=20, t_start=None, t_end=None, order=2, skip_type='time_uniform',
    method='multistep', lower_order_final=True, denoise_to_zero=False, solver_type='dpmsolver',
    atol=0.0078, rtol=0.05, return_intermediate=False,
):
    """
    Compute the sample at time `t_end` by DPM-Solver, given the initial `x` at time `t_start`.

    Supported `method` values (for both noise-prediction and data-prediction models):
    - 'singlestep': singlestep DPM-Solver ("DPM-Solver-fast" in the paper). All
      singlestep solvers with order <= `order` are combined so the total number of
      function evaluations (NFE) == `steps`. E.g. for `order` == 3, with
      K = steps // 3 + 1: steps % 3 == 0 uses (K - 2) DPM-Solver-3 steps plus one
      order-2 and one order-1 step; steps % 3 == 1 uses (K - 1) order-3 steps plus
      one order-1 step; steps % 3 == 2 uses (K - 1) order-3 steps plus one order-2 step.
    - 'multistep': multistep DPM-Solver of the given `order`; NFE == `steps`. The
      first `order` values are initialized by lower-order multistep solvers.
    - 'singlestep_fixed': fixed-order singlestep DPM-Solver
      (DPM-Solver-1 / -2 / -3); NFE == (steps // order) * order.
    - 'adaptive': adaptive step-size DPM-Solver ("DPM-Solver-12"/"-23" in the
      paper); `steps` is ignored and the cost/quality trade-off is controlled by
      `atol` and `rtol`.

    Advice: for unconditional sampling or guided sampling with a small guidance
    scale, use 'singlestep' with `order` == 3 (DPM-Solver or DPM-Solver++); for
    guided sampling with a large guidance scale, use 'multistep' with
    `algorithm_type="dpmsolver++"` and `order` == 2.

    Supported `skip_type` values:
    - 'logSNR': uniform logSNR spacing (recommended for low-resolution images).
    - 'time_uniform': uniform time spacing (recommended for high-resolution images).
    - 'time_quadratic': quadratic time spacing.

    Args:
        x: A pytorch tensor. The initial value at time `t_start` (e.g. a sample
            from the standard normal distribution when `t_start` == T).
        steps: An `int`. The total number of function evaluations (NFE).
        t_start: A `float`. Defaults to self.noise_schedule.T (usually 1.0).
        t_end: A `float`. Defaults to 1. / self.noise_schedule.total_N (e.g. 1e-3
            when total_N == 1000, the recommended value for discrete-time DPMs).
            For continuous-time DPMs, 1e-3 is recommended when `steps` <= 15 and
            1e-4 when `steps` > 15.
        order: An `int`. The order of DPM-Solver.
        skip_type: A `str`. Time-step spacing; see above.
        method: A `str`. Sampling method; see above.
        denoise_to_zero: A `bool`. Whether to denoise to time 0 at the final step
            (total NFE becomes `steps` + 1). This trick (from DDPM,
            https://arxiv.org/abs/2006.11239, and score_sde,
            https://arxiv.org/abs/2011.13456) can improve FID for low-resolution
            images but is unnecessary (and costs one extra NFE) at high resolution.
        lower_order_final: A `bool`. Whether to use lower-order solvers at the
            final steps. Only valid for `method` == 'multistep' and `steps` < 15;
            empirically the key to stable sampling with very few steps
            (especially steps <= 10), so `True` is recommended.
        solver_type: A `str`. Taylor expansion type, 'dpmsolver' (recommended)
            or 'taylor'.
        atol: A `float`. Absolute tolerance; valid when `method` == 'adaptive'.
        rtol: A `float`. Relative tolerance; valid when `method` == 'adaptive'.
        return_intermediate: A `bool`. When True, also return the list of xt at
            each step; when False, return only x0.
    Returns:
        x_end: A pytorch tensor. The approximated solution at time `t_end`
        (a tuple (x_end, intermediates) when `return_intermediate` is True).
    """
    # NOTE: t_T is the (larger) start time and t_0 the (smaller) end time.
    t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
    t_T = self.noise_schedule.T if t_start is None else t_start
    assert t_0 > 0 and t_T > 0, "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
    if return_intermediate:
        assert method in ['multistep', 'singlestep', 'singlestep_fixed'], "Cannot use adaptive solver when saving intermediate values"
    if self.correcting_xt_fn is not None:
        assert method in ['multistep', 'singlestep', 'singlestep_fixed'], "Cannot use adaptive solver when correcting_xt_fn is not None"
    device = x.device
    intermediates = []
    with torch.no_grad():
        if method == 'adaptive':
            x = self.dpm_solver_adaptive(x, order=order, t_T=t_T, t_0=t_0, atol=atol, rtol=rtol, solver_type=solver_type)
        elif method == 'multistep':
            assert steps >= order
            timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
            assert timesteps.shape[0] - 1 == steps
            # Init the initial values.
            step = 0
            t = timesteps[step]
            t_prev_list = [t]
            model_prev_list = [self.model_fn(x, t)]
            if self.correcting_xt_fn is not None:
                x = self.correcting_xt_fn(x, t, step)
            if return_intermediate:
                intermediates.append(x)
            # Init the first `order` values by lower order multistep DPM-Solver.
            for step in range(1, order):
                t = timesteps[step]
                x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, t, step, solver_type=solver_type)
                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step)
                if return_intermediate:
                    intermediates.append(x)
                t_prev_list.append(t)
                model_prev_list.append(self.model_fn(x, t))
            # Compute the remaining values by `order`-th order multistep DPM-Solver.
            for step in range(order, steps + 1):
                t = timesteps[step]
                # We only use lower order for steps < 10
                if lower_order_final and steps < 10:
                    step_order = min(order, steps + 1 - step)
                else:
                    step_order = order
                x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, t, step_order, solver_type=solver_type)
                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step)
                if return_intermediate:
                    intermediates.append(x)
                # Shift the rolling history of previous times / model outputs.
                for i in range(order - 1):
                    t_prev_list[i] = t_prev_list[i + 1]
                    model_prev_list[i] = model_prev_list[i + 1]
                t_prev_list[-1] = t
                # We do not need to evaluate the final model value.
                if step < steps:
                    model_prev_list[-1] = self.model_fn(x, t)
        elif method in ['singlestep', 'singlestep_fixed']:
            if method == 'singlestep':
                timesteps_outer, orders = self.get_orders_and_timesteps_for_singlestep_solver(steps=steps, order=order, skip_type=skip_type, t_T=t_T, t_0=t_0, device=device)
            elif method == 'singlestep_fixed':
                K = steps // order
                orders = [order,] * K
                timesteps_outer = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=K, device=device)
            for step, order in enumerate(orders):
                s, t = timesteps_outer[step], timesteps_outer[step + 1]
                # Intermediate ratios r1/r2 locate the inner evaluation points in logSNR space.
                timesteps_inner = self.get_time_steps(skip_type=skip_type, t_T=s.item(), t_0=t.item(), N=order, device=device)
                lambda_inner = self.noise_schedule.marginal_lambda(timesteps_inner)
                h = lambda_inner[-1] - lambda_inner[0]
                r1 = None if order <= 1 else (lambda_inner[1] - lambda_inner[0]) / h
                r2 = None if order <= 2 else (lambda_inner[2] - lambda_inner[0]) / h
                x = self.singlestep_dpm_solver_update(x, s, t, order, solver_type=solver_type, r1=r1, r2=r2)
                if self.correcting_xt_fn is not None:
                    x = self.correcting_xt_fn(x, t, step)
                if return_intermediate:
                    intermediates.append(x)
        else:
            raise ValueError("Got wrong method {}".format(method))
        if denoise_to_zero:
            t = torch.ones((1,)).to(device) * t_0
            x = self.denoise_to_zero_fn(x, t)
            if self.correcting_xt_fn is not None:
                x = self.correcting_xt_fn(x, t, step + 1)
            if return_intermediate:
                intermediates.append(x)
        if return_intermediate:
            return x, intermediates
        else:
            return x
1248
+
1249
+
1250
+
1251
+ #############################################################
1252
+ # other utility functions
1253
+ #############################################################
1254
+
1255
def interpolate_fn(x, xp, yp):
    """
    Differentiable piecewise-linear function y = f(x) defined by keypoints (xp, yp).

    f is defined on the whole x-axis: outside the range of xp, the outermost
    segment is extended linearly. The implementation avoids data-dependent
    branching so it stays autograd-friendly.

    Args:
        x: PyTorch tensor with shape [N, C] (N = batch size; C = 1 for DPM-Solver).
        xp: PyTorch tensor with shape [C, K], the keypoint x-coordinates.
        yp: PyTorch tensor with shape [C, K], the keypoint y-coordinates.
    Returns:
        The function values f(x), with shape [N, C].
    """
    batch, num_kp = x.shape[0], xp.shape[1]
    # Sort each query together with its keypoints so the query's rank tells us
    # which segment it falls in.
    combined = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((batch, 1, 1))], dim=2)
    sorted_combined, sort_order = torch.sort(combined, dim=2)
    pos_of_x = torch.argmin(sort_order, dim=2)
    left_candidate = pos_of_x - 1
    # Clamp to the outermost segment when x falls before the first / after the
    # last keypoint (linear extrapolation).
    lower_idx = torch.where(
        torch.eq(pos_of_x, 0),
        torch.tensor(1, device=x.device),
        torch.where(
            torch.eq(pos_of_x, num_kp), torch.tensor(num_kp - 2, device=x.device), left_candidate,
        ),
    )
    # Skip over x itself (which sits between the two bracketing keypoints in the
    # sorted order) when picking the upper endpoint.
    upper_idx = torch.where(torch.eq(lower_idx, left_candidate), lower_idx + 2, lower_idx + 1)
    x_lo = torch.gather(sorted_combined, dim=2, index=lower_idx.unsqueeze(2)).squeeze(2)
    x_hi = torch.gather(sorted_combined, dim=2, index=upper_idx.unsqueeze(2)).squeeze(2)
    # Index of the segment's left keypoint in the *original* (unsorted) xp/yp.
    seg_idx = torch.where(
        torch.eq(pos_of_x, 0),
        torch.tensor(0, device=x.device),
        torch.where(
            torch.eq(pos_of_x, num_kp), torch.tensor(num_kp - 2, device=x.device), left_candidate,
        ),
    )
    yp_expanded = yp.unsqueeze(0).expand(batch, -1, -1)
    y_lo = torch.gather(yp_expanded, dim=2, index=seg_idx.unsqueeze(2)).squeeze(2)
    y_hi = torch.gather(yp_expanded, dim=2, index=(seg_idx + 1).unsqueeze(2)).squeeze(2)
    # Standard linear interpolation on the selected segment.
    return y_lo + (x - x_lo) * (y_hi - y_lo) / (x_hi - x_lo)
1295
+
1296
+
1297
def expand_dims(v, dims):
    """
    Append trailing singleton axes to `v` until it has `dims` total dimensions.

    Args:
        `v`: a PyTorch tensor with shape [N].
        `dims`: an `int`, the desired total number of dimensions.
    Returns:
        a PyTorch tensor with shape [N, 1, 1, ..., 1] (total dimension `dims`),
        suitable for broadcasting against a batch of samples.
    """
    trailing = (1,) * (dims - 1)
    return v.reshape(*v.shape, *trailing)
model/ldm/editing_unet.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from .audioldm import UNetModel
5
+
6
class EditingUNet(nn.Module):
    """UNet wrapper for latent audio editing.

    Doubles the configured input channels so the noisy target latent and the
    clean source latent can be concatenated along the channel axis, and
    optionally bounds the output with Hardtanh for flow-matching training.
    """

    def __init__(self, unet_config, use_flow_matching=True, velocity_bound=4.0):
        super().__init__()
        base_channels = unet_config.in_channels
        cfg = dict(unet_config)
        # The UNet consumes [noisy_target ; source] stacked on channels.
        cfg['in_channels'] = base_channels * 2
        self.unet = UNetModel(**cfg)
        self.original_in_channels = base_channels

        self.use_flow_matching = use_flow_matching
        if not self.use_flow_matching:
            self.final_activation = None
            print("✅ EditingUNet configured for standard DDPM noise prediction.")
        else:
            # A bounded activation on the predicted velocity keeps training
            # stable and encodes the valid prior that velocities are finite.
            self.final_activation = nn.Hardtanh(min_val=-velocity_bound, max_val=velocity_bound)
            print(f"✅ EditingUNet configured with Hardtanh(bound={velocity_bound}) for stable Flow Matching.")

    def forward(self, noisy_target_latent, source_latent, context, timesteps, **kwargs):
        """Predict noise/velocity for `noisy_target_latent` conditioned on
        `source_latent`, `context` and `timesteps`.

        Raises:
            ValueError: if the batch sizes differ and the target batch is not
                exactly twice the source batch (the CFG duplication case).
        """
        target_batch = noisy_target_latent.shape[0]
        if target_batch != source_latent.shape[0]:
            # Classifier-free guidance doubles the target batch
            # (unconditional + conditional); mirror the source latent to match.
            if target_batch != 2 * source_latent.shape[0]:
                raise ValueError(f"Batch size mismatch: noisy_target_latent={noisy_target_latent.shape[0]}, source_latent={source_latent.shape[0]}")
            source_latent = source_latent.repeat(2, 1, 1, 1)

        # No dtype casting here — the trainer owns precision handling.
        stacked = torch.cat([noisy_target_latent, source_latent], dim=1)
        prediction = self.unet(
            x=stacked,
            timesteps=timesteps,
            context=context,
        )
        return prediction if self.final_activation is None else self.final_activation(prediction)
model/ldm/exp_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "unet": {
4
+ "image_size": 32,
5
+ "in_channels": 8,
6
+ "out_channels": 8,
7
+ "model_channels": 256,
8
+ "attention_resolutions": [4, 2, 1],
9
+ "num_res_blocks": 2,
10
+ "channel_mult": [1, 2, 4, 4],
11
+ "num_heads": 8,
12
+ "use_spatial_transformer": true,
13
+ "transformer_depth": 2,
14
+ "context_dim": 768,
15
+ "use_checkpoint": true,
16
+ "legacy": false
17
+ }
18
+
19
+ }
20
+ }
model/ldm/linear_attention_block.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ from typing import Optional
4
+ import math
5
+
6
+ # --- These are your existing, correct components ---
7
+ from diffusers.models.attention_processor import AttnProcessor2_0
8
+ from .customer_attention_processor import Attention, CustomLiteLAProcessor2_0
9
+ from diffusers.models.normalization import RMSNorm
10
+ from .attention import GLUMBConv # Using GLUMBConv from your attention.py
11
+ #from diffusers.models.attention_processor import FusedAttnProcessor2_0
12
class EditingTransformerBlock(nn.Module):
    """Transformer block for audio editing.

    Applies, in order: self-attention over the audio token sequence (using the
    linear-attention processor), cross-attention from audio tokens to the text
    embedding (using standard scaled-dot-product attention), and a GLUMBConv
    feed-forward. Each sub-layer is modulated by AdaLN-single scale/shift
    terms derived from the timestep embedding `temb`.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        text_embed_dim: int,
        mlp_ratio: float = 4.0,
        use_adaln_single: bool = True,
    ):
        super().__init__()
        self.use_adaln_single = use_adaln_single
        inner_dim = num_attention_heads * attention_head_dim

        # --- 1. Self-attention over the (concatenated) audio sequence ---
        # Uses the linear attention processor for efficiency on long sequences.
        self.norm_self = RMSNorm(dim, eps=1e-6)
        self.attn_self = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            out_dim=inner_dim,
            processor=CustomLiteLAProcessor2_0(),
        )

        # --- 2. Cross-attention: audio attends to the text embedding ---
        # Standard (softmax) attention is used here; it is typically more
        # stable and matters more for text alignment than linear attention.
        self.norm_cross = RMSNorm(dim, eps=1e-6)
        self.attn_cross = Attention(
            query_dim=dim,
            cross_attention_dim=text_embed_dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            out_dim=inner_dim,
            processor=AttnProcessor2_0(),
        )

        # --- 3. Feed-forward ---
        self.norm_ff = RMSNorm(dim, eps=1e-6)
        self.ff = GLUMBConv(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
        )

        # --- 4. AdaLN-single conditioning ---
        # Six rows: (shift, scale) for self-attn, cross-attn and feed-forward.
        if use_adaln_single:
            self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: Optional[torch.FloatTensor],
        temb: Optional[torch.FloatTensor],
        use_checkpointing: bool = False,
    ) -> torch.FloatTensor:
        """Run the block.

        Args:
            hidden_states: audio token sequence, shape [B, L, dim].
            encoder_hidden_states: text embedding for cross-attention, or None
                to skip the cross-attention sub-layer entirely.
            temb: timestep embedding [B, dim] for AdaLN modulation, or None.
            use_checkpointing: accepted for interface compatibility; not used
                by this implementation.
        Returns:
            The transformed sequence, same shape as `hidden_states`.
        """
        # AdaLN setup: each sub-layer gets its own (shift, scale) pair.
        if self.use_adaln_single and temb is not None:
            shift_self, scale_self, shift_cross, scale_cross, shift_ff, scale_ff = (
                (self.scale_shift_table[None] + temb[:, None, :]).chunk(6, dim=1)
            )
        else:
            # BUGFIX: the previous fallback used scale=1.0 / shift=0.0, which
            # with the `x * (1 + scale) + shift` modulation below doubled every
            # normalized activation. All-zero values make the modulation an
            # exact identity when no timestep conditioning is available.
            shift_self = scale_self = shift_cross = scale_cross = shift_ff = scale_ff = 0.0

        # --- 1. Self-attention (linear attention) ---
        residual = hidden_states
        norm_h = self.norm_self(hidden_states)
        norm_h = norm_h * (1 + scale_self) + shift_self
        # CustomLiteLAProcessor2_0 returns a (output, aux) tuple.
        attn_output, _ = self.attn_self(norm_h)
        hidden_states = attn_output + residual

        # --- 2. Cross-attention (standard attention) ---
        if encoder_hidden_states is not None:
            residual = hidden_states
            norm_h = self.norm_cross(hidden_states)
            norm_h = norm_h * (1 + scale_cross) + shift_cross
            # Cross-attention also returns a (output, attention_weights) tuple.
            attn_output, _ = self.attn_cross(
                hidden_states=norm_h,
                encoder_hidden_states=encoder_hidden_states,
            )
            hidden_states = attn_output + residual

        # --- 3. Feed-forward ---
        residual = hidden_states
        norm_h = self.norm_ff(hidden_states)
        norm_h = norm_h * (1 + scale_ff) + shift_ff
        ff_output = self.ff(norm_h)
        hidden_states = ff_output + residual

        return hidden_states
117
+
118
+
119
+
120
+
121
+ # class EditingTransformerBlock(nn.Module):
122
+ # """
123
+ # A CORRECTED, fully linear attention transformer block for editing tasks.
124
+ # It combines self-attention and cross-attention into a single, EFFICIENT
125
+ # linear self-attention operation on a concatenated sequence.
126
+ # """
127
+ # def __init__(
128
+ # self,
129
+ # dim,
130
+ # num_attention_heads,
131
+ # attention_head_dim,
132
+ # mlp_ratio=4.0,
133
+ # use_adaln_single=True,
134
+ # ):
135
+ # super().__init__()
136
+ # self.use_adaln_single = use_adaln_single
137
+ # self.norm1 = RMSNorm(dim, elementwise_affine=False, eps=1e-6)
138
+
139
+ # # THE CRITICAL FIX: We use ONE attention block, initialized
140
+ # # with the LINEAR attention processor.
141
+ # self.attn = Attention(
142
+ # query_dim=dim,
143
+ # heads=num_attention_heads,
144
+ # dim_head=attention_head_dim,
145
+ # out_dim=dim,
146
+ # bias=True,
147
+ # processor=CustomLiteLAProcessor2_0(), # <--- THIS IS THE FIX
148
+ # )
149
+
150
+ # self.norm2 = RMSNorm(dim, elementwise_affine=False, eps=1e-6)
151
+ # self.ff = GLUMBConv(
152
+ # in_features=dim,
153
+ # hidden_features=int(dim * mlp_ratio),
154
+ # use_bias=(True, True, False),
155
+ # norm=(None, None, None),
156
+ # act=("silu", "silu", None),
157
+ # )
158
+
159
+ # if use_adaln_single:
160
+ # # This is simpler than the original 6-way split if we apply it once
161
+ # self.scale_shift_table = nn.Parameter(torch.randn(4, dim) / dim**0.5)
162
+
163
+ # def forward(
164
+ # self,
165
+ # hidden_states: torch.FloatTensor,
166
+ # encoder_hidden_states: Optional[torch.FloatTensor] = None,
167
+ # temb: Optional[torch.FloatTensor] = None,
168
+ # use_checkpointing: bool = False,
169
+ # ):
170
+ # hidden_states_len = hidden_states.shape[1]
171
+ # N = hidden_states.shape[0]
172
+ # # AdaLN-Single conditioning
173
+ # if self.use_adaln_single and temb is not None:
174
+ # shift_msa, scale_msa, shift_mlp, scale_mlp = (
175
+ # (self.scale_shift_table[None] + temb[:, None, :])
176
+ # .chunk(4, dim=1)
177
+ # )
178
+
179
+ # norm_hidden_states = self.norm1(hidden_states)
180
+ # if self.use_adaln_single and temb is not None:
181
+ # norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
182
+
183
+ # # --- UNIFIED ATTENTION LOGIC ---
184
+ # # The CustomLiteLAProcessor2_0 will treat this as one long sequence
185
+ # # for its Q, K, V projections. This is where self- and cross-attention merge.
186
+ # attn_input = torch.cat([norm_hidden_states, encoder_hidden_states], dim=1)
187
+
188
+ # # Define the forward pass for checkpointing
189
+ # def attn_forward(x):
190
+ # attn_output, _ = self.attn(hidden_states=x)
191
+ # return attn_output
192
+
193
+ # if use_checkpointing:
194
+ # attn_output_combined = torch.utils.checkpoint.checkpoint(attn_forward, attn_input, use_reentrant=False)
195
+ # else:
196
+ # attn_output_combined, _ = self.attn(hidden_states=attn_input)
197
+
198
+ # # Slice the output to get only the processed audio part
199
+ # attn_output = attn_output_combined[:, :hidden_states_len, :]
200
+ # # --- END UNIFIED ATTENTION ---
201
+
202
+ # hidden_states = hidden_states + attn_output
203
+
204
+ # # Feed-forward part
205
+ # norm_ff_states = self.norm2(hidden_states)
206
+ # if self.use_adaln_single and temb is not None:
207
+ # norm_ff_states = norm_ff_states * (1 + scale_mlp) + shift_mlp
208
+
209
+ # ff_output = self.ff(norm_ff_states)
210
+
211
+ # hidden_states = hidden_states + ff_output
212
+
213
+ # return hidden_states
214
+
215
class TimestepEmbedding(nn.Module):
    """Sinusoidal timestep embedding (Transformer/DDPM style).

    Maps a batch of scalar timesteps [B] to embeddings [B, dim] by pairing
    cosines and sines over a geometric frequency ladder; an odd `dim` is
    zero-padded by one column.
    """

    def __init__(self, dim, max_period=10000):
        super().__init__()
        self.dim = dim          # output embedding width
        self.max_period = max_period  # longest sinusoid period

    def forward(self, t):
        half_dim = self.dim // 2
        # Geometric frequency ladder: max_period^(-k/half_dim) for k in [0, half_dim).
        exponent = -math.log(self.max_period) * torch.arange(
            start=0, end=half_dim, dtype=torch.float32
        ) / half_dim
        freqs = torch.exp(exponent).to(device=t.device)
        angles = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)
        if self.dim % 2 == 1:
            # Pad a zero column so the output width is exactly `dim`.
            zero_pad = torch.zeros_like(embedding[:, :1])
            embedding = torch.cat([embedding, zero_pad], dim=-1)
        return embedding
model/ldm/transformer.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # file: model/ldm/transformer.py
2
+
3
+ import torch
4
+ from torch import nn
5
+ import math
6
+ from .linear_attention_block import EditingTransformerBlock
7
+ from diffusers.models.normalization import RMSNorm
8
+
9
class TimestepEmbedding(nn.Module):
    """Sinusoidal timestep embedding (Transformer/DDPM style).

    Produces [B, dim] embeddings from scalar timesteps [B]: cosines then sines
    over a geometric frequency ladder, with one zero column appended when
    `dim` is odd.
    """

    def __init__(self, dim, max_period=10000):
        super().__init__()
        self.dim = dim          # output embedding width
        self.max_period = max_period  # longest sinusoid period

    def forward(self, t):
        half_dim = self.dim // 2
        # Frequencies decay geometrically from 1 down to 1/max_period.
        exponent = -math.log(self.max_period) * torch.arange(
            start=0, end=half_dim, dtype=torch.float32
        ) / half_dim
        freqs = torch.exp(exponent).to(device=t.device)
        angles = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)
        if self.dim % 2 == 1:
            # Zero-pad the final column for odd embedding widths.
            zero_pad = torch.zeros_like(embedding[:, :1])
            embedding = torch.cat([embedding, zero_pad], dim=-1)
        return embedding
25
+
26
+
27
class EditingTransformer(nn.Module):
    """Latent-editing transformer.

    Projects both the noisy target latent and the clean source latent into a
    shared token space, concatenates them along the sequence axis, runs the
    joint sequence through a stack of EditingTransformerBlocks (text
    conditioning enters via cross-attention, the timestep via AdaLN), and
    projects the target half of the sequence back to latent channels. Long
    sequences are processed in overlapping, Hann-windowed chunks.
    """
    def __init__(
        self,
        num_layers=12,
        inner_dim=512,
        num_heads=8,
        attention_head_dim=64,
        dcae_latent_channels=8,
        text_embed_dim=768,
        mlp_ratio=4.0,
    ):
        super().__init__()
        self.inner_dim = inner_dim

        # proj_in is shared: it embeds both the noisy target latent and the
        # clean source latent into the token space.
        self.proj_in = nn.Linear(dcae_latent_channels, inner_dim)

        # Sinusoidal timestep embedding followed by a SiLU MLP.
        self.time_embed = TimestepEmbedding(inner_dim)
        self.time_mlp = nn.Sequential(
            nn.Linear(inner_dim, inner_dim * 4),
            nn.SiLU(),
            nn.Linear(inner_dim * 4, inner_dim),
        )

        # The blocks operate on the concatenated (target + source) sequence,
        # so their width stays inner_dim — only the sequence length doubles.
        self.transformer_blocks = nn.ModuleList([
            EditingTransformerBlock(
                dim=inner_dim,
                num_attention_heads=num_heads,
                attention_head_dim=attention_head_dim,
                text_embed_dim=text_embed_dim,
                mlp_ratio=mlp_ratio,
            ) for _ in range(num_layers)
        ])

        # Final output projection back to latent channels.
        self.norm_out = RMSNorm(inner_dim, eps=1e-6)
        self.proj_out = nn.Linear(inner_dim, dcae_latent_channels)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        # Xavier-uniform weights and zero biases for every Linear layer.
        if isinstance(module, nn.Linear):
            torch.nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

    def forward(self, noisy_target_latent, source_latent, encoder_hidden_states, timestep, use_checkpointing=False):
        """Predict the output latent via chunked overlap-add processing.

        Args:
            noisy_target_latent: latent tensor [B, C, H, W].
            source_latent: clean source latent, same shape as the target —
                the code flattens both to H*W tokens with the same count.
            encoder_hidden_states: text embedding passed to each block's
                cross-attention.
            timestep: scalar timesteps [B] for the AdaLN conditioning.
            use_checkpointing: forwarded to each block per chunk.
        Returns:
            Output latent of shape [B, dcae_latent_channels, H, W].
        """
        b, c, h, w = noisy_target_latent.shape
        num_target_tokens = h * w

        # 1. Flatten both latents to token sequences via the shared projection.
        hidden_states = self.proj_in(noisy_target_latent.permute(0, 2, 3, 1).reshape(b, num_target_tokens, c))
        source_states = self.proj_in(source_latent.permute(0, 2, 3, 1).reshape(b, num_target_tokens, c))

        # Concatenate along the sequence axis — the transformer analogue of
        # channel-concatenation conditioning in a U-Net.
        input_sequence = torch.cat([hidden_states, source_states], dim=1)
        full_seq_len = input_sequence.shape[1]

        # Timestep embedding.
        t_emb = self.time_mlp(self.time_embed(timestep).to(input_sequence.dtype))

        # --- CHUNKED PROCESSING ---

        # 2. Chunking parameters.
        CHUNK_SIZE = 1024
        OVERLAP = CHUNK_SIZE // 4

        # Accumulators for windowed overlap-add and its normalization.
        output_sequence = torch.zeros_like(input_sequence)
        overlap_count = torch.zeros_like(input_sequence)

        # Hann window to taper chunk edges before overlap-add.
        # NOTE(review): torch.hann_window is 0 at the window edges, so sequence
        # positions covered by only ONE chunk edge (e.g. the very first and
        # last tokens) accumulate ~0 weight and are divided by ~1e-8 below,
        # effectively zeroing them — confirm this is intended.
        # NOTE(review): if full_seq_len < CHUNK_SIZE, `processed_chunk * window`
        # mixes a length-(full_seq_len) chunk with a length-CHUNK_SIZE window
        # and will raise a shape error — presumably inputs always have at least
        # 1024 tokens; verify against callers.
        window = torch.hann_window(CHUNK_SIZE, device=input_sequence.device).view(1, -1, 1)

        # 3. Process each chunk through the full block stack.
        start = 0
        while start < full_seq_len:
            end = min(start + CHUNK_SIZE, full_seq_len)
            # If the final chunk would be short, slide it back so it still
            # spans a full CHUNK_SIZE (re-processing some overlap).
            if end - start < CHUNK_SIZE and start > 0:
                start = full_seq_len - CHUNK_SIZE
                end = full_seq_len

            current_chunk = input_sequence[:, start:end, :]

            # Run this chunk through every transformer block.
            processed_chunk = current_chunk
            for block in self.transformer_blocks:
                # use_checkpointing still applies per chunk.
                processed_chunk = block(
                    hidden_states=processed_chunk,
                    encoder_hidden_states=encoder_hidden_states,
                    temb=t_emb,
                    use_checkpointing=use_checkpointing
                )

            # 4. Windowed accumulation into the output buffers.
            output_sequence[:, start:end, :] += processed_chunk * window
            overlap_count[:, start:end, :] += window

            if end == full_seq_len:
                break
            start += (CHUNK_SIZE - OVERLAP)

        # 5. Normalize by the accumulated window weights (epsilon guards
        # against division by zero where coverage is ~0).
        final_processed_sequence = output_sequence / (overlap_count + 1e-8)

        # --- END CHUNKED PROCESSING ---

        # 6. Keep only the target half of the joint sequence.
        output_hidden_states = final_processed_sequence[:, :num_target_tokens, :]

        # 7. Project back to the latent space and restore [B, C, H, W] layout.
        output_hidden_states = self.norm_out(output_hidden_states)
        output_latent_flat = self.proj_out(output_hidden_states)
        output_latent = output_latent_flat.reshape(b, h, w, -1).permute(0, 3, 1, 2).contiguous()

        return output_latent
model/scheduler.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # file: model/scheduler.py
2
+ import torch
3
+ import torch.nn.functional as F
4
+
5
class LinearNoiseScheduler:
    """DDPM-style diffusion scheduler with a linear beta schedule.

    Precomputes the forward-process (noising) and reverse-process
    (denoising) coefficients for a fixed number of training timesteps and
    provides three samplers:

    * ``step``                 -- ancestral DDPM sampling,
    * ``ddim_step``            -- deterministic DDIM sampling (eta == 0),
    * ``dpm_solver_multistep`` -- a second-order DPM-Solver-style update.
    """

    def __init__(self, num_timesteps=1000, beta_start=0.0001, beta_end=0.02):
        """
        Args:
            num_timesteps: number of diffusion steps used at training time.
            beta_start: variance of the first (least noisy) step.
            beta_end: variance of the last (most noisy) step.
        """
        self.num_timesteps = num_timesteps

        # Linear beta schedule: beta_t grows linearly from beta_start to beta_end.
        self.betas = torch.linspace(beta_start, beta_end, num_timesteps)

        # alpha_t = 1 - beta_t and the cumulative product alpha_bar_t.
        self.alphas = 1.0 - self.betas
        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)

        # Coefficients for the forward (noising) process q(x_t | x_0).
        self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - self.alphas_cumprod)

        # Coefficients for the reverse (denoising) process.
        # alpha_bar_{t-1}, with alpha_bar_{-1} defined as 1 (left-pad).
        self.alphas_cumprod_prev = F.pad(self.alphas_cumprod[:-1], (1, 0), value=1.0)
        # Variance of the true posterior q(x_{t-1} | x_t, x_0); it is exactly 0 at t = 0.
        self.posterior_variance = self.betas * (1. - self.alphas_cumprod_prev) / (1. - self.alphas_cumprod)

        # Default inference schedule: every training timestep, in reverse order.
        self.timesteps = torch.arange(0, num_timesteps).flip(0)

    def set_timesteps(self, num_inference_steps, device=None):
        """Select the discrete timesteps used for the sampling loop.

        Produces ``num_inference_steps`` timesteps evenly spaced over
        ``[num_timesteps - 1, 0]`` in descending order.

        NOTE(review): ``torch.linspace`` with an integer dtype truncates the
        intermediate float values, so very large ``num_inference_steps``
        may produce duplicated timesteps -- confirm acceptable for callers.
        """
        device_to_use = device if device is not None else self.betas.device
        self.timesteps = torch.linspace(
            self.num_timesteps - 1, 0, num_inference_steps,
            dtype=torch.long, device=device_to_use,
        )

    def to(self, device):
        """Move every precomputed scheduler tensor to ``device``; returns self for chaining."""
        self.betas = self.betas.to(device)
        self.alphas = self.alphas.to(device)
        self.alphas_cumprod = self.alphas_cumprod.to(device)
        self.sqrt_alphas_cumprod = self.sqrt_alphas_cumprod.to(device)
        self.sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod.to(device)
        self.alphas_cumprod_prev = self.alphas_cumprod_prev.to(device)
        self.posterior_variance = self.posterior_variance.to(device)
        return self

    def add_noise(self, original_samples, noise, timesteps):
        """Forward process: x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps.

        Args:
            original_samples: clean samples x_0, assumed 4-D (B, C, H, W) --
                the coefficients are reshaped to (-1, 1, 1, 1) for broadcasting.
            noise: Gaussian noise with the same shape as ``original_samples``.
            timesteps: 1-D batch of per-sample timestep indices.
        """
        sqrt_alphas_cumprod_t = self.sqrt_alphas_cumprod.to(timesteps.device)[timesteps].view(-1, 1, 1, 1)
        sqrt_one_minus_alphas_cumprod_t = self.sqrt_one_minus_alphas_cumprod.to(timesteps.device)[timesteps].view(-1, 1, 1, 1)

        return sqrt_alphas_cumprod_t * original_samples + sqrt_one_minus_alphas_cumprod_t * noise

    def step(self, model_output, timestep, sample):
        """One ancestral DDPM sampling step: x_t -> x_{t-1}.

        Args:
            model_output: predicted noise eps_theta(x_t, t).
            timestep: current timestep t (int or 0-dim tensor).
            sample: current latent x_t.

        Returns:
            x_{t-1}, or the clamped x_0 prediction when t == 0.
        """
        t = timestep
        alpha_t = self.alphas[t]
        alpha_bar_t = self.alphas_cumprod[t]
        sqrt_one_minus_alpha_bar_t = self.sqrt_one_minus_alphas_cumprod[t]

        # Reconstruct x_0 from the noise prediction and clamp to the data range.
        pred_original_sample = (sample - sqrt_one_minus_alpha_bar_t * model_output) / torch.sqrt(alpha_bar_t)
        pred_original_sample = torch.clamp(pred_original_sample, -1., 1.)

        if t == 0:
            # Final step: the denoised estimate itself is the output.
            return pred_original_sample

        alpha_bar_t_prev = self.alphas_cumprod_prev[t]
        posterior_variance_t = self.posterior_variance[t]

        # Posterior mean of q(x_{t-1} | x_t, x_0):
        #   mu = sqrt(alpha_t) * (1 - alpha_bar_{t-1}) / (1 - alpha_bar_t) * x_t
        #      + sqrt(alpha_bar_{t-1}) * beta_t / (1 - alpha_bar_t)        * x_0
        pred_sample_direction = torch.sqrt(alpha_bar_t_prev) * self.betas[t] / (1. - alpha_bar_t)
        prev_sample_mean = torch.sqrt(alpha_t) * (1. - alpha_bar_t_prev) / (1. - alpha_bar_t) * sample + pred_sample_direction * pred_original_sample

        # t > 0 is guaranteed here (t == 0 returned above), so noise is always added.
        noise = torch.randn_like(model_output)
        return prev_sample_mean + torch.sqrt(posterior_variance_t) * noise

    def ddim_step(self, model_output, timestep, sample, eta=0.0, prev_timestep=None):
        """DDIM sampling step (Song et al.).

        ``eta == 0.0`` gives fully deterministic DDIM; ``eta == 1.0``
        recovers DDPM-like stochastic behavior. When ``prev_timestep`` is
        ``None`` this is treated as the final step and the clamped x_0
        prediction is returned directly.
        """
        if prev_timestep is None:
            # Final step: return the x_0 prediction.
            alpha_bar_t = self.alphas_cumprod[timestep]
            pred_original_sample = (sample - torch.sqrt(1 - alpha_bar_t) * model_output) / torch.sqrt(alpha_bar_t)
            pred_original_sample = torch.clamp(pred_original_sample, -1.0, 1.0)
            return pred_original_sample

        t = timestep
        prev_t = prev_timestep

        alpha_bar_t = self.alphas_cumprod[t]
        alpha_bar_prev = self.alphas_cumprod[prev_t]

        # 1. Predicted original sample x_0.
        pred_original_sample = (sample - torch.sqrt(1 - alpha_bar_t) * model_output) / torch.sqrt(alpha_bar_t)
        pred_original_sample = torch.clamp(pred_original_sample, -1.0, 1.0)

        # 2. Noise variance sigma_t (only effective when eta > 0).
        sigma_t = eta * torch.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar_t) * (1 - alpha_bar_t / alpha_bar_prev))

        # 3. "Direction pointing to x_t" term.
        pred_sample_direction = torch.sqrt(1 - alpha_bar_prev - sigma_t**2) * model_output

        # 4. Deterministic part of x_{t-1}.
        prev_sample = torch.sqrt(alpha_bar_prev) * pred_original_sample + pred_sample_direction

        # 5. Stochastic part (skipped entirely for pure DDIM).
        if eta > 0:
            noise = torch.randn_like(model_output)
            prev_sample = prev_sample + sigma_t * noise

        return prev_sample

    def dpm_solver_multistep(self, model_output, timestep, sample, order=2, prev_timestep=None, prev_model_output=None):
        """Second-order DPM-Solver-style update.

        Falls back to a first-order (DDIM-like) update when ``order == 1``
        or when no previous noise prediction is available. When
        ``prev_timestep`` is ``None`` this is the final step and the
        clamped x_0 prediction is returned.
        """
        if prev_timestep is None:
            # Final step: return the x_0 prediction.
            alpha_bar_t = self.alphas_cumprod[timestep]
            pred_original_sample = (sample - torch.sqrt(1 - alpha_bar_t) * model_output) / torch.sqrt(alpha_bar_t)
            return torch.clamp(pred_original_sample, -1.0, 1.0)

        t = timestep
        prev_t = prev_timestep

        alpha_bar_t = self.alphas_cumprod[t]
        # Guard against a negative index; alpha_bar_{-1} is defined as 1.
        alpha_bar_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.alphas_cumprod.new_tensor(1.0)

        pred_original_sample = (sample - torch.sqrt(1 - alpha_bar_t) * model_output) / torch.sqrt(alpha_bar_t)
        pred_original_sample = torch.clamp(pred_original_sample, -1.0, 1.0)

        if order == 1 or prev_model_output is None:
            # First-order update (equivalent to a deterministic DDIM step).
            prev_sample = torch.sqrt(alpha_bar_prev) * pred_original_sample + torch.sqrt(1 - alpha_bar_prev) * model_output
        else:
            # Second-order update: linearly extrapolate the noise prediction
            # in log-SNR (lambda) space using the previous model output.
            lambda_t = 0.5 * torch.log(alpha_bar_t / (1 - alpha_bar_t))
            lambda_prev = 0.5 * torch.log(alpha_bar_prev / (1 - alpha_bar_prev))
            h = lambda_prev - lambda_t

            prev_sample = (
                torch.sqrt(alpha_bar_prev) * pred_original_sample +
                torch.sqrt(1 - alpha_bar_prev) * (
                    model_output + h * (model_output - prev_model_output) / 2
                )
            )

        return prev_sample
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ torch>=2.0.0
2
+ torchaudio>=2.0.0
3
+ transformers>=4.30.0
4
+ gradio>=4.0.0
5
+ matplotlib>=3.5.0
6
+ numpy>=1.21.0
7
+ tqdm>=4.64.0
8
+ Pillow>=9.0.0
9
+ huggingface_hub>=0.16.0