Spaces:

NZUONG
/

mude

Sleeping

App Files Files Community

NZUONG commited on Aug 14, 2025

Commit

18b3589

verified ·

1 Parent(s): e3bedf7

Update app.py

Browse files

Files changed (1) hide show

app.py +394 -394

app.py CHANGED Viewed

@@ -1,395 +1,395 @@
-import os
-import torch
-import torchaudio
-import gradio as gr
-import matplotlib.pyplot as plt
-from tqdm import tqdm
-from transformers import UMT5EncoderModel, AutoTokenizer
-from huggingface_hub import hf_hub_download, snapshot_download
-import json
-import numpy as np
-import tempfile
-from io import BytesIO
-import warnings
-warnings.filterwarnings("ignore")
-# Import model components
-from model.ae.music_dcae import MusicDCAE
-from model.ldm.editing_unet import EditingUNet
-from model.ldm.dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
-# Configuration
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-DTYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
-# Model repository - UPDATE THIS TO YOUR MODEL REPO
-MODEL_REPO = "NZUONG/mude"  # Your uploaded model repository
-# DDPM Parameters
-DDPM_NUM_TIMESTEPS = 1000
-DDPM_BETA_START = 0.0001
-DDPM_BETA_END = 0.02
-class AttrDict(dict):
-    def __init__(self, *args, **kwargs):
-        super(AttrDict, self).__init__(*args, **kwargs)
-        self.__dict__ = self
-def download_models():
-    """Download models from Hugging Face Hub"""
-    print("🔄 Downloading models from Hugging Face Hub...")
-    # Create local directories
-    os.makedirs("checkpoints", exist_ok=True)
-    try:
-        # Download the entire repository
-        local_dir = snapshot_download(
-            repo_id=MODEL_REPO,
-            cache_dir="./cache",
-            local_dir="./checkpoints",
-            repo_type="model"
-        )
-        print(f"✅ Models downloaded to: {local_dir}")
-        return True
-    except Exception as e:
-        print(f"❌ Error downloading models: {e}")
-        return False
-class AudioEditor:
-    def __init__(self):
-        self.dcae = None
-        self.tokenizer = None
-        self.text_encoder = None
-        self.model = None
-        self.is_loaded = False
-    def load_models(self):
-        """Load all models once at startup"""
-        if self.is_loaded:
-            return True
-        # Download models if not present
-        if not os.path.exists("checkpoints/music_dcae_f8c8"):
-            print("📥 Models not found locally, downloading...")
-            if not download_models():
-                return False
-        print("🔄 Loading models...")
-        try:
-            # Model paths
-            dcae_path = "checkpoints/music_dcae_f8c8"
-            vocoder_path = "checkpoints/music_vocoder"
-            t5_path = "checkpoints/umt5-base"
-            unet_config_path = "model/ldm/exp_config.json"
-            trained_model_path = "checkpoints/fm_checkpoint_epoch_9.pt"
-            # Load DCAE
-            self.dcae = MusicDCAE(
-                dcae_checkpoint_path=dcae_path,
-                vocoder_checkpoint_path=vocoder_path
-            ).to(DEVICE).eval()
-            # Load text encoder
-            self.tokenizer = AutoTokenizer.from_pretrained(t5_path)
-            self.text_encoder = UMT5EncoderModel.from_pretrained(t5_path).to(DEVICE, dtype=DTYPE).eval()
-            # Load UNet config
-            with open(unet_config_path, 'r') as f:
-                unet_config = AttrDict(json.load(f)['model']['unet'])
-            self.model = EditingUNet(unet_config, use_flow_matching=False).to("cpu", dtype=DTYPE).eval()
-            # Load checkpoint
-            checkpoint = torch.load(trained_model_path, map_location="cpu")
-            model_state_dict = checkpoint.get('model_state_dict', checkpoint)
-            if any(key.startswith('_orig_mod.') for key in model_state_dict.keys()):
-                model_state_dict = {key.replace('_orig_mod.', ''): value for key, value in model_state_dict.items()}
-            self.model.load_state_dict(model_state_dict, strict=False)
-            self.is_loaded = True
-            print("✅ All models loaded successfully!")
-            return True
-        except Exception as e:
-            print(f"❌ Error loading models: {e}")
-            return False
-    def dpm_solver_sampling(self, model, source_latent, instruction_embedding, uncond_embedding,
-                           strength=1.0, steps=25, guidance_scale=7.5, seed=42):
-        """DPM-Solver sampling function"""
-        print(f"🚀 Starting DPM-Solver++ sampling with {steps} steps...")
-        # Setup noise schedule
-        betas = torch.linspace(DDPM_BETA_START, DDPM_BETA_END, DDMP_NUM_TIMESTEPS, dtype=torch.float32)
-        alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
-        noise_schedule = NoiseScheduleVP(schedule='discrete', alphas_cumprod=alphas_cumprod)
-        # Setup model wrapper
-        model_fn = model_wrapper(
-            model,
-            noise_schedule,
-            model_type="noise",  # DDPM objective only
-            model_kwargs={
-                "source_latent": source_latent,
-            },
-            guidance_type="classifier-free",
-            condition=instruction_embedding,
-            unconditional_condition=uncond_embedding,
-            guidance_scale=guidance_scale,
-        )
-        # Initialize DPM-Solver++
-        dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
-        # Calculate time range
-        t_end = noise_schedule.T / noise_schedule.total_N
-        t_start = t_end + strength * (noise_schedule.T - t_end)
-        # Add initial noise
-        torch.manual_seed(seed)
-        noise = torch.randn_like(source_latent)
-        latents = dpm_solver.add_noise(source_latent, torch.tensor([t_start], device=DEVICE), noise)
-        latents = latents.to(DTYPE)
-        # Run DPM solver sampling
-        with torch.amp.autocast(device_type="cuda", dtype=DTYPE, enabled=(DTYPE != torch.float32)):
-            with torch.no_grad():
-                final_latent, _ = dpm_solver.sample(
-                    latents,
-                    steps=steps,
-                    t_start=t_start,
-                    t_end=t_end,
-                    order=2,
-                    method="multistep",
-                    skip_type="time_uniform",
-                    lower_order_final=True,
-                    return_intermediate=True,
-                )
-        return final_latent
-    def process_audio(self, audio_file, instruction, guidance_scale, steps, strength, seed):
-        """Main audio processing function"""
-        try:
-            if not self.load_models():
-                return None, None, "❌ Failed to load models. Please try again."
-            # Load and preprocess audio
-            print(f"🎵 Processing audio: {audio_file}")
-            audio, sr = torchaudio.load(audio_file)
-            TARGET_SR_DCAE = 44100
-            TARGET_LEN_DCAE = TARGET_SR_DCAE * 10
-            if sr != TARGET_SR_DCAE:
-                audio = torchaudio.transforms.Resample(sr, TARGET_SR_DCAE)(audio)
-            if audio.shape[1] > TARGET_LEN_DCAE:
-                audio = audio[:, :TARGET_LEN_DCAE]
-            elif audio.shape[1] < TARGET_LEN_DCAE:
-                audio = torch.nn.functional.pad(audio, (0, TARGET_LEN_DCAE - audio.shape[1]))
-            if audio.shape[0] == 1:
-                audio = audio.repeat(2, 1)
-            # Encode audio
-            with torch.no_grad():
-                source_latent_scaled, _ = self.dcae.encode(audio.to(DEVICE).unsqueeze(0))
-            # Prepare text embeddings
-            with torch.no_grad(), torch.amp.autocast(device_type="cuda", dtype=DTYPE, enabled=(DTYPE != torch.float32)):
-                text_input = self.tokenizer([instruction], max_length=32, padding="max_length",
-                                          truncation=True, return_tensors="pt")
-                instruction_embedding = self.text_encoder(text_input.input_ids.to(DEVICE))[0]
-                uncond_input = self.tokenizer([""], max_length=32, padding="max_length",
-                                            truncation=True, return_tensors="pt")
-                uncond_embedding = self.text_encoder(uncond_input.input_ids.to(DEVICE))[0]
-            # Move models for inference
-            self.dcae = self.dcae.cpu()
-            torch.cuda.empty_cache()
-            self.model = self.model.to(DEVICE, dtype=DTYPE)
-            # Generate
-            print("🎨 Generating edited audio...")
-            with torch.amp.autocast(device_type="cuda", dtype=DTYPE, enabled=(DTYPE != torch.float32)):
-                with torch.no_grad():
-                    final_latent = self.dpm_solver_sampling(
-                        model=self.model,
-                        source_latent=source_latent_scaled,
-                        instruction_embedding=instruction_embedding,
-                        uncond_embedding=uncond_embedding,
-                        strength=strength,
-                        steps=int(steps),
-                        guidance_scale=guidance_scale,
-                        seed=int(seed)
-                    )
-            # Decode results
-            self.model = self.model.cpu()
-            torch.cuda.empty_cache()
-            self.dcae = self.dcae.to(DEVICE)
-            final_latent_unscaled = (final_latent.float() / self.dcae.scale_factor) + self.dcae.shift_factor
-            source_latent_raw = (source_latent_scaled / self.dcae.scale_factor) + self.dcae.shift_factor
-            with torch.no_grad():
-                source_mel = self.dcae.decode_to_mel(source_latent_raw)
-                edited_mel = self.dcae.decode_to_mel(final_latent_unscaled)
-                _, pred_wavs = self.dcae.decode(latents=final_latent.float(), sr=44100)
-                edited_audio = pred_wavs[0]
-            # Create comparison plot
-            comparison_plot = self.create_mel_comparison(source_mel, edited_mel, instruction)
-            # Save output audio
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-                torchaudio.save(tmp_file.name, edited_audio.cpu().float(), 44100)
-                output_path = tmp_file.name
-            # Cleanup
-            self.dcae = self.dcae.cpu()
-            torch.cuda.empty_cache()
-            return output_path, comparison_plot, f"✅ Audio editing completed! Instruction: '{instruction}'"
-        except Exception as e:
-            import traceback
-            error_msg = f"❌ Error: {str(e)}\n{traceback.format_exc()}"
-            print(error_msg)
-            return None, None, error_msg
-    def create_mel_comparison(self, source_mel, edited_mel, instruction):
-        """Create mel-spectrogram comparison plot"""
-        try:
-            source_mel_np = source_mel.squeeze(0)[0].cpu().float().numpy()
-            edited_mel_np = edited_mel.squeeze(0)[0].cpu().float().numpy()
-            fig, axs = plt.subplots(2, 1, figsize=(12, 8), sharex=True, sharey=True)
-            fig.suptitle(f'Mel-Spectrogram Comparison', fontsize=14)
-            # Plot source
-            im1 = axs[0].imshow(source_mel_np, aspect='auto', origin='lower', cmap='viridis')
-            axs[0].set_title('Original Audio')
-            axs[0].set_ylabel('Mel Bins')
-            plt.colorbar(im1, ax=axs[0])
-            # Plot edited
-            im2 = axs[1].imshow(edited_mel_np, aspect='auto', origin='lower', cmap='viridis')
-            axs[1].set_title(f'Edited Audio: "{instruction}"')
-            axs[1].set_ylabel('Mel Bins')
-            axs[1].set_xlabel('Time Frames')
-            plt.colorbar(im2, ax=axs[1])
-            plt.tight_layout()
-            # Save to temporary file for Gradio
-            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
-                plt.savefig(tmp_file.name, dpi=100, bbox_inches='tight')
-                plt.close()
-                return tmp_file.name
-        except Exception as e:
-            print(f"Error creating plot: {e}")
-            plt.close()
-            return None
-# Initialize the audio editor
-audio_editor = AudioEditor()
-def gradio_interface(audio_file, instruction, guidance_scale, steps, strength, seed):
-    """Gradio interface function"""
-    if audio_file is None:
-        return None, None, "Please upload an audio file"
-    if not instruction.strip():
-        return None, None, "Please provide an editing instruction"
-    return audio_editor.process_audio(audio_file, instruction, guidance_scale, steps, strength, seed)
-# Create Gradio interface
-with gr.Blocks(title="🎵 AI Audio Editor", theme=gr.themes.Soft()) as demo:
-    gr.HTML("""
-    <div style="text-align: center; margin-bottom: 20px;">
-        <h1>🎵 AI Audio Editor</h1>
-        <p>Upload an audio file and provide instructions to edit it using AI.<br/>
-        The model uses DPM-Solver++ for fast, high-quality generation.</p>
-    </div>
-    """)
-    with gr.Row():
-        with gr.Column(scale=1):
-            # Input components
-            audio_input = gr.Audio(
-                label="📁 Upload Audio File",
-                type="filepath"
-            )
-            instruction_input = gr.Textbox(
-                label="✏️ Editing Instruction",
-                placeholder="e.g., 'Add drums', 'Make it more energetic', 'Remove vocals'",
-                lines=2
-            )
-            with gr.Accordion("🔧 Advanced Settings", open=False):
-                guidance_scale = gr.Slider(
-                    minimum=1.0, maximum=20.0, value=7.5, step=0.5,
-                    label="Guidance Scale",
-                    info="Higher values follow the instruction more closely"
-                )
-                steps = gr.Slider(
-                    minimum=10, maximum=50, value=25, step=5,
-                    label="Sampling Steps",
-                    info="More steps = better quality, slower generation"
-                )
-                strength = gr.Slider(
-                    minimum=0.1, maximum=1.0, value=1.0, step=0.1,
-                    label="Denoising Strength",
-                    info="1.0 = full denoising, lower = more conservative editing"
-                )
-                seed = gr.Number(
-                    value=42, label="Seed",
-                    info="For reproducible results"
-                )
-            generate_btn = gr.Button("🎨 Generate Edited Audio", variant="primary", size="lg")
-        with gr.Column(scale=1):
-            # Output components
-            status_output = gr.Textbox(label="📊 Status", interactive=False)
-            audio_output = gr.Audio(label="🎵 Generated Audio")
-            plot_output = gr.Image(label="📈 Mel-Spectrogram Comparison")
-    gr.HTML("""
-    <div style="margin-top: 20px; padding: 20px; background-color: #f0f0f0; border-radius: 10px;">
-        <h3>📝 Usage Tips:</h3>
-        <ul>
-            <li><b>Audio Length:</b> Files are automatically processed to 10 seconds</li>
-            <li><b>Instructions:</b> Be specific (e.g., "Add heavy drums" vs "Add drums")</li>
-            <li><b>Guidance Scale:</b> Start with 7.5, increase for stronger effects</li>
-            <li><b>Steps:</b> 25 steps provide good quality/speed balance</li>
-        </ul>
-    </div>
-    """)
-    # Connect the interface
-    generate_btn.click(
-        fn=gradio_interface,
-        inputs=[audio_input, instruction_input, guidance_scale, steps, strength, seed],
-        outputs=[audio_output, plot_output, status_output],
-        show_progress=True
-    )
-# Launch settings
-if __name__ == "__main__":
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False,
-        show_error=True
     )

+import os
+import torch
+import torchaudio
+import gradio as gr
+import matplotlib.pyplot as plt
+from tqdm import tqdm
+from transformers import UMT5EncoderModel, AutoTokenizer
+from huggingface_hub import hf_hub_download, snapshot_download
+import json
+import numpy as np
+import tempfile
+from io import BytesIO
+import warnings
+warnings.filterwarnings("ignore")
+# Import model components
+from model.ae.music_dcae import MusicDCAE
+from model.ldm.editing_unet import EditingUNet
+from model.ldm.dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
+# Configuration
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+DTYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
+# Model repository - UPDATE THIS TO YOUR MODEL REPO
+MODEL_REPO = "NZUONG/mude"  # Your uploaded model repository
+# DDPM Parameters
+DDPM_NUM_TIMESTEPS = 1000
+DDPM_BETA_START = 0.0001
+DDPM_BETA_END = 0.02
+class AttrDict(dict):
+    def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        self.__dict__ = self
+def download_models():
+    """Download models from Hugging Face Hub"""
+    print("🔄 Downloading models from Hugging Face Hub...")
+    # Create local directories
+    os.makedirs("checkpoints", exist_ok=True)
+    try:
+        # Download the entire repository
+        local_dir = snapshot_download(
+            repo_id=MODEL_REPO,
+            cache_dir="./cache",
+            local_dir="./checkpoints",
+            repo_type="model"
+        )
+        print(f"✅ Models downloaded to: {local_dir}")
+        return True
+    except Exception as e:
+        print(f"❌ Error downloading models: {e}")
+        return False
+class AudioEditor:
+    def __init__(self):
+        self.dcae = None
+        self.tokenizer = None
+        self.text_encoder = None
+        self.model = None
+        self.is_loaded = False
+    def load_models(self):
+        """Load all models once at startup"""
+        if self.is_loaded:
+            return True
+        # Download models if not present
+        if not os.path.exists("checkpoints/music_dcae_f8c8"):
+            print("📥 Models not found locally, downloading...")
+            if not download_models():
+                return False
+        print("🔄 Loading models...")
+        try:
+            # Model paths
+            dcae_path = "checkpoints/music_dcae_f8c8"
+            vocoder_path = "checkpoints/music_vocoder"
+            t5_path = "checkpoints/umt5-base"
+            unet_config_path = "model/ldm/exp_config.json"
+            trained_model_path = "checkpoints/fm_checkpoint_epoch_9.pt"
+            # Load DCAE
+            self.dcae = MusicDCAE(
+                dcae_checkpoint_path=dcae_path,
+                vocoder_checkpoint_path=vocoder_path
+            ).to(DEVICE).eval()
+            # Load text encoder
+            self.tokenizer = AutoTokenizer.from_pretrained(t5_path)
+            self.text_encoder = UMT5EncoderModel.from_pretrained(t5_path).to(DEVICE, dtype=DTYPE).eval()
+            # Load UNet config
+            with open(unet_config_path, 'r') as f:
+                unet_config = AttrDict(json.load(f)['model']['unet'])
+            self.model = EditingUNet(unet_config, use_flow_matching=False).to("cpu", dtype=DTYPE).eval()
+            # Load checkpoint
+            checkpoint = torch.load(trained_model_path, map_location="cpu")
+            model_state_dict = checkpoint.get('model_state_dict', checkpoint)
+            if any(key.startswith('_orig_mod.') for key in model_state_dict.keys()):
+                model_state_dict = {key.replace('_orig_mod.', ''): value for key, value in model_state_dict.items()}
+            self.model.load_state_dict(model_state_dict, strict=False)
+            self.is_loaded = True
+            print("✅ All models loaded successfully!")
+            return True
+        except Exception as e:
+            print(f"❌ Error loading models: {e}")
+            return False
+    def dpm_solver_sampling(self, model, source_latent, instruction_embedding, uncond_embedding,
+                           strength=1.0, steps=25, guidance_scale=7.5, seed=42):
+        """DPM-Solver sampling function"""
+        print(f"🚀 Starting DPM-Solver++ sampling with {steps} steps...")
+        # Setup noise schedule - FIXED TYPO HERE
+        betas = torch.linspace(DDPM_BETA_START, DDPM_BETA_END, DDPM_NUM_TIMESTEPS, dtype=torch.float32)
+        alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
+        noise_schedule = NoiseScheduleVP(schedule='discrete', alphas_cumprod=alphas_cumprod)
+        # Setup model wrapper
+        model_fn = model_wrapper(
+            model,
+            noise_schedule,
+            model_type="noise",  # DDPM objective only
+            model_kwargs={
+                "source_latent": source_latent,
+            },
+            guidance_type="classifier-free",
+            condition=instruction_embedding,
+            unconditional_condition=uncond_embedding,
+            guidance_scale=guidance_scale,
+        )
+        # Initialize DPM-Solver++
+        dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
+        # Calculate time range
+        t_end = noise_schedule.T / noise_schedule.total_N
+        t_start = t_end + strength * (noise_schedule.T - t_end)
+        # Add initial noise
+        torch.manual_seed(seed)
+        noise = torch.randn_like(source_latent)
+        latents = dpm_solver.add_noise(source_latent, torch.tensor([t_start], device=DEVICE), noise)
+        latents = latents.to(DTYPE)
+        # Run DPM solver sampling
+        with torch.amp.autocast(device_type="cuda", dtype=DTYPE, enabled=(DTYPE != torch.float32)):
+            with torch.no_grad():
+                final_latent, _ = dpm_solver.sample(
+                    latents,
+                    steps=steps,
+                    t_start=t_start,
+                    t_end=t_end,
+                    order=2,
+                    method="multistep",
+                    skip_type="time_uniform",
+                    lower_order_final=True,
+                    return_intermediate=True,
+                )
+        return final_latent
+    def process_audio(self, audio_file, instruction, guidance_scale, steps, strength, seed):
+        """Main audio processing function"""
+        try:
+            if not self.load_models():
+                return None, None, "❌ Failed to load models. Please try again."
+            # Load and preprocess audio
+            print(f"🎵 Processing audio: {audio_file}")
+            audio, sr = torchaudio.load(audio_file)
+            TARGET_SR_DCAE = 44100
+            TARGET_LEN_DCAE = TARGET_SR_DCAE * 10
+            if sr != TARGET_SR_DCAE:
+                audio = torchaudio.transforms.Resample(sr, TARGET_SR_DCAE)(audio)
+            if audio.shape[1] > TARGET_LEN_DCAE:
+                audio = audio[:, :TARGET_LEN_DCAE]
+            elif audio.shape[1] < TARGET_LEN_DCAE:
+                audio = torch.nn.functional.pad(audio, (0, TARGET_LEN_DCAE - audio.shape[1]))
+            if audio.shape[0] == 1:
+                audio = audio.repeat(2, 1)
+            # Encode audio
+            with torch.no_grad():
+                source_latent_scaled, _ = self.dcae.encode(audio.to(DEVICE).unsqueeze(0))
+            # Prepare text embeddings
+            with torch.no_grad(), torch.amp.autocast(device_type="cuda", dtype=DTYPE, enabled=(DTYPE != torch.float32)):
+                text_input = self.tokenizer([instruction], max_length=32, padding="max_length",
+                                          truncation=True, return_tensors="pt")
+                instruction_embedding = self.text_encoder(text_input.input_ids.to(DEVICE))[0]
+                uncond_input = self.tokenizer([""], max_length=32, padding="max_length",
+                                            truncation=True, return_tensors="pt")
+                uncond_embedding = self.text_encoder(uncond_input.input_ids.to(DEVICE))[0]
+            # Move models for inference
+            self.dcae = self.dcae.cpu()
+            torch.cuda.empty_cache()
+            self.model = self.model.to(DEVICE, dtype=DTYPE)
+            # Generate
+            print("🎨 Generating edited audio...")
+            with torch.amp.autocast(device_type="cuda", dtype=DTYPE, enabled=(DTYPE != torch.float32)):
+                with torch.no_grad():
+                    final_latent = self.dpm_solver_sampling(
+                        model=self.model,
+                        source_latent=source_latent_scaled,
+                        instruction_embedding=instruction_embedding,
+                        uncond_embedding=uncond_embedding,
+                        strength=strength,
+                        steps=int(steps),
+                        guidance_scale=guidance_scale,
+                        seed=int(seed)
+                    )
+            # Decode results
+            self.model = self.model.cpu()
+            torch.cuda.empty_cache()
+            self.dcae = self.dcae.to(DEVICE)
+            final_latent_unscaled = (final_latent.float() / self.dcae.scale_factor) + self.dcae.shift_factor
+            source_latent_raw = (source_latent_scaled / self.dcae.scale_factor) + self.dcae.shift_factor
+            with torch.no_grad():
+                source_mel = self.dcae.decode_to_mel(source_latent_raw)
+                edited_mel = self.dcae.decode_to_mel(final_latent_unscaled)
+                _, pred_wavs = self.dcae.decode(latents=final_latent.float(), sr=44100)
+                edited_audio = pred_wavs[0]
+            # Create comparison plot
+            comparison_plot = self.create_mel_comparison(source_mel, edited_mel, instruction)
+            # Save output audio
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+                torchaudio.save(tmp_file.name, edited_audio.cpu().float(), 44100)
+                output_path = tmp_file.name
+            # Cleanup
+            self.dcae = self.dcae.cpu()
+            torch.cuda.empty_cache()
+            return output_path, comparison_plot, f"✅ Audio editing completed! Instruction: '{instruction}'"
+        except Exception as e:
+            import traceback
+            error_msg = f"❌ Error: {str(e)}\n{traceback.format_exc()}"
+            print(error_msg)
+            return None, None, error_msg
+    def create_mel_comparison(self, source_mel, edited_mel, instruction):
+        """Create mel-spectrogram comparison plot"""
+        try:
+            source_mel_np = source_mel.squeeze(0)[0].cpu().float().numpy()
+            edited_mel_np = edited_mel.squeeze(0)[0].cpu().float().numpy()
+            fig, axs = plt.subplots(2, 1, figsize=(12, 8), sharex=True, sharey=True)
+            fig.suptitle(f'Mel-Spectrogram Comparison', fontsize=14)
+            # Plot source
+            im1 = axs[0].imshow(source_mel_np, aspect='auto', origin='lower', cmap='viridis')
+            axs[0].set_title('Original Audio')
+            axs[0].set_ylabel('Mel Bins')
+            plt.colorbar(im1, ax=axs[0])
+            # Plot edited
+            im2 = axs[1].imshow(edited_mel_np, aspect='auto', origin='lower', cmap='viridis')
+            axs[1].set_title(f'Edited Audio: "{instruction}"')
+            axs[1].set_ylabel('Mel Bins')
+            axs[1].set_xlabel('Time Frames')
+            plt.colorbar(im2, ax=axs[1])
+            plt.tight_layout()
+            # Save to temporary file for Gradio
+            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
+                plt.savefig(tmp_file.name, dpi=100, bbox_inches='tight')
+                plt.close()
+                return tmp_file.name
+        except Exception as e:
+            print(f"Error creating plot: {e}")
+            plt.close()
+            return None
+# Initialize the audio editor
+audio_editor = AudioEditor()
+def gradio_interface(audio_file, instruction, guidance_scale, steps, strength, seed):
+    """Gradio interface function"""
+    if audio_file is None:
+        return None, None, "Please upload an audio file"
+    if not instruction.strip():
+        return None, None, "Please provide an editing instruction"
+    return audio_editor.process_audio(audio_file, instruction, guidance_scale, steps, strength, seed)
+# Create Gradio interface
+with gr.Blocks(title="🎵 AI Audio Editor", theme=gr.themes.Soft()) as demo:
+    gr.HTML("""
+    <div style="text-align: center; margin-bottom: 20px;">
+        <h1>🎵 AI Audio Editor</h1>
+        <p>Upload an audio file and provide instructions to edit it using AI.<br/>
+        The model uses DPM-Solver++ for fast, high-quality generation.</p>
+    </div>
+    """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            # Input components
+            audio_input = gr.Audio(
+                label="📁 Upload Audio File",
+                type="filepath"
+            )
+            instruction_input = gr.Textbox(
+                label="✏️ Editing Instruction",
+                placeholder="e.g., 'Add drums', 'Make it more energetic', 'Remove vocals'",
+                lines=2
+            )
+            with gr.Accordion("🔧 Advanced Settings", open=False):
+                guidance_scale = gr.Slider(
+                    minimum=1.0, maximum=20.0, value=7.5, step=0.5,
+                    label="Guidance Scale",
+                    info="Higher values follow the instruction more closely"
+                )
+                steps = gr.Slider(
+                    minimum=10, maximum=50, value=25, step=5,
+                    label="Sampling Steps",
+                    info="More steps = better quality, slower generation"
+                )
+                strength = gr.Slider(
+                    minimum=0.1, maximum=1.0, value=1.0, step=0.1,
+                    label="Denoising Strength",
+                    info="1.0 = full denoising, lower = more conservative editing"
+                )
+                seed = gr.Number(
+                    value=42, label="Seed",
+                    info="For reproducible results"
+                )
+            generate_btn = gr.Button("🎨 Generate Edited Audio", variant="primary", size="lg")
+        with gr.Column(scale=1):
+            # Output components
+            status_output = gr.Textbox(label="📊 Status", interactive=False)
+            audio_output = gr.Audio(label="🎵 Generated Audio")
+            plot_output = gr.Image(label="📈 Mel-Spectrogram Comparison")
+    gr.HTML("""
+    <div style="margin-top: 20px; padding: 20px; background-color: #f0f0f0; border-radius: 10px;">
+        <h3>📝 Usage Tips:</h3>
+        <ul>
+            <li><b>Audio Length:</b> Files are automatically processed to 10 seconds</li>
+            <li><b>Instructions:</b> Be specific (e.g., "Add heavy drums" vs "Add drums")</li>
+            <li><b>Guidance Scale:</b> Start with 7.5, increase for stronger effects</li>
+            <li><b>Steps:</b> 25 steps provide good quality/speed balance</li>
+        </ul>
+    </div>
+    """)
+    # Connect the interface
+    generate_btn.click(
+        fn=gradio_interface,
+        inputs=[audio_input, instruction_input, guidance_scale, steps, strength, seed],
+        outputs=[audio_output, plot_output, status_output],
+        show_progress=True
+    )
+# Launch settings
+if __name__ == "__main__":
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True
     )