ChuxiJ commited on
Commit
8ff7c0c
·
2 Parent(s): a3b47b770c780d

Merge branch 'main' of github.com:ace-step/ACE-Step-1.5 into main

Browse files
.gitignore CHANGED
@@ -1,3 +1,4 @@
 
1
  *.mp3
2
  *.wav
3
 
 
1
+ data/
2
  *.mp3
3
  *.wav
4
 
acestep/handler.py CHANGED
@@ -156,6 +156,7 @@ class AceStepHandler:
156
  compile_model: bool = False,
157
  offload_to_cpu: bool = False,
158
  offload_dit_to_cpu: bool = False,
 
159
  ) -> Tuple[str, bool]:
160
  """
161
  Initialize model service
@@ -186,6 +187,14 @@ class AceStepHandler:
186
  self.offload_dit_to_cpu = offload_dit_to_cpu
187
  # Set dtype based on device: bfloat16 for cuda, float32 for cpu
188
  self.dtype = torch.bfloat16 if device in ["cuda","xpu"] else torch.float32
 
 
 
 
 
 
 
 
189
 
190
  # Auto-detect project root (independent of passed project_root parameter)
191
  current_file = os.path.abspath(__file__)
@@ -238,9 +247,26 @@ class AceStepHandler:
238
  self.model.eval()
239
 
240
  if compile_model:
241
- logger.info("Compiling model with torch.compile...")
242
  self.model = torch.compile(self.model)
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  silence_latent_path = os.path.join(acestep_v15_checkpoint_path, "silence_latent.pt")
245
  if os.path.exists(silence_latent_path):
246
  self.silence_latent = torch.load(silence_latent_path).transpose(1, 2)
@@ -267,6 +293,9 @@ class AceStepHandler:
267
  self.vae.eval()
268
  else:
269
  raise FileNotFoundError(f"VAE checkpoint not found at {vae_checkpoint_path}")
 
 
 
270
 
271
  # 3. Load text encoder and tokenizer
272
  text_encoder_path = os.path.join(checkpoint_dir, "Qwen3-Embedding-0.6B")
 
156
  compile_model: bool = False,
157
  offload_to_cpu: bool = False,
158
  offload_dit_to_cpu: bool = False,
159
+ quantization: Optional[str] = None,
160
  ) -> Tuple[str, bool]:
161
  """
162
  Initialize model service
 
187
  self.offload_dit_to_cpu = offload_dit_to_cpu
188
  # Set dtype based on device: bfloat16 for cuda, float32 for cpu
189
  self.dtype = torch.bfloat16 if device in ["cuda","xpu"] else torch.float32
190
+ self.quantization = quantization
191
+ if self.quantization is not None:
192
+ assert compile_model, "Quantization requires compile_model to be True"
193
+ try:
194
+ import torchao
195
+ except ImportError:
196
+ raise ImportError("torchao is required for quantization but is not installed. Please install torchao to use quantization features.")
197
+
198
 
199
  # Auto-detect project root (independent of passed project_root parameter)
200
  current_file = os.path.abspath(__file__)
 
247
  self.model.eval()
248
 
249
  if compile_model:
 
250
  self.model = torch.compile(self.model)
251
 
252
+ if self.quantization is not None:
253
+ from torchao.quantization import quantize_
254
+ if self.quantization == "int8_weight_only":
255
+ from torchao.quantization import Int8WeightOnlyConfig
256
+ quant_config = Int8WeightOnlyConfig()
257
+ elif self.quantization == "fp8_weight_only":
258
+ from torchao.quantization import Float8WeightOnlyConfig
259
+ quant_config = Float8WeightOnlyConfig()
260
+ elif self.quantization == "w8a8_dynamic":
261
+ from torchao.quantization import Int8DynamicActivationInt8WeightConfig, MappingType
262
+ quant_config = Int8DynamicActivationInt8WeightConfig(act_mapping_type=MappingType.ASYMMETRIC)
263
+ else:
264
+ raise ValueError(f"Unsupported quantization type: {self.quantization}")
265
+
266
+ quantize_(self.model, quant_config)
267
+ logger.info("DiT quantized with: %s", self.quantization)
268
+
269
+
270
  silence_latent_path = os.path.join(acestep_v15_checkpoint_path, "silence_latent.pt")
271
  if os.path.exists(silence_latent_path):
272
  self.silence_latent = torch.load(silence_latent_path).transpose(1, 2)
 
293
  self.vae.eval()
294
  else:
295
  raise FileNotFoundError(f"VAE checkpoint not found at {vae_checkpoint_path}")
296
+
297
+ if compile_model:
298
+ self.vae = torch.compile(self.vae)
299
 
300
  # 3. Load text encoder and tokenizer
301
  text_encoder_path = os.path.join(checkpoint_dir, "Qwen3-Embedding-0.6B")
scripts/prepare_vae_calibration_data.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import os
3
+ import soundfile as sf
4
+ from diffusers.models import AutoencoderOobleck
5
+ from tqdm import tqdm
6
+ import torch.nn.functional as F
7
+
def process_audio(audio_path, target_sr=48000):
    """Load an audio file and normalize it to a stereo tensor at ``target_sr``.

    Returns a float32 tensor shaped [1, 2, samples] clamped to [-1, 1],
    or None when the file cannot be read or processed.
    """
    try:
        # soundfile yields [samples] for mono or [samples, channels] otherwise
        samples, src_sr = sf.read(audio_path, dtype='float32')

        # Normalize layout to [channels, samples]
        waveform = (
            torch.from_numpy(samples).unsqueeze(0)
            if samples.ndim == 1
            else torch.from_numpy(samples.T)
        )

        # Duplicate a mono channel, then keep at most the first two channels
        if waveform.shape[0] == 1:
            waveform = torch.cat([waveform, waveform], dim=0)
        waveform = waveform[:2]

        # Linear-interpolation resample when the sample rates differ
        if src_sr != target_sr:
            out_len = int(waveform.shape[-1] * (target_sr / src_sr))
            waveform = F.interpolate(
                waveform.unsqueeze(0),
                size=out_len,
                mode='linear',
                align_corners=False,
            ).squeeze(0)

        # Clamp and add a batch dimension: [1, 2, samples]
        return torch.clamp(waveform, -1.0, 1.0).unsqueeze(0)

    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return None
38
+ def main():
39
+ print("Initializing Calibration Data Preparation...")
40
+
41
+ current_dir = os.path.dirname(os.path.abspath(__file__))
42
+ project_root = os.path.dirname(current_dir)
43
+ data_dir = os.path.join(project_root, "data", "quant_data")
44
+ output_path = os.path.join(project_root, "data", "calibration_latents.pt")
45
+ vae_path = os.path.join(project_root, "checkpoints", "vae")
46
+
47
+ if not os.path.exists(data_dir):
48
+ print(f"Error: Data directory not found at {data_dir}")
49
+ return
50
+
51
+ print(f"Loading VAE from {vae_path}...")
52
+ try:
53
+ vae = AutoencoderOobleck.from_pretrained(vae_path)
54
+ except Exception as e:
55
+ print(f"Failed to load VAE: {e}")
56
+ return
57
+
58
+ device = "cuda" if torch.cuda.is_available() else "cpu"
59
+ # Check for XPU
60
+ if hasattr(torch, "xpu") and torch.xpu.is_available():
61
+ device = "xpu"
62
+
63
+ print(f"Using device: {device}")
64
+ vae = vae.to(device)
65
+ vae.eval()
66
+
67
+ audio_files = [f for f in os.listdir(data_dir) if f.endswith('.flac')]
68
+ print(f"Found {len(audio_files)} audio files.")
69
+
70
+ all_chunks = []
71
+ chunk_size = 512 # Latent frames
72
+ samples_per_latent = 1920
73
+ audio_chunk_size = chunk_size * samples_per_latent
74
+
75
+ pbar = tqdm(audio_files, desc="Processing audio")
76
+ for audio_file in pbar:
77
+ file_path = os.path.join(data_dir, audio_file)
78
+ full_audio = process_audio(file_path)
79
+
80
+ if full_audio is None:
81
+ continue
82
+
83
+ # Split audio into chunks
84
+ num_samples = full_audio.shape[-1]
85
+
86
+ for start_idx in range(0, num_samples, audio_chunk_size):
87
+ end_idx = start_idx + audio_chunk_size
88
+ if end_idx > num_samples:
89
+ break # Skip incomplete chunks
90
+
91
+ audio_input = full_audio[:, :, start_idx:end_idx].to(device)
92
+
93
+ try:
94
+ with torch.no_grad():
95
+ # Encode
96
+ # VAE encode expects [Batch, Channels, Samples]
97
+ # Returns DiagonalGaussianDistribution
98
+ posterior = vae.encode(audio_input).latent_dist
99
+ latents = posterior.sample() # [1, 64, LatentLength]
100
+
101
+ # It should be exactly chunk_size, but let's be safe
102
+ if latents.shape[-1] >= chunk_size:
103
+ all_chunks.append(latents[:, :, :chunk_size].cpu())
104
+
105
+ pbar.set_postfix({"chunks": len(all_chunks)})
106
+
107
+ except Exception as e:
108
+ print(f"Error encoding chunk {start_idx}-{end_idx} of {audio_file}: {e}")
109
+ torch.cuda.empty_cache()
110
+ if device == "xpu":
111
+ torch.xpu.empty_cache()
112
+
113
+ print(f"Collected {len(all_chunks)} chunks of size {chunk_size}.")
114
+
115
+ if len(all_chunks) > 0:
116
+ print(f"Saving to {output_path}...")
117
+ torch.save(all_chunks, output_path)
118
+ print("Done.")
119
+ else:
120
+ print("No chunks collected.")
121
+
122
+ if __name__ == "__main__":
123
+ main()
test.py CHANGED
@@ -46,6 +46,7 @@ def main():
46
  compile_model=True,
47
  offload_to_cpu=True,
48
  offload_dit_to_cpu=False, # Keep DiT on GPU
 
49
  )
50
 
51
  if not enabled:
@@ -107,7 +108,12 @@ def main():
107
  print(f"Generated Audio Codes (first 50 chars): {audio_codes[:50]}...")
108
  else:
109
  print("Skipping 5Hz LLM generation...")
110
- metadata = {}
 
 
 
 
 
111
  audio_codes = None
112
  lm_status = "Skipped"
113
 
 
46
  compile_model=True,
47
  offload_to_cpu=True,
48
  offload_dit_to_cpu=False, # Keep DiT on GPU
49
+ quantization="int8_weight_only", # Enable INT8 weight-only quantization
50
  )
51
 
52
  if not enabled:
 
108
  print(f"Generated Audio Codes (first 50 chars): {audio_codes[:50]}...")
109
  else:
110
  print("Skipping 5Hz LLM generation...")
111
+ metadata = {
112
+ 'bpm': 90,
113
+ 'keyscale': 'A major',
114
+ 'timesignature': '4',
115
+ 'duration': 240,
116
+ }
117
  audio_codes = None
118
  lm_status = "Skipped"
119