Andrew committed on
Commit
9e2d0e8
·
1 Parent(s): a5cfbf7
acestep/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """ACE-Step package."""
acestep/audio_utils.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio saving and transcoding utility module
3
+
4
+ Independent audio file operations outside of handler, supporting:
5
+ - Save audio tensor/numpy to files (default FLAC format, fast)
6
+ - Format conversion (FLAC/WAV/MP3)
7
+ - Batch processing
8
+ """
9
+
10
+ import os
11
+ import hashlib
12
+ import json
13
+ from pathlib import Path
14
+ from typing import Union, Optional, List, Tuple
15
+ import torch
16
+ import numpy as np
17
+ import torchaudio
18
+ from loguru import logger
19
+
20
+
21
class AudioSaver:
    """Audio saving and transcoding utility.

    Saves audio tensors/arrays to disk (FLAC by default: lossless and fast
    to encode), converts files between formats, and saves batches.
    FLAC/WAV are written through the soundfile backend, MP3 through ffmpeg.
    """

    # Formats this saver can write; anything else falls back to the default.
    SUPPORTED_FORMATS = ("flac", "wav", "mp3")

    def __init__(self, default_format: str = "flac"):
        """
        Initialize audio saver.

        Args:
            default_format: Default save format ('flac', 'wav', 'mp3').
                Unsupported values fall back to 'flac' with a warning.
        """
        self.default_format = default_format.lower()
        if self.default_format not in self.SUPPORTED_FORMATS:
            logger.warning(f"Unsupported format {default_format}, using 'flac'")
            self.default_format = "flac"

    @staticmethod
    def _to_channels_first_tensor(
        audio_data: Union[torch.Tensor, np.ndarray],
        channels_first: bool,
    ) -> torch.Tensor:
        """Normalize input audio into a float CPU tensor shaped [channels, samples].

        NOTE(review): for numpy input with channels_first=True the array is
        transposed, i.e. the numpy data is assumed to be [samples, channels] —
        the opposite of the tensor convention stated in save_audio's docstring.
        Behavior preserved as-is; confirm against callers before changing.
        """
        if isinstance(audio_data, np.ndarray):
            if channels_first:
                # numpy [samples, channels] -> tensor [channels, samples]
                return torch.from_numpy(audio_data.T).float()
            audio_tensor = torch.from_numpy(audio_data).float()
            # Heuristic: if the first axis is shorter, transpose (preserved
            # from the original implementation).
            if audio_tensor.dim() == 2 and audio_tensor.shape[0] < audio_tensor.shape[1]:
                audio_tensor = audio_tensor.T
            return audio_tensor

        audio_tensor = audio_data.cpu().float()
        if not channels_first and audio_tensor.dim() == 2:
            # [samples, channels] -> [channels, samples]
            if audio_tensor.shape[0] > audio_tensor.shape[1]:
                audio_tensor = audio_tensor.T
        return audio_tensor

    def save_audio(
        self,
        audio_data: Union[torch.Tensor, np.ndarray],
        output_path: Union[str, Path],
        sample_rate: int = 48000,
        format: Optional[str] = None,
        channels_first: bool = True,
    ) -> str:
        """
        Save audio data to file.

        Args:
            audio_data: Audio data, torch.Tensor [channels, samples] or numpy.ndarray.
            output_path: Output file path (extension can be omitted).
            sample_rate: Sample rate in Hz.
            format: Audio format ('flac', 'wav', 'mp3'); defaults to default_format.
            channels_first: If True, tensor layout is [channels, samples],
                else [samples, channels].

        Returns:
            Actual saved file path.

        Raises:
            Exception: Re-raised when both the torchaudio save and the
                soundfile fallback fail.
        """
        format = (format or self.default_format).lower()
        if format not in self.SUPPORTED_FORMATS:
            logger.warning(f"Unsupported format {format}, using {self.default_format}")
            format = self.default_format

        # Ensure the output path carries a recognized audio extension.
        output_path = Path(output_path)
        if output_path.suffix.lower() not in ['.flac', '.wav', '.mp3']:
            output_path = output_path.with_suffix(f'.{format}')

        audio_tensor = self._to_channels_first_tensor(audio_data, channels_first)
        # Contiguous memory is required by some backends.
        audio_tensor = audio_tensor.contiguous()

        try:
            if format == "mp3":
                # MP3 encoding is only available through the ffmpeg backend.
                torchaudio.save(
                    str(output_path),
                    audio_tensor,
                    sample_rate,
                    channels_first=True,
                    backend='ffmpeg',
                )
            else:
                # format is guaranteed to be 'flac' or 'wav' here (validated
                # above); both use the soundfile backend (fastest).
                torchaudio.save(
                    str(output_path),
                    audio_tensor,
                    sample_rate,
                    channels_first=True,
                    backend='soundfile',
                )

            logger.debug(f"[AudioSaver] Saved audio to {output_path} ({format}, {sample_rate}Hz)")
            return str(output_path)

        except Exception as save_err:
            # Fallback: write directly with soundfile (expects [samples, channels]).
            try:
                import soundfile as sf
                audio_np = audio_tensor.transpose(0, 1).numpy()  # -> [samples, channels]
                sf.write(str(output_path), audio_np, sample_rate, format=format.upper())
                logger.debug(f"[AudioSaver] Fallback soundfile Saved audio to {output_path} ({format}, {sample_rate}Hz)")
                return str(output_path)
            except Exception as fallback_err:
                # Fix: the original shadowed the primary exception with the
                # fallback's, losing the root cause; log both.
                logger.error(
                    f"[AudioSaver] Failed to save audio: {save_err}; fallback failed: {fallback_err}"
                )
                raise

    def convert_audio(
        self,
        input_path: Union[str, Path],
        output_path: Union[str, Path],
        output_format: str,
        remove_input: bool = False,
    ) -> str:
        """
        Convert an audio file to another format.

        Args:
            input_path: Input audio file path.
            output_path: Output audio file path.
            output_format: Target format ('flac', 'wav', 'mp3').
            remove_input: Whether to delete the input file after conversion.

        Returns:
            Output file path.

        Raises:
            FileNotFoundError: If input_path does not exist.
        """
        input_path = Path(input_path)
        output_path = Path(output_path)

        if not input_path.exists():
            raise FileNotFoundError(f"Input file not found: {input_path}")

        # Decode, then re-encode in the requested format.
        audio_tensor, sample_rate = torchaudio.load(str(input_path))
        output_path = self.save_audio(
            audio_tensor,
            output_path,
            sample_rate=sample_rate,
            format=output_format,
            channels_first=True,
        )

        if remove_input:
            input_path.unlink()
            logger.debug(f"[AudioSaver] Removed input file: {input_path}")

        return output_path

    def save_batch(
        self,
        audio_batch: Union[List[torch.Tensor], torch.Tensor],
        output_dir: Union[str, Path],
        file_prefix: str = "audio",
        sample_rate: int = 48000,
        format: Optional[str] = None,
        channels_first: bool = True,
    ) -> List[str]:
        """
        Save a batch of audio clips as individually numbered files.

        Args:
            audio_batch: List of tensors, or a tensor [batch, channels, samples];
                a single non-batched tensor is treated as a batch of one.
            output_dir: Output directory (created if missing).
            file_prefix: File name prefix; files are named {prefix}_{i:04d}.
            sample_rate: Sample rate in Hz.
            format: Audio format; None uses default_format.
            channels_first: Tensor layout flag, see save_audio.

        Returns:
            List of saved file paths, in batch order.
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        if isinstance(audio_batch, torch.Tensor) and audio_batch.dim() == 3:
            # [batch, channels, samples] -> list of [channels, samples]
            audio_list = [audio_batch[i] for i in range(audio_batch.shape[0])]
        elif isinstance(audio_batch, list):
            audio_list = audio_batch
        else:
            audio_list = [audio_batch]

        saved_paths = []
        for i, audio in enumerate(audio_list):
            saved_paths.append(self.save_audio(
                audio,
                output_dir / f"{file_prefix}_{i:04d}",
                sample_rate=sample_rate,
                format=format,
                channels_first=channels_first,
            ))
        return saved_paths
223
+
224
+
225
def get_audio_file_hash(audio_file) -> str:
    """Return an MD5 hex digest identifying *audio_file*.

    A string naming an existing file is hashed by its content; any other
    string is hashed as text. File-like objects are identified by their
    ``name`` attribute; everything else by ``str(audio_file)``.

    Args:
        audio_file: Path to audio file (str) or file-like object.

    Returns:
        Hash string, or "" when *audio_file* is None.
    """
    if audio_file is None:
        return ""

    def _hex(data: bytes) -> str:
        return hashlib.md5(data).hexdigest()

    try:
        if isinstance(audio_file, str):
            if not os.path.exists(audio_file):
                # Not a real path: hash the string itself.
                return _hex(audio_file.encode('utf-8'))
            with open(audio_file, 'rb') as fh:
                return _hex(fh.read())
        if hasattr(audio_file, 'name'):
            # File-like object: identify it by its name attribute.
            return _hex(str(audio_file.name).encode('utf-8'))
        return _hex(str(audio_file).encode('utf-8'))
    except Exception:
        # Best-effort fallback: hash the textual form.
        return _hex(str(audio_file).encode('utf-8'))
249
+
250
+
251
def generate_uuid_from_params(params_dict) -> str:
    """Derive a deterministic UUID-shaped string from generation parameters.

    Identical dictionaries (regardless of key order) always produce the same
    value: the parameters are serialized to canonical JSON (sorted keys,
    non-ASCII kept as-is), hashed with SHA-256, and the first 32 hex digits
    are grouped 8-4-4-4-12.

    Args:
        params_dict: Dictionary of parameters.

    Returns:
        UUID string.
    """
    canonical = json.dumps(params_dict, sort_keys=True, ensure_ascii=False)
    digest = hashlib.sha256(canonical.encode('utf-8')).hexdigest()
    groups = (digest[0:8], digest[8:12], digest[12:16], digest[16:20], digest[20:32])
    return "-".join(groups)
268
+
269
+
270
def generate_uuid_from_audio_data(
    audio_data: Union[torch.Tensor, np.ndarray],
    seed: Optional[int] = None
) -> str:
    """Derive a deterministic hex ID from raw audio samples.

    Intended for caching/deduplication: identical sample data (and seed)
    always yields the same ID.

    Args:
        audio_data: Audio samples as a torch tensor or numpy array.
        seed: Optional seed mixed into the hash.

    Returns:
        32-character MD5 hex string.
    """
    if isinstance(audio_data, torch.Tensor):
        # Hash the raw bytes of the CPU copy.
        audio_np = audio_data.cpu().numpy()
    else:
        audio_np = audio_data

    data_hash = hashlib.md5(audio_np.tobytes()).hexdigest()
    if seed is None:
        return data_hash
    # Mix the seed in by re-hashing "<hash>_<seed>".
    return hashlib.md5(f"{data_hash}_{seed}".encode()).hexdigest()
298
+
299
+
300
# Global default instance, shared by the module-level save_audio() wrapper.
# FLAC default: lossless and fast to encode.
_default_saver = AudioSaver(default_format="flac")
302
+
303
# Levels at or below these count as silence (e.g. zeroed conditioning output).
SILENT_RMS_THRESHOLD = 1e-5
SILENT_PEAK_THRESHOLD = 1e-5


def is_audio_silent(
    audio_data: Union[torch.Tensor, np.ndarray],
    rms_threshold: float = SILENT_RMS_THRESHOLD,
    peak_threshold: float = SILENT_PEAK_THRESHOLD,
    channels_first: bool = True,
) -> Tuple[bool, float, float]:
    """Check whether audio is silent or near-silent.

    ``None`` or empty input counts as silent with zero levels.

    Args:
        audio_data: Audio samples as a tensor or array (any shape; the
            signal is flattened before measuring).
        rms_threshold: RMS level at or below which audio counts as silent.
        peak_threshold: Peak level at or below which audio counts as silent.
        channels_first: Unused by the computation; kept for API symmetry
            with the other audio helpers in this module.

    Returns:
        Tuple (is_silent, rms, peak) computed over the full signal.
    """
    if audio_data is None:
        return True, 0.0, 0.0

    if isinstance(audio_data, np.ndarray):
        samples = np.asarray(audio_data, dtype=np.float64).ravel()
    else:
        samples = audio_data.cpu().float().numpy().ravel()

    if samples.size == 0:
        return True, 0.0, 0.0

    rms = float(np.sqrt(np.mean(samples * samples)))
    peak = float(np.max(np.abs(samples)))
    return rms <= rms_threshold and peak <= peak_threshold, rms, peak
329
+
330
+
331
def save_audio(
    audio_data: Union[torch.Tensor, np.ndarray],
    output_path: Union[str, Path],
    sample_rate: int = 48000,
    format: Optional[str] = None,
    channels_first: bool = True,
) -> str:
    """
    Convenience wrapper: save audio through the module-level default saver.

    Args:
        audio_data: Audio data (tensor or numpy array).
        output_path: Output path (extension may be omitted).
        sample_rate: Sample rate in Hz.
        format: Format ('flac', 'wav', 'mp3'); None uses the default (flac).
        channels_first: Tensor layout flag, see AudioSaver.save_audio.

    Returns:
        Saved file path.
    """
    return _default_saver.save_audio(
        audio_data,
        output_path,
        sample_rate=sample_rate,
        format=format,
        channels_first=channels_first,
    )
354
+
acestep/constants.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Constants for ACE-Step
Centralized constants used across the codebase
"""

# ==============================================================================
# Language Constants
# ==============================================================================

# Supported languages for vocal generation and language detection.
# Covers major world languages with good TTS support in the underlying model;
# 'unknown' is used when language cannot be determined automatically.
VALID_LANGUAGES = [
    'ar', 'az', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en',
    'es', 'fa', 'fi', 'fr', 'he', 'hi', 'hr', 'ht', 'hu', 'id',
    'is', 'it', 'ja', 'ko', 'la', 'lt', 'ms', 'ne', 'nl', 'no',
    'pa', 'pl', 'pt', 'ro', 'ru', 'sa', 'sk', 'sr', 'sv', 'sw',
    'ta', 'te', 'th', 'tl', 'tr', 'uk', 'ur', 'vi', 'yue', 'zh',
    'unknown'
]


# ==============================================================================
# Keyscale Constants
# ==============================================================================

# Musical note names using standard Western notation
KEYSCALE_NOTES = ['A', 'B', 'C', 'D', 'E', 'F', 'G']

# Supported accidentals: natural (empty), ASCII sharp/flat, Unicode sharp/flat
KEYSCALE_ACCIDENTALS = ['', '#', 'b', '♯', '♭']

# Major and minor scale modes
KEYSCALE_MODES = ['major', 'minor']

# All valid keyscales: 7 notes x 5 accidentals x 2 modes = 70 combinations.
# Examples: "C major", "F# minor", "B♭ major".
# Built with a set comprehension so no loop variables leak into the module
# namespace (the original for-loops left note/acc/mode behind).
VALID_KEYSCALES = {
    f"{note}{acc} {mode}"
    for note in KEYSCALE_NOTES
    for acc in KEYSCALE_ACCIDENTALS
    for mode in KEYSCALE_MODES
}


# ==============================================================================
# Metadata Range Constants
# ==============================================================================

# BPM (Beats Per Minute) range - covers most musical styles:
# 30 BPM: very slow ballads, ambient music;
# 300 BPM: fast electronic dance music, extreme metal.
BPM_MIN = 30
BPM_MAX = 300

# Duration range (seconds) - balances quality vs. computational cost:
# 10s: short loops, musical excerpts;
# 600s: full songs, extended compositions (10 minutes).
DURATION_MIN = 10
DURATION_MAX = 600

# Valid time signatures - common musical meter patterns:
# 2: 2/4 (marches, polka); 3: 3/4 (waltzes, ballads);
# 4: 4/4 (most pop, rock, hip-hop); 6: 6/8 (compound time, folk dances).
VALID_TIME_SIGNATURES = [2, 3, 4, 6]


# ==============================================================================
# Task Type Constants
# ==============================================================================

# All supported generation tasks across different model variants
TASK_TYPES = ["text2music", "repaint", "cover", "extract", "lego", "complete"]

# Task types available for turbo models (optimized subset for speed):
# - text2music: generate from text descriptions
# - repaint: selective audio editing/regeneration
# - cover: style transfer using reference audio
TASK_TYPES_TURBO = ["text2music", "repaint", "cover"]

# Task types available for base models (full feature set).
# Additional tasks requiring more computational resources:
# - extract: separate individual tracks/stems from audio
# - lego: multi-track generation (add layers)
# - complete: automatic completion of partial audio
TASK_TYPES_BASE = ["text2music", "repaint", "cover", "extract", "lego", "complete"]


# ==============================================================================
# Instruction Constants
# ==============================================================================

# Default instructions
DEFAULT_DIT_INSTRUCTION = "Fill the audio semantic mask based on the given conditions:"
DEFAULT_LM_INSTRUCTION = "Generate audio semantic tokens based on the given conditions:"
DEFAULT_LM_UNDERSTAND_INSTRUCTION = "Understand the given musical conditions and describe the audio semantics accordingly:"
DEFAULT_LM_INSPIRED_INSTRUCTION = "Expand the user's input into a more detailed and specific musical description:"
DEFAULT_LM_REWRITE_INSTRUCTION = "Format the user's input into a more detailed and specific musical description:"

# Instruction templates for each task type.
# Note: some instructions use placeholders like {TRACK_NAME} or {TRACK_CLASSES};
# format them with .format() or f-strings when used. The *_default entries are
# the placeholder-free variants.
TASK_INSTRUCTIONS = {
    "text2music": "Fill the audio semantic mask based on the given conditions:",
    "repaint": "Repaint the mask area based on the given conditions:",
    "cover": "Generate audio semantic tokens based on the given conditions:",
    "extract": "Extract the {TRACK_NAME} track from the audio:",
    "extract_default": "Extract the track from the audio:",
    "lego": "Generate the {TRACK_NAME} track based on the audio context:",
    "lego_default": "Generate the track based on the audio context:",
    "complete": "Complete the input track with {TRACK_CLASSES}:",
    "complete_default": "Complete the input track:",
}


# ==============================================================================
# Track/Instrument Constants
# ==============================================================================

# Supported instrumental track types for multi-track generation and extraction.
# Organized by instrument families for logical grouping:
# - wind instruments: woodwinds, brass
# - electronic: fx (effects), synth (synthesizer)
# - string instruments: strings, guitar, bass
# - rhythm section: percussion, drums, keyboard
# - vocals: backing_vocals, vocals (lead vocals)
TRACK_NAMES = [
    "woodwinds", "brass", "fx", "synth", "strings", "percussion",
    "keyboard", "guitar", "bass", "drums", "backing_vocals", "vocals"
]

# Template for SFT (Supervised Fine-Tuning) model prompts.
# Positional slots: instruction, caption, metadata (in that order).
SFT_GEN_PROMPT = """# Instruction
{}

# Caption
{}

# Metas
{}<|endoftext|>
"""


# ==============================================================================
# GPU Memory Configuration Constants
# ==============================================================================

# GPU tier thresholds (in GB); ">= 24GB" is treated as unlimited.
GPU_TIER_THRESHOLDS = {
    "tier1": 4,   # <= 4GB
    "tier2": 6,   # 4-6GB
    "tier3": 8,   # 6-8GB
    "tier4": 12,  # 8-12GB
    "tier5": 16,  # 12-16GB
    "tier6": 24,  # 16-24GB
}

# LM model memory requirements (in GB)
LM_MODEL_MEMORY_GB = {
    "0.6B": 3.0,
    "1.7B": 8.0,
    "4B": 12.0,
}

# LM model size -> checkpoint name mapping
LM_MODEL_NAMES = {
    "0.6B": "acestep-5Hz-lm-0.6B",
    "1.7B": "acestep-5Hz-lm-1.7B",
    "4B": "acestep-5Hz-lm-4B",
}


# ==============================================================================
# Debug Constants
# ==============================================================================

# Tensor debug mode (values: "OFF" | "ON" | "VERBOSE")
TENSOR_DEBUG_MODE = "OFF"

# Placeholder debug switches for other main functionality (default "OFF").
# Update names/usage as features adopt them.
DEBUG_API_SERVER = "OFF"
DEBUG_INFERENCE = "OFF"
DEBUG_TRAINING = "OFF"
DEBUG_DATASET = "OFF"
DEBUG_AUDIO = "OFF"
DEBUG_LLM = "OFF"
DEBUG_UI = "OFF"
DEBUG_MODEL_LOADING = "OFF"
DEBUG_GPU = "OFF"
acestep/constrained_logits_processor.py ADDED
The diff for this file is too large to render. See raw diff
 
acestep/dit_alignment_score.py ADDED
@@ -0,0 +1,877 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DiT Alignment Score Module
3
+
4
+ This module provides lyrics-to-audio alignment using cross-attention matrices
5
+ from DiT model for generating LRC timestamps.
6
+
7
+ Refactored from lyrics_alignment_infos.py for integration with ACE-Step.
8
+ """
9
+ import numba
10
+ import torch
11
+ import numpy as np
12
+ import torch.nn.functional as F
13
+ from dataclasses import dataclass, asdict
14
+ from typing import List, Dict, Any, Optional, Tuple, Union
15
+
16
+
17
+ # ================= Data Classes =================
18
@dataclass
class TokenTimestamp:
    """Per-token timing information produced by the aligner."""
    token_id: int       # tokenizer vocabulary ID of this token
    text: str           # decoded text contributed by this token
    start: float        # start time in seconds
    end: float          # end time in seconds
    probability: float  # alignment confidence score for this token
26
+
27
+
28
@dataclass
class SentenceTimestamp:
    """Per-sentence timing information with its constituent token list."""
    text: str                      # sentence text
    start: float                   # sentence start time in seconds
    end: float                     # sentence end time in seconds
    tokens: List[TokenTimestamp]   # per-token timestamps within the sentence
    confidence: float              # aggregate alignment confidence
36
+
37
+
38
+ # ================= DTW Algorithm (Numba Optimized) =================
39
@numba.jit(nopython=True)
def dtw_cpu(x: np.ndarray):
    """
    Dynamic Time Warping algorithm optimized with Numba.

    Finds the minimum-cost monotonic path through the cost matrix, aligning
    the text axis (rows) with the time axis (columns).

    Args:
        x: Cost matrix of shape [N, M] (text tokens x time frames);
           lower values indicate a better match.

    Returns:
        Tuple of (text_indices, time_indices) arrays describing the
        optimal alignment path, via _backtrace.
    """
    N, M = x.shape
    # Use float32 for memory efficiency; row/column 0 form the DP boundary.
    cost = np.ones((N + 1, M + 1), dtype=np.float32) * np.inf
    trace = -np.ones((N + 1, M + 1), dtype=np.float32)
    cost[0, 0] = 0

    for j in range(1, M + 1):
        for i in range(1, N + 1):
            # Candidate predecessors: diagonal, above (text step), left (time step).
            c0 = cost[i - 1, j - 1]
            c1 = cost[i - 1, j]
            c2 = cost[i, j - 1]

            # t records the chosen move: 0 = diagonal, 1 = up, 2 = left
            # (ties fall through to the left move).
            if c0 < c1 and c0 < c2:
                c, t = c0, 0
            elif c1 < c0 and c1 < c2:
                c, t = c1, 1
            else:
                c, t = c2, 2

            cost[i, j] = x[i - 1, j - 1] + c
            trace[i, j] = t

    # Recover the aligned index pairs by walking the trace matrix backwards.
    return _backtrace(trace, N, M)
73
+
74
+
75
@numba.jit(nopython=True)
def _backtrace(trace: np.ndarray, N: int, M: int):
    """
    Optimized backtrace function for DTW.

    Walks the trace matrix from (N, M) back to the origin, recording the
    aligned (text, time) index pairs along the optimal path.

    Args:
        trace: Trace matrix of shape (N+1, M+1); entries encode the move
            taken at each cell (0 = diagonal, 1 = up, 2 = left).
        N, M: Original cost-matrix dimensions.

    Returns:
        Path array of shape (2, path_len) - first row is text indices,
        second is time indices, in forward (chronological) order.
    """
    # Boundary handling: force moves along the first row/column toward (0, 0).
    trace[0, :] = 2
    trace[:, 0] = 1

    # Pre-allocate and fill from the back; max path length is N + M.
    max_path_len = N + M
    path = np.zeros((2, max_path_len), dtype=np.int32)

    i, j = N, M
    path_idx = max_path_len - 1

    while i > 0 or j > 0:
        path[0, path_idx] = i - 1  # text index
        path[1, path_idx] = j - 1  # time index
        path_idx -= 1

        t = trace[i, j]
        if t == 0:
            i -= 1
            j -= 1
        elif t == 1:
            i -= 1
        elif t == 2:
            j -= 1
        else:
            # Defensive: unknown trace value, stop to avoid an infinite loop.
            break

    # Return only the filled tail of the pre-allocated buffer.
    # (Fix: dropped the unused `actual_len` local the original computed.)
    return path[:, path_idx + 1:max_path_len]
116
+
117
+
118
+ # ================= Utility Functions =================
119
def median_filter(signal: torch.Tensor, filter_width: int) -> torch.Tensor:
    """
    Apply a sliding median filter along the last dimension.

    The signal is reflect-padded by ``filter_width // 2`` on each side so
    the output length matches the input. Inputs whose last dimension is not
    longer than the pad width are returned unchanged.

    Args:
        signal: Input tensor; a 2-D tensor is treated as a single batch item.
        filter_width: Width of the median window.

    Returns:
        Median-filtered tensor.
    """
    half = filter_width // 2
    if signal.shape[-1] <= half:
        # Too short to reflect-pad — nothing to filter.
        return signal

    batched = signal[None, :] if signal.ndim == 2 else signal
    padded = F.pad(batched, (half, half, 0, 0), mode="reflect")
    # Sliding windows over the last dim, then pick each window's median.
    windows = padded.unfold(-1, filter_width, 1)
    filtered = windows.sort()[0][..., half]
    return filtered.squeeze(0) if filtered.ndim > 2 else filtered
140
+
141
+
142
+ # ================= Main Aligner Class =================
143
+ class MusicStampsAligner:
144
+ """
145
+ Aligner class for generating lyrics timestamps from cross-attention matrices.
146
+
147
+ Uses bidirectional consensus denoising and DTW for alignment.
148
+ """
149
+
150
    def __init__(self, tokenizer):
        """
        Initialize the aligner.

        Args:
            tokenizer: Text tokenizer used to decode lyric token IDs; must
                expose ``decode(token_ids, skip_special_tokens=...)`` as
                called by ``_decode_tokens_incrementally``.
        """
        self.tokenizer = tokenizer
158
+
159
    def _apply_bidirectional_consensus(
        self,
        weights_stack: torch.Tensor,
        violence_level: float,
        medfilt_width: int
    ) -> tuple:
        """
        Core denoising logic using bidirectional consensus.

        Multiplies the token->frame and frame->token softmax distributions so
        only cells strong in BOTH directions survive, suppresses row/column
        medians to kill crossing lines, sharpens, then normalizes and
        median-filters the result.

        Args:
            weights_stack: Attention weights [Heads, Tokens, Frames]
            violence_level: Denoising strength coefficient (scales how much
                of the row/column median is subtracted)
            medfilt_width: Median filter width

        Returns:
            Tuple of (calc_matrix, energy_matrix) as numpy arrays; both are
            head-averaged [Tokens, Frames]. calc_matrix is the z-scored,
            filtered matrix used for DTW; energy_matrix is the un-normalized
            consensus energy used for confidence.
        """
        # A. Bidirectional consensus: agreement of both softmax directions.
        row_prob = F.softmax(weights_stack, dim=-1)  # Token -> Frame
        col_prob = F.softmax(weights_stack, dim=-2)  # Frame -> Token
        processed = row_prob * col_prob

        # 1. Row suppression (kill horizontal crossing lines): subtract a
        # multiple of each row's median, clamp negatives to zero.
        row_medians = torch.quantile(processed, 0.5, dim=-1, keepdim=True)
        processed = processed - (violence_level * row_medians)
        processed = torch.relu(processed)

        # 2. Column suppression (kill vertical crossing lines).
        col_medians = torch.quantile(processed, 0.5, dim=-2, keepdim=True)
        processed = processed - (violence_level * col_medians)
        processed = torch.relu(processed)

        # C. Power sharpening: emphasize strong consensus cells.
        processed = processed ** 2

        # Energy matrix for confidence (head-averaged, before normalization).
        energy_matrix = processed.mean(dim=0).cpu().numpy()

        # D. Z-Score normalization over the whole stack (epsilon avoids /0).
        std, mean = torch.std_mean(processed, unbiased=False)
        weights_processed = (processed - mean) / (std + 1e-9)

        # E. Median filtering along the frame axis, then average over heads.
        weights_processed = median_filter(weights_processed, filter_width=medfilt_width)
        calc_matrix = weights_processed.mean(dim=0).numpy()

        return calc_matrix, energy_matrix
206
+
207
+ def _preprocess_attention(
208
+ self,
209
+ attention_matrix: torch.Tensor,
210
+ custom_config: Dict[int, List[int]],
211
+ violence_level: float,
212
+ medfilt_width: int = 7
213
+ ) -> tuple:
214
+ """
215
+ Preprocess attention matrix for alignment.
216
+
217
+ Args:
218
+ attention_matrix: Attention tensor [Layers, Heads, Tokens, Frames]
219
+ custom_config: Dict mapping layer indices to head indices
220
+ violence_level: Denoising strength
221
+ medfilt_width: Median filter width
222
+
223
+ Returns:
224
+ Tuple of (calc_matrix, energy_matrix, visual_matrix)
225
+ """
226
+ if not isinstance(attention_matrix, torch.Tensor):
227
+ weights = torch.tensor(attention_matrix)
228
+ else:
229
+ weights = attention_matrix.clone()
230
+
231
+ weights = weights.cpu().float()
232
+
233
+ selected_tensors = []
234
+ for layer_idx, head_indices in custom_config.items():
235
+ for head_idx in head_indices:
236
+ if layer_idx < weights.shape[0] and head_idx < weights.shape[1]:
237
+ head_matrix = weights[layer_idx, head_idx]
238
+ selected_tensors.append(head_matrix)
239
+
240
+ if not selected_tensors:
241
+ return None, None, None
242
+
243
+ # Stack selected heads: [Heads, Tokens, Frames]
244
+ weights_stack = torch.stack(selected_tensors, dim=0)
245
+ visual_matrix = weights_stack.mean(dim=0).numpy()
246
+
247
+ calc_matrix, energy_matrix = self._apply_bidirectional_consensus(
248
+ weights_stack, violence_level, medfilt_width
249
+ )
250
+
251
+ return calc_matrix, energy_matrix, visual_matrix
252
+
253
+ def stamps_align_info(
254
+ self,
255
+ attention_matrix: torch.Tensor,
256
+ lyrics_tokens: List[int],
257
+ total_duration_seconds: float,
258
+ custom_config: Dict[int, List[int]],
259
+ return_matrices: bool = False,
260
+ violence_level: float = 2.0,
261
+ medfilt_width: int = 1
262
+ ) -> Dict[str, Any]:
263
+ """
264
+ Get alignment information from attention matrix.
265
+
266
+ Args:
267
+ attention_matrix: Cross-attention tensor [Layers, Heads, Tokens, Frames]
268
+ lyrics_tokens: List of lyrics token IDs
269
+ total_duration_seconds: Total audio duration in seconds
270
+ custom_config: Dict mapping layer indices to head indices
271
+ return_matrices: Whether to return intermediate matrices
272
+ violence_level: Denoising strength
273
+ medfilt_width: Median filter width
274
+
275
+ Returns:
276
+ Dict containing calc_matrix, lyrics_tokens, total_duration_seconds,
277
+ and optionally energy_matrix and vis_matrix
278
+ """
279
+ calc_matrix, energy_matrix, visual_matrix = self._preprocess_attention(
280
+ attention_matrix, custom_config, violence_level, medfilt_width
281
+ )
282
+
283
+ if calc_matrix is None:
284
+ return {
285
+ "calc_matrix": None,
286
+ "lyrics_tokens": lyrics_tokens,
287
+ "total_duration_seconds": total_duration_seconds,
288
+ "error": "No valid attention heads found"
289
+ }
290
+
291
+ return_dict = {
292
+ "calc_matrix": calc_matrix,
293
+ "lyrics_tokens": lyrics_tokens,
294
+ "total_duration_seconds": total_duration_seconds
295
+ }
296
+
297
+ if return_matrices:
298
+ return_dict['energy_matrix'] = energy_matrix
299
+ return_dict['vis_matrix'] = visual_matrix
300
+
301
+ return return_dict
302
+
303
+ def _decode_tokens_incrementally(self, token_ids: List[int]) -> List[str]:
304
+ """
305
+ Decode tokens incrementally to properly handle multi-byte UTF-8 characters.
306
+
307
+ For Chinese and other multi-byte characters, the tokenizer may split them
308
+ into multiple byte-level tokens. Decoding each token individually produces
309
+ invalid UTF-8 sequences (showing as �). This method uses byte-level comparison
310
+ to correctly track which characters each token contributes.
311
+
312
+ Args:
313
+ token_ids: List of token IDs
314
+
315
+ Returns:
316
+ List of decoded text for each token position
317
+ """
318
+ decoded_tokens = []
319
+ prev_bytes = b""
320
+
321
+ for i in range(len(token_ids)):
322
+ # Decode tokens from start to current position
323
+ current_text = self.tokenizer.decode(token_ids[:i+1], skip_special_tokens=False)
324
+ current_bytes = current_text.encode('utf-8', errors='surrogatepass')
325
+
326
+ # The contribution of current token is the new bytes added
327
+ if len(current_bytes) >= len(prev_bytes):
328
+ new_bytes = current_bytes[len(prev_bytes):]
329
+ # Try to decode the new bytes; if incomplete, use empty string
330
+ try:
331
+ token_text = new_bytes.decode('utf-8')
332
+ except UnicodeDecodeError:
333
+ # Incomplete UTF-8 sequence, this token doesn't complete a character
334
+ token_text = ""
335
+ else:
336
+ # Edge case: current decode is shorter (shouldn't happen normally)
337
+ token_text = ""
338
+
339
+ decoded_tokens.append(token_text)
340
+ prev_bytes = current_bytes
341
+
342
+ return decoded_tokens
343
+
344
    def token_timestamps(
        self,
        calc_matrix: np.ndarray,
        lyrics_tokens: List[int],
        total_duration_seconds: float
    ) -> List[TokenTimestamp]:
        """
        Generate per-token timestamps by DTW alignment over the attention matrix.

        Args:
            calc_matrix: Processed attention matrix [Tokens, Frames].
            lyrics_tokens: List of token IDs (one per matrix row).
            total_duration_seconds: Total audio duration in seconds.

        Returns:
            List of TokenTimestamp objects, one per input token, in order.
        """
        n_frames = calc_matrix.shape[-1]
        # dtw_cpu minimizes cost, so negate the similarity-like matrix.
        text_indices, time_indices = dtw_cpu(-calc_matrix.astype(np.float64))

        seconds_per_frame = total_duration_seconds / n_frames
        alignment_results = []

        # Use incremental decoding to properly handle multi-byte UTF-8 characters
        decoded_tokens = self._decode_tokens_incrementally(lyrics_tokens)

        for i in range(len(lyrics_tokens)):
            # Frames that the DTW path assigned to token i.
            mask = (text_indices == i)

            if not np.any(mask):
                # Token absent from the DTW path: give it a zero-length span
                # starting where the previous token ended (or 0.0 for the first).
                start = alignment_results[-1].end if alignment_results else 0.0
                end = start
                token_conf = 0.0
            else:
                times = time_indices[mask] * seconds_per_frame
                start = times[0]
                end = times[-1]
                # NOTE(review): per-token confidence is currently always 0.0;
                # downstream sentence scoring treats 0 as "no score".
                token_conf = 0.0

            # Guard against a degenerate (reversed) span.
            if end < start:
                end = start

            alignment_results.append(TokenTimestamp(
                token_id=lyrics_tokens[i],
                text=decoded_tokens[i],
                start=float(start),
                end=float(end),
                probability=token_conf
            ))

        return alignment_results
395
+
396
+ def _decode_sentence_from_tokens(self, tokens: List[TokenTimestamp]) -> str:
397
+ """
398
+ Decode a sentence by decoding all token IDs together.
399
+ This avoids UTF-8 encoding issues from joining individual token texts.
400
+
401
+ Args:
402
+ tokens: List of TokenTimestamp objects
403
+
404
+ Returns:
405
+ Properly decoded sentence text
406
+ """
407
+ token_ids = [t.token_id for t in tokens]
408
+ return self.tokenizer.decode(token_ids, skip_special_tokens=False)
409
+
410
+ def sentence_timestamps(
411
+ self,
412
+ token_alignment: List[TokenTimestamp]
413
+ ) -> List[SentenceTimestamp]:
414
+ """
415
+ Group token timestamps into sentence timestamps.
416
+
417
+ Args:
418
+ token_alignment: List of TokenTimestamp objects
419
+
420
+ Returns:
421
+ List of SentenceTimestamp objects
422
+ """
423
+ results = []
424
+ current_tokens = []
425
+
426
+ for token in token_alignment:
427
+ current_tokens.append(token)
428
+
429
+ if '\n' in token.text:
430
+ # Decode all token IDs together to avoid UTF-8 issues
431
+ full_text = self._decode_sentence_from_tokens(current_tokens)
432
+
433
+ if full_text.strip():
434
+ valid_scores = [t.probability for t in current_tokens if t.probability > 0]
435
+ sent_conf = sum(valid_scores) / len(valid_scores) if valid_scores else 0.0
436
+
437
+ results.append(SentenceTimestamp(
438
+ text=full_text.strip(),
439
+ start=round(current_tokens[0].start, 3),
440
+ end=round(current_tokens[-1].end, 3),
441
+ tokens=list(current_tokens),
442
+ confidence=sent_conf
443
+ ))
444
+
445
+ current_tokens = []
446
+
447
+ # Handle last sentence
448
+ if current_tokens:
449
+ # Decode all token IDs together to avoid UTF-8 issues
450
+ full_text = self._decode_sentence_from_tokens(current_tokens)
451
+ if full_text.strip():
452
+ valid_scores = [t.probability for t in current_tokens if t.probability > 0]
453
+ sent_conf = sum(valid_scores) / len(valid_scores) if valid_scores else 0.0
454
+
455
+ results.append(SentenceTimestamp(
456
+ text=full_text.strip(),
457
+ start=round(current_tokens[0].start, 3),
458
+ end=round(current_tokens[-1].end, 3),
459
+ tokens=list(current_tokens),
460
+ confidence=sent_conf
461
+ ))
462
+
463
+ # Normalize confidence scores
464
+ if results:
465
+ all_scores = [s.confidence for s in results]
466
+ min_score = min(all_scores)
467
+ max_score = max(all_scores)
468
+ score_range = max_score - min_score
469
+
470
+ if score_range > 1e-9:
471
+ for s in results:
472
+ normalized_score = (s.confidence - min_score) / score_range
473
+ s.confidence = round(normalized_score, 2)
474
+ else:
475
+ for s in results:
476
+ s.confidence = round(s.confidence, 2)
477
+
478
+ return results
479
+
480
+ def format_lrc(
481
+ self,
482
+ sentence_timestamps: List[SentenceTimestamp],
483
+ include_end_time: bool = False
484
+ ) -> str:
485
+ """
486
+ Format sentence timestamps as LRC lyrics format.
487
+
488
+ Args:
489
+ sentence_timestamps: List of SentenceTimestamp objects
490
+ include_end_time: Whether to include end time (enhanced LRC format)
491
+
492
+ Returns:
493
+ LRC formatted string
494
+ """
495
+ lines = []
496
+
497
+ for sentence in sentence_timestamps:
498
+ # Convert seconds to mm:ss.xx format
499
+ start_minutes = int(sentence.start // 60)
500
+ start_seconds = sentence.start % 60
501
+
502
+ if include_end_time:
503
+ end_minutes = int(sentence.end // 60)
504
+ end_seconds = sentence.end % 60
505
+ timestamp = f"[{start_minutes:02d}:{start_seconds:05.2f}][{end_minutes:02d}:{end_seconds:05.2f}]"
506
+ else:
507
+ timestamp = f"[{start_minutes:02d}:{start_seconds:05.2f}]"
508
+
509
+ # Clean the text (remove structural tags like [verse], [chorus])
510
+ text = sentence.text
511
+
512
+ lines.append(f"{timestamp}{text}")
513
+
514
+ return "\n".join(lines)
515
+
516
+ def get_timestamps_and_lrc(
517
+ self,
518
+ calc_matrix: np.ndarray,
519
+ lyrics_tokens: List[int],
520
+ total_duration_seconds: float
521
+ ) -> Dict[str, Any]:
522
+ """
523
+ Convenience method to get both timestamps and LRC in one call.
524
+
525
+ Args:
526
+ calc_matrix: Processed attention matrix
527
+ lyrics_tokens: List of token IDs
528
+ total_duration_seconds: Total audio duration
529
+
530
+ Returns:
531
+ Dict containing token_timestamps, sentence_timestamps, and lrc_text
532
+ """
533
+ token_stamps = self.token_timestamps(
534
+ calc_matrix=calc_matrix,
535
+ lyrics_tokens=lyrics_tokens,
536
+ total_duration_seconds=total_duration_seconds
537
+ )
538
+
539
+ sentence_stamps = self.sentence_timestamps(token_stamps)
540
+ lrc_text = self.format_lrc(sentence_stamps)
541
+
542
+ return {
543
+ "token_timestamps": token_stamps,
544
+ "sentence_timestamps": sentence_stamps,
545
+ "lrc_text": lrc_text
546
+ }
547
+
548
+
549
class MusicLyricScorer:
    """
    Scorer class for evaluating lyrics-to-audio alignment quality.

    Focuses on calculating alignment quality metrics (Coverage, Monotonicity,
    Confidence) using tensor operations for potential differentiability or
    GPU acceleration.
    """

    def __init__(self, tokenizer: Any):
        """
        Initialize the aligner.

        Args:
            tokenizer: Tokenizer instance (must implement .decode()).
        """
        self.tokenizer = tokenizer

    def _generate_token_type_mask(self, token_ids: List[int]) -> np.ndarray:
        """
        Generate a mask distinguishing lyrics (1) from structural tags (0).
        Uses self.tokenizer to decode tokens.

        Tokens from an opening '[' through the matching ']' (inclusive) are
        treated as structural-tag tokens (e.g. "[verse]") and masked out.

        Args:
            token_ids: List of token IDs.

        Returns:
            Numpy array of shape [len(token_ids)] with 1 or 0.
        """
        decoded_tokens = [self.tokenizer.decode([tid]) for tid in token_ids]
        mask = np.ones(len(token_ids), dtype=np.int32)
        in_bracket = False

        for i, token_str in enumerate(decoded_tokens):
            if '[' in token_str:
                in_bracket = True
            if in_bracket:
                mask[i] = 0
            if ']' in token_str:
                in_bracket = False
                # The token carrying ']' is itself still part of the tag.
                mask[i] = 0
        return mask

    def _preprocess_attention(
        self,
        attention_matrix: Union[torch.Tensor, np.ndarray],
        custom_config: Dict[int, List[int]],
        medfilt_width: int = 1
    ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[torch.Tensor]]:
        """
        Extracts and normalizes the attention matrix.

        Logic V4: Uses Min-Max normalization to highlight energy differences.

        Args:
            attention_matrix: Raw attention tensor [Layers, Heads, Tokens, Frames].
            custom_config: Config mapping layer indices to lists of head indices.
            medfilt_width: Width for median filtering.

        Returns:
            Tuple of (calc_matrix, energy_matrix, avg_weights_tensor);
            (None, None, None) when no configured head exists in the tensor.
        """
        # 1. Prepare Tensor (always work on a CPU float32 copy)
        if not isinstance(attention_matrix, torch.Tensor):
            weights = torch.tensor(attention_matrix)
        else:
            weights = attention_matrix.clone()
        weights = weights.cpu().float()

        # 2. Select Heads based on config (silently skip out-of-range indices)
        selected_tensors = []
        for layer_idx, head_indices in custom_config.items():
            for head_idx in head_indices:
                if layer_idx < weights.shape[0] and head_idx < weights.shape[1]:
                    selected_tensors.append(weights[layer_idx, head_idx])

        if not selected_tensors:
            return None, None, None

        weights_stack = torch.stack(selected_tensors, dim=0)

        # 3. Average Heads
        avg_weights = weights_stack.mean(dim=0)  # [Tokens, Frames]

        # 4. Preprocessing Logic
        # Min-Max normalization preserving energy distribution
        # Median filter is applied to the energy matrix
        energy_tensor = median_filter(avg_weights, filter_width=medfilt_width)
        energy_matrix = energy_tensor.numpy()

        e_min, e_max = energy_matrix.min(), energy_matrix.max()

        if e_max - e_min > 1e-9:
            energy_matrix = (energy_matrix - e_min) / (e_max - e_min)
        else:
            # Flat matrix: no usable signal, normalize to all zeros.
            energy_matrix = np.zeros_like(energy_matrix)

        # Contrast enhancement for DTW pathfinding
        # calc_matrix is used for pathfinding, energy_matrix for scoring
        calc_matrix = energy_matrix ** 2

        return calc_matrix, energy_matrix, avg_weights

    def _compute_alignment_metrics(
        self,
        energy_matrix: torch.Tensor,
        path_coords: torch.Tensor,
        type_mask: torch.Tensor,
        time_weight: float = 0.01,
        overlap_frames: float = 9.0,
        instrumental_weight: float = 1.0
    ) -> Tuple[float, float, float]:
        """
        Core metric calculation logic using high-precision Tensor operations.

        Args:
            energy_matrix: Normalized energy [Rows, Cols].
            path_coords: DTW path coordinates [Steps, 2] (row, col per step).
            type_mask: Token type mask [Rows] (1=Lyrics, 0=Tags).
            time_weight: Minimum energy threshold for monotonicity.
            overlap_frames: Allowed backward overlap (frames) for monotonicity.
            instrumental_weight: Weight for non-lyric steps in confidence calc
                (default 1.0 means tag steps count the same as lyric steps).

        Returns:
            Tuple of (coverage, monotonicity, confidence), each a Python float.
        """
        # Ensure high precision for internal calculation
        energy_matrix = energy_matrix.to(dtype=torch.float64)
        path_coords = path_coords.long()
        type_mask = type_mask.long()

        device = energy_matrix.device
        rows, cols = energy_matrix.shape

        is_lyrics_row = (type_mask == 1)

        # ================= A. Coverage Score =================
        # Ratio of lyric lines that have significant energy peak
        row_max_energies = energy_matrix.max(dim=1).values
        total_sung_rows = is_lyrics_row.sum().double()

        coverage_threshold = 0.1
        valid_sung_mask = is_lyrics_row & (row_max_energies > coverage_threshold)
        valid_sung_rows = valid_sung_mask.sum().double()

        if total_sung_rows > 0:
            coverage_score = valid_sung_rows / total_sung_rows
        else:
            # No lyric rows at all: vacuously full coverage.
            coverage_score = torch.tensor(1.0, device=device, dtype=torch.float64)

        # ================= B. Monotonicity Score =================
        # Check if the "center of mass" of lyric lines moves forward in time
        col_indices = torch.arange(cols, device=device, dtype=torch.float64)

        # Zero out low energy noise
        weights = torch.where(
            energy_matrix > time_weight,
            energy_matrix,
            torch.zeros_like(energy_matrix)
        )

        sum_w = weights.sum(dim=1)
        sum_t = (weights * col_indices).sum(dim=1)

        # Calculate centroids (-1 marks rows with no energy above threshold)
        centroids = torch.full((rows,), -1.0, device=device, dtype=torch.float64)
        valid_w_mask = sum_w > 1e-9
        centroids[valid_w_mask] = sum_t[valid_w_mask] / sum_w[valid_w_mask]

        # Extract sequence of valid lyrics centroids
        valid_sequence_mask = is_lyrics_row & (centroids >= 0)
        sung_centroids = centroids[valid_sequence_mask]

        cnt = sung_centroids.shape[0]
        if cnt > 1:
            curr_c = sung_centroids[:-1]
            next_c = sung_centroids[1:]

            # Check non-decreasing order with overlap tolerance
            non_decreasing = (next_c >= (curr_c - overlap_frames)).double().sum()
            pairs = torch.tensor(cnt - 1, device=device, dtype=torch.float64)
            monotonicity_score = non_decreasing / pairs
        else:
            # Zero or one centroid: trivially monotonic.
            monotonicity_score = torch.tensor(1.0, device=device, dtype=torch.float64)

        # ================= C. Path Confidence =================
        # Average energy along the optimal path
        if path_coords.shape[0] > 0:
            p_rows = path_coords[:, 0]
            p_cols = path_coords[:, 1]

            path_energies = energy_matrix[p_rows, p_cols]
            step_weights = torch.ones_like(path_energies)

            # Lower weight for instrumental/tag steps
            is_inst_step = (type_mask[p_rows] == 0)
            step_weights[is_inst_step] = instrumental_weight

            total_energy = (path_energies * step_weights).sum()
            total_steps = step_weights.sum()

            if total_steps > 0:
                path_confidence = total_energy / total_steps
            else:
                path_confidence = torch.tensor(0.0, device=device, dtype=torch.float64)
        else:
            path_confidence = torch.tensor(0.0, device=device, dtype=torch.float64)

        return coverage_score.item(), monotonicity_score.item(), path_confidence.item()

    def lyrics_alignment_info(
        self,
        attention_matrix: Union[torch.Tensor, np.ndarray],
        token_ids: List[int],
        custom_config: Dict[int, List[int]],
        return_matrices: bool = False,
        medfilt_width: int = 1
    ) -> Dict[str, Any]:
        """
        Generates alignment path and processed matrices.

        Args:
            attention_matrix: Input attention tensor [Layers, Heads, Tokens, Frames].
            token_ids: Corresponding token IDs.
            custom_config: Layer/Head configuration.
            return_matrices: If True, also returns calc/vis matrices in the output.
            medfilt_width: Median filter width.

        Returns:
            Dict with "path_coords", "type_mask", "energy_matrix" (plus
            "calc_matrix"/"vis_matrix" when return_matrices), or a dict with
            an "error" key when no valid attention head is configured.
        """
        calc_matrix, energy_matrix, vis_matrix = self._preprocess_attention(
            attention_matrix, custom_config, medfilt_width
        )

        if calc_matrix is None:
            return {
                "calc_matrix": None,
                "error": "No valid attention heads found"
            }

        # 1. Generate Semantic Mask (1=Lyrics, 0=Tags)
        # Uses self.tokenizer internally
        type_mask = self._generate_token_type_mask(token_ids)

        # Safety check for shape mismatch
        if len(type_mask) != energy_matrix.shape[0]:
            # Fallback to all lyrics if shapes don't align
            type_mask = np.ones(energy_matrix.shape[0], dtype=np.int32)

        # 2. DTW Pathfinding
        # Using negative calc_matrix because DTW minimizes cost
        # NOTE(review): float32 here vs float64 in the timestamp path — confirm
        # the precision difference is intentional.
        text_indices, time_indices = dtw_cpu(-calc_matrix.astype(np.float32))
        path_coords = np.stack([text_indices, time_indices], axis=1)

        return_dict = {
            "path_coords": path_coords,
            "type_mask": type_mask,
            "energy_matrix": energy_matrix
        }
        if return_matrices:
            return_dict['calc_matrix'] = calc_matrix
            return_dict['vis_matrix'] = vis_matrix

        return return_dict

    def calculate_score(
        self,
        energy_matrix: Union[torch.Tensor, np.ndarray],
        type_mask: Union[torch.Tensor, np.ndarray],
        path_coords: Union[torch.Tensor, np.ndarray],
        time_weight: float = 0.01,
        overlap_frames: float = 9.0,
        instrumental_weight: float = 1.0
    ) -> Dict[str, Any]:
        """
        Calculates the final alignment score based on pre-computed components.

        Args:
            energy_matrix: Processed energy matrix.
            type_mask: Token type mask (1=Lyrics, 0=Tags).
            path_coords: DTW path coordinates.
            time_weight: Minimum energy threshold for monotonicity.
            overlap_frames: Allowed backward movement frames.
            instrumental_weight: Weight for non-lyric path steps.

        Returns:
            Dict with key "lyrics_score": (coverage^2 * monotonicity^2 *
            confidence), clipped to [0, 1] and rounded to 4 decimals.
        """
        # Ensure Inputs are Tensors on the correct device
        if not isinstance(energy_matrix, torch.Tensor):
            # Use available accelerator device; fallback to CPU if none
            if torch.cuda.is_available():
                _score_device = "cuda"
            elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
                _score_device = "mps"
            else:
                _score_device = "cpu"
            energy_matrix = torch.tensor(energy_matrix, device=_score_device, dtype=torch.float32)

        # All other inputs follow the energy matrix's device.
        device = energy_matrix.device

        if not isinstance(type_mask, torch.Tensor):
            type_mask = torch.tensor(type_mask, device=device, dtype=torch.long)
        else:
            type_mask = type_mask.to(device=device, dtype=torch.long)

        if not isinstance(path_coords, torch.Tensor):
            path_coords = torch.tensor(path_coords, device=device, dtype=torch.long)
        else:
            path_coords = path_coords.to(device=device, dtype=torch.long)

        # Compute Metrics
        coverage, monotonicity, confidence = self._compute_alignment_metrics(
            energy_matrix=energy_matrix,
            path_coords=path_coords,
            type_mask=type_mask,
            time_weight=time_weight,
            overlap_frames=overlap_frames,
            instrumental_weight=instrumental_weight
        )

        # Final Score Calculation
        # (Cov^2 * Mono^2 * Conf) — squaring penalizes weak coverage/monotonicity.
        final_score = (coverage ** 2) * (monotonicity ** 2) * confidence
        final_score = float(np.clip(final_score, 0.0, 1.0))

        return {
            "lyrics_score": round(final_score, 4)
        }
acestep/genres_vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
acestep/gpu_config.py ADDED
@@ -0,0 +1,549 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GPU Configuration Module
3
+ Centralized GPU memory detection and adaptive configuration management
4
+
5
+ Debug Mode:
6
+ Set environment variable MAX_CUDA_VRAM to simulate different GPU memory sizes.
7
+ Example: MAX_CUDA_VRAM=8 python acestep # Simulates 8GB GPU
8
+
9
+ For MPS testing, use MAX_MPS_VRAM to simulate MPS memory.
10
+ Example: MAX_MPS_VRAM=16 python acestep # Simulates 16GB MPS
11
+
12
+ This is useful for testing GPU tier configurations on high-end hardware.
13
+ """
14
+
15
+ import os
16
+ import sys
17
+ from dataclasses import dataclass
18
+ from typing import Optional, List, Dict, Tuple
19
+ from loguru import logger
20
+
21
+
22
# Environment variable for debugging/testing different GPU memory configurations
# (read by get_gpu_memory_gb() before any hardware detection).
DEBUG_MAX_CUDA_VRAM_ENV = "MAX_CUDA_VRAM"
DEBUG_MAX_MPS_VRAM_ENV = "MAX_MPS_VRAM"

# Tolerance for 16GB detection: reported VRAM like 15.5GB is effectively 16GB hardware
# Real-world 16GB GPUs often report 15.7-15.9GB due to system/driver reservations
VRAM_16GB_TOLERANCE_GB = 0.5
VRAM_16GB_MIN_GB = 16.0 - VRAM_16GB_TOLERANCE_GB  # treat as 16GB class if >= this

# PyTorch installation URLs for diagnostics (shown when no GPU is detected)
PYTORCH_CUDA_INSTALL_URL = "https://download.pytorch.org/whl/cu121"
PYTORCH_ROCM_INSTALL_URL = "https://download.pytorch.org/whl/rocm6.0"
34
+
35
+
36
@dataclass
class GPUConfig:
    """GPU configuration based on available memory.

    Bundles the generation limits and language-model options that apply to
    one VRAM tier (see GPU_TIER_CONFIGS for the per-tier values).
    """
    tier: str  # "tier1", "tier2", etc. or "unlimited"
    gpu_memory_gb: float  # detected (or debug-simulated) VRAM in GB

    # Duration limits (in seconds)
    max_duration_with_lm: int  # When LM is initialized
    max_duration_without_lm: int  # When LM is not initialized

    # Batch size limits
    max_batch_size_with_lm: int
    max_batch_size_without_lm: int

    # LM configuration
    init_lm_default: bool  # Whether to initialize LM by default
    available_lm_models: List[str]  # Available LM models for this tier

    # LM memory allocation (GB) for each model size
    lm_memory_gb: Dict[str, float]  # e.g., {"0.6B": 3, "1.7B": 8, "4B": 12}
56
+
57
+
58
# GPU tier configurations
# Keys are the tier names returned by get_gpu_tier(); values carry the raw
# fields used to populate GPUConfig for that tier.
GPU_TIER_CONFIGS = {
    "tier1": {  # <= 4GB
        "max_duration_with_lm": 180,  # 3 minutes
        "max_duration_without_lm": 180,  # 3 minutes
        "max_batch_size_with_lm": 1,
        "max_batch_size_without_lm": 1,
        "init_lm_default": False,
        "available_lm_models": [],
        "lm_memory_gb": {},
    },
    "tier2": {  # 4-6GB
        "max_duration_with_lm": 360,  # 6 minutes
        "max_duration_without_lm": 360,  # 6 minutes
        "max_batch_size_with_lm": 1,
        "max_batch_size_without_lm": 1,
        "init_lm_default": False,
        "available_lm_models": [],
        "lm_memory_gb": {},
    },
    "tier3": {  # 6-8GB
        "max_duration_with_lm": 240,  # 4 minutes with LM
        "max_duration_without_lm": 360,  # 6 minutes without LM
        "max_batch_size_with_lm": 1,
        "max_batch_size_without_lm": 2,
        "init_lm_default": False,  # Don't init by default due to limited memory
        "available_lm_models": ["acestep-5Hz-lm-0.6B"],
        "lm_memory_gb": {"0.6B": 3},
    },
    "tier4": {  # 8-12GB
        "max_duration_with_lm": 240,  # 4 minutes with LM
        "max_duration_without_lm": 360,  # 6 minutes without LM
        "max_batch_size_with_lm": 2,
        "max_batch_size_without_lm": 4,
        "init_lm_default": False,  # Don't init by default
        "available_lm_models": ["acestep-5Hz-lm-0.6B"],
        "lm_memory_gb": {"0.6B": 3},
    },
    "tier5": {  # 12-16GB
        "max_duration_with_lm": 240,  # 4 minutes with LM
        "max_duration_without_lm": 360,  # 6 minutes without LM
        "max_batch_size_with_lm": 2,
        "max_batch_size_without_lm": 4,
        "init_lm_default": True,
        "available_lm_models": ["acestep-5Hz-lm-0.6B", "acestep-5Hz-lm-1.7B"],
        "lm_memory_gb": {"0.6B": 3, "1.7B": 8},
    },
    "tier6": {  # 16-24GB
        "max_duration_with_lm": 480,  # 8 minutes
        "max_duration_without_lm": 480,  # 8 minutes
        "max_batch_size_with_lm": 4,
        "max_batch_size_without_lm": 8,
        "init_lm_default": True,
        "available_lm_models": ["acestep-5Hz-lm-0.6B", "acestep-5Hz-lm-1.7B", "acestep-5Hz-lm-4B"],
        "lm_memory_gb": {"0.6B": 3, "1.7B": 8, "4B": 12},
    },
    "unlimited": {  # >= 24GB
        "max_duration_with_lm": 600,  # 10 minutes (max supported)
        "max_duration_without_lm": 600,  # 10 minutes
        "max_batch_size_with_lm": 8,
        "max_batch_size_without_lm": 8,
        "init_lm_default": True,
        "available_lm_models": ["acestep-5Hz-lm-0.6B", "acestep-5Hz-lm-1.7B", "acestep-5Hz-lm-4B"],
        "lm_memory_gb": {"0.6B": 3, "1.7B": 8, "4B": 12},
    },
}
124
+
125
+
126
def get_gpu_memory_gb() -> float:
    """
    Get GPU memory in GB. Returns 0.0 if no GPU is available.

    Detection order: debug overrides, CUDA/ROCm, Intel XPU, Apple MPS,
    then a CPU-only fallback of 0.0.

    Debug Mode:
        Set environment variable MAX_CUDA_VRAM to override the detected GPU memory.
        Example: MAX_CUDA_VRAM=8 python acestep  # Simulates 8GB GPU

        For MPS testing, set MAX_MPS_VRAM to override MPS memory detection.
        Example: MAX_MPS_VRAM=16 python acestep  # Simulates 16GB MPS

        This allows testing different GPU tier configurations on high-end hardware.

    Returns:
        GPU memory in GB as a float; 0.0 when no GPU is detected or
        detection fails.
    """
    # Check for debug overrides first (CUDA override takes precedence).
    debug_vram = os.environ.get(DEBUG_MAX_CUDA_VRAM_ENV)
    if debug_vram is not None:
        try:
            simulated_gb = float(debug_vram)
            logger.warning(f"⚠️ DEBUG MODE: Simulating GPU memory as {simulated_gb:.1f}GB (set via {DEBUG_MAX_CUDA_VRAM_ENV} environment variable)")
            return simulated_gb
        except ValueError:
            logger.warning(f"Invalid {DEBUG_MAX_CUDA_VRAM_ENV} value: {debug_vram}, ignoring")
    debug_mps_vram = os.environ.get(DEBUG_MAX_MPS_VRAM_ENV)
    if debug_mps_vram is not None:
        try:
            simulated_gb = float(debug_mps_vram)
            logger.warning(f"⚠️ DEBUG MODE: Simulating MPS memory as {simulated_gb:.1f}GB (set via {DEBUG_MAX_MPS_VRAM_ENV} environment variable)")
            return simulated_gb
        except ValueError:
            logger.warning(f"Invalid {DEBUG_MAX_MPS_VRAM_ENV} value: {debug_mps_vram}, ignoring")

    try:
        import torch
        if torch.cuda.is_available():
            # Get total memory of the first GPU in GB
            total_memory = torch.cuda.get_device_properties(0).total_memory
            memory_gb = total_memory / (1024**3)  # Convert bytes to GB
            device_name = torch.cuda.get_device_name(0)
            # ROCm builds also report through the torch.cuda API.
            is_rocm = hasattr(torch.version, 'hip') and torch.version.hip is not None
            if is_rocm:
                logger.info(f"ROCm GPU detected: {device_name} ({memory_gb:.1f} GB, HIP {torch.version.hip})")
            else:
                logger.info(f"CUDA GPU detected: {device_name} ({memory_gb:.1f} GB)")
            return memory_gb
        elif hasattr(torch, 'xpu') and torch.xpu.is_available():
            # Get total memory of the first XPU in GB
            total_memory = torch.xpu.get_device_properties(0).total_memory
            memory_gb = total_memory / (1024**3)  # Convert bytes to GB
            # Log for consistency with the CUDA/ROCm branches.
            logger.info(f"XPU detected ({memory_gb:.1f} GB)")
            return memory_gb
        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            mps_module = getattr(torch, "mps", None)
            try:
                # Preferred: torch-reported recommended working-set size.
                if mps_module is not None and hasattr(mps_module, "recommended_max_memory"):
                    total_memory = mps_module.recommended_max_memory()
                    memory_gb = total_memory / (1024**3)  # Convert bytes to GB
                    return memory_gb
                if mps_module is not None and hasattr(mps_module, "get_device_properties"):
                    props = mps_module.get_device_properties(0)
                    total_memory = getattr(props, "total_memory", None)
                    if total_memory:
                        memory_gb = total_memory / (1024**3)
                        return memory_gb
            except Exception as e:
                logger.warning(f"Failed to detect MPS memory: {e}")

            # Fallback: estimate from system unified memory (Apple Silicon shares CPU/GPU RAM)
            try:
                import subprocess
                result = subprocess.run(
                    ["sysctl", "-n", "hw.memsize"],
                    capture_output=True, text=True, timeout=5
                )
                total_system_bytes = int(result.stdout.strip())
                # MPS can use up to ~75% of unified memory for GPU workloads
                memory_gb = (total_system_bytes / (1024**3)) * 0.75
                return memory_gb
            except Exception:
                logger.warning(f"MPS available but total memory not exposed. Set {DEBUG_MAX_MPS_VRAM_ENV} to enable tiering.")
                # Conservative fallback for M1/M2
                return 8.0
        else:
            # No GPU detected - provide diagnostic information
            _log_gpu_diagnostic_info(torch)
            return 0.0
    except Exception as e:
        logger.warning(f"Failed to detect GPU memory: {e}")
        return 0.0
213
+
214
+
215
def _log_gpu_diagnostic_info(torch_module):
    """
    Log diagnostic information when GPU is not detected to help users troubleshoot.

    Branches on the PyTorch build type (ROCm / CUDA / CPU-only) and prints
    the matching troubleshooting checklist. Logging only; no return value.

    Args:
        torch_module: The torch module to inspect for build information
    """
    logger.warning("=" * 80)
    logger.warning("⚠️ GPU NOT DETECTED - DIAGNOSTIC INFORMATION")
    logger.warning("=" * 80)

    # Check PyTorch build type
    is_rocm_build = hasattr(torch_module.version, 'hip') and torch_module.version.hip is not None
    is_cuda_build = hasattr(torch_module.version, 'cuda') and torch_module.version.cuda is not None

    if is_rocm_build:
        logger.warning("✓ PyTorch ROCm build detected")
        logger.warning(f" HIP version: {torch_module.version.hip}")
        logger.warning("")
        logger.warning("❌ torch.cuda.is_available() returned False")
        logger.warning("")
        logger.warning("Common causes for AMD/ROCm GPUs:")
        logger.warning(" 1. ROCm drivers not installed or not properly configured")
        logger.warning(" 2. GPU not supported by installed ROCm version")
        logger.warning(" 3. Missing or incorrect HSA_OVERRIDE_GFX_VERSION environment variable")
        logger.warning(" 4. ROCm runtime libraries not in system path")
        logger.warning("")

        # Check for common environment variables
        hsa_override = os.environ.get('HSA_OVERRIDE_GFX_VERSION')
        if hsa_override:
            logger.warning(f" HSA_OVERRIDE_GFX_VERSION is set to: {hsa_override}")
        else:
            # Suggest per-GPU override values for RDNA3 cards.
            logger.warning(" ⚠️ HSA_OVERRIDE_GFX_VERSION is not set")
            logger.warning(" For RDNA3 GPUs (RX 7000 series, RX 9000 series):")
            logger.warning(" - RX 7900 XT/XTX, RX 9070 XT: set HSA_OVERRIDE_GFX_VERSION=11.0.0")
            logger.warning(" - RX 7800 XT, RX 7700 XT: set HSA_OVERRIDE_GFX_VERSION=11.0.1")
            logger.warning(" - RX 7600: set HSA_OVERRIDE_GFX_VERSION=11.0.2")

        logger.warning("")
        logger.warning("Troubleshooting steps:")
        logger.warning(" 1. Verify ROCm installation:")
        logger.warning(" rocm-smi # Should list your GPU")
        logger.warning(" 2. Check PyTorch ROCm build:")
        logger.warning(" python -c \"import torch; print(f'ROCm: {torch.version.hip}')\"")
        logger.warning(" 3. Set HSA_OVERRIDE_GFX_VERSION for your GPU (see above)")
        logger.warning(" 4. On Windows: Use start_gradio_ui_rocm.bat which sets required env vars")
        logger.warning(" 5. See docs/en/ACE-Step1.5-Rocm-Manual-Linux.md for Linux setup")
        logger.warning(" 6. See requirements-rocm.txt for Windows ROCm setup instructions")

    elif is_cuda_build:
        logger.warning("✓ PyTorch CUDA build detected")
        logger.warning(f" CUDA version: {torch_module.version.cuda}")
        logger.warning("")
        logger.warning("❌ torch.cuda.is_available() returned False")
        logger.warning("")
        logger.warning("Common causes for NVIDIA GPUs:")
        logger.warning(" 1. NVIDIA drivers not installed")
        logger.warning(" 2. CUDA runtime not installed or version mismatch")
        logger.warning(" 3. GPU not supported by installed CUDA version")
        logger.warning("")
        logger.warning("Troubleshooting steps:")
        logger.warning(" 1. Verify NVIDIA driver installation:")
        logger.warning(" nvidia-smi # Should list your GPU")
        logger.warning(" 2. Check CUDA version compatibility")
        logger.warning(" 3. Reinstall PyTorch with CUDA support:")
        logger.warning(f" pip install torch --index-url {PYTORCH_CUDA_INSTALL_URL}")

    else:
        # Neither CUDA nor HIP version info present: CPU-only wheel.
        logger.warning("⚠️ PyTorch build type: CPU-only")
        logger.warning("")
        logger.warning("You have installed a CPU-only version of PyTorch!")
        logger.warning("")
        logger.warning("For NVIDIA GPUs:")
        logger.warning(f" pip install torch --index-url {PYTORCH_CUDA_INSTALL_URL}")
        logger.warning("")
        logger.warning("For AMD GPUs with ROCm:")
        logger.warning(" Windows: See requirements-rocm.txt for detailed instructions")
        logger.warning(f" Linux: pip install torch --index-url {PYTORCH_ROCM_INSTALL_URL}")
        logger.warning("")
        logger.warning("For more information, see README.md section 'AMD / ROCm GPUs'")

    logger.warning("=" * 80)
298
+
299
+
300
def get_gpu_tier(gpu_memory_gb: float) -> str:
    """
    Determine GPU tier based on available memory.

    Args:
        gpu_memory_gb: GPU memory in GB

    Returns:
        Tier string: "tier1", "tier2", "tier3", "tier4", "tier5", "tier6", or "unlimited"
    """
    # Thresholds are inclusive upper bounds; each tier maps to an entry in
    # GPU_TIER_CONFIGS (defined elsewhere in this module).
    if gpu_memory_gb <= 0:
        # CPU mode - use tier1 limits
        return "tier1"
    elif gpu_memory_gb <= 4:
        return "tier1"
    elif gpu_memory_gb <= 6:
        return "tier2"
    elif gpu_memory_gb <= 8:
        return "tier3"
    elif gpu_memory_gb <= 12:
        return "tier4"
    elif gpu_memory_gb < VRAM_16GB_MIN_GB:
        # NOTE(review): VRAM_16GB_MIN_GB is presumably slightly below 16.0 so
        # that "16GB" cards reporting a bit less still reach tier6 — confirm
        # against the constant's definition.
        return "tier5"
    elif gpu_memory_gb <= 24:
        if gpu_memory_gb < 16.0:
            # Cards in [VRAM_16GB_MIN_GB, 16.0) are promoted to the 16GB class.
            logger.info(f"Detected {gpu_memory_gb:.2f}GB VRAM — treating as 16GB class GPU")
        return "tier6"
    else:
        # > 24 GB: no duration/batch restrictions.
        return "unlimited"
329
+
330
+
331
def get_gpu_config(gpu_memory_gb: Optional[float] = None) -> GPUConfig:
    """
    Build a GPUConfig for the detected (or explicitly provided) GPU memory.

    Args:
        gpu_memory_gb: GPU memory in GB. Auto-detected when None.

    Returns:
        GPUConfig populated from the matching tier's settings.
    """
    if gpu_memory_gb is None:
        gpu_memory_gb = get_gpu_memory_gb()

    tier = get_gpu_tier(gpu_memory_gb)
    tier_settings = GPU_TIER_CONFIGS[tier]

    # Copy the per-tier settings into the config object field by field.
    setting_keys = (
        "max_duration_with_lm",
        "max_duration_without_lm",
        "max_batch_size_with_lm",
        "max_batch_size_without_lm",
        "init_lm_default",
        "available_lm_models",
        "lm_memory_gb",
    )
    tier_kwargs = {key: tier_settings[key] for key in setting_keys}
    return GPUConfig(tier=tier, gpu_memory_gb=gpu_memory_gb, **tier_kwargs)
358
+
359
+
360
def get_lm_model_size(model_path: str) -> str:
    """
    Extract the LM model size tag from a model path.

    Args:
        model_path: Model path string (e.g., "acestep-5Hz-lm-0.6B")

    Returns:
        Model size string: "0.6B", "1.7B", or "4B" ("0.6B" when no tag matches).
    """
    # Check tags in the same priority order as before: smallest first.
    for size_tag in ("0.6B", "1.7B", "4B"):
        if size_tag in model_path:
            return size_tag
    # No recognizable size tag: default to the smallest model assumption.
    return "0.6B"
379
+
380
+
381
def get_lm_gpu_memory_ratio(model_path: str, total_gpu_memory_gb: float) -> Tuple[float, float]:
    """
    Calculate GPU memory utilization ratio for LM model.

    Args:
        model_path: LM model path (e.g., "acestep-5Hz-lm-0.6B")
        total_gpu_memory_gb: Total GPU memory in GB

    Returns:
        Tuple of (gpu_memory_utilization_ratio, target_memory_gb)
    """
    model_size = get_lm_model_size(model_path)

    # Target memory allocation (GB) for each model size.
    target_memory = {
        "0.6B": 3.0,
        "1.7B": 8.0,
        "4B": 12.0,
    }
    target_gb = target_memory.get(model_size, 3.0)

    # Guard: CPU mode / failed detection reports <= 0 GB (see get_gpu_tier),
    # and the division below would raise ZeroDivisionError. Return the cap
    # ratio, which is the limit of min(0.9, target/total) as total -> 0+.
    if total_gpu_memory_gb <= 0:
        return 0.9, target_gb

    if total_gpu_memory_gb >= 24:
        # For large GPUs (>=24GB), don't restrict memory too much (floor 0.2).
        ratio = min(0.9, max(0.2, target_gb / total_gpu_memory_gb))
    else:
        # For smaller GPUs, strictly limit memory usage (floor 0.1).
        ratio = min(0.9, max(0.1, target_gb / total_gpu_memory_gb))

    return ratio, target_gb
412
+
413
+
414
def check_duration_limit(
    duration: float,
    gpu_config: GPUConfig,
    lm_initialized: bool
) -> Tuple[bool, str]:
    """
    Check if requested duration is within limits for current GPU configuration.

    Args:
        duration: Requested duration in seconds
        gpu_config: Current GPU configuration
        lm_initialized: Whether LM is initialized

    Returns:
        Tuple of (is_valid, warning_message) — message is "" when valid.
    """
    # The LM consumes VRAM, so the duration budget shrinks when it is loaded.
    limit = gpu_config.max_duration_with_lm if lm_initialized else gpu_config.max_duration_without_lm

    if duration <= limit:
        return True, ""

    lm_mode = "with" if lm_initialized else "without"
    warning_msg = (
        f"⚠️ Requested duration ({duration:.0f}s) exceeds the limit for your GPU "
        f"({gpu_config.gpu_memory_gb:.1f}GB). Maximum allowed: {limit}s "
        f"({lm_mode} LM). "
        f"Duration will be clamped to {limit}s."
    )
    return False, warning_msg
442
+
443
+
444
def check_batch_size_limit(
    batch_size: int,
    gpu_config: GPUConfig,
    lm_initialized: bool
) -> Tuple[bool, str]:
    """
    Check if requested batch size is within limits for current GPU configuration.

    Args:
        batch_size: Requested batch size
        gpu_config: Current GPU configuration
        lm_initialized: Whether LM is initialized

    Returns:
        Tuple of (is_valid, warning_message) — message is "" when valid.
    """
    # Batch budget is smaller when the LM occupies part of the VRAM.
    limit = gpu_config.max_batch_size_with_lm if lm_initialized else gpu_config.max_batch_size_without_lm

    if batch_size <= limit:
        return True, ""

    lm_mode = "with" if lm_initialized else "without"
    warning_msg = (
        f"⚠️ Requested batch size ({batch_size}) exceeds the limit for your GPU "
        f"({gpu_config.gpu_memory_gb:.1f}GB). Maximum allowed: {limit} "
        f"({lm_mode} LM). "
        f"Batch size will be clamped to {limit}."
    )
    return False, warning_msg
472
+
473
+
474
def is_lm_model_supported(model_path: str, gpu_config: GPUConfig) -> Tuple[bool, str]:
    """
    Check if the specified LM model is supported for current GPU configuration.

    Args:
        model_path: LM model path
        gpu_config: Current GPU configuration

    Returns:
        Tuple of (is_supported, warning_message) — message is "" when supported.
    """
    # No models listed at all means the GPU cannot host any LM.
    if not gpu_config.available_lm_models:
        return False, (
            f"⚠️ Your GPU ({gpu_config.gpu_memory_gb:.1f}GB) does not have enough memory "
            f"to run any LM model. Please disable LM initialization."
        )

    model_size = get_lm_model_size(model_path)

    # Supported when any available model name contains the requested size tag.
    if any(model_size in candidate for candidate in gpu_config.available_lm_models):
        return True, ""

    return False, (
        f"⚠️ LM model {model_path} ({model_size}) is not supported for your GPU "
        f"({gpu_config.gpu_memory_gb:.1f}GB). Available models: {', '.join(gpu_config.available_lm_models)}"
    )
502
+
503
+
504
def get_recommended_lm_model(gpu_config: GPUConfig) -> Optional[str]:
    """
    Get recommended LM model for current GPU configuration.

    Args:
        gpu_config: Current GPU configuration

    Returns:
        Recommended LM model path, or None if LM is not supported.
    """
    models = gpu_config.available_lm_models
    if not models:
        return None
    # Models are ordered smallest to largest; recommend the largest that fits.
    return models[-1]
519
+
520
+
521
def print_gpu_config_info(gpu_config: GPUConfig):
    """Log the resolved GPU configuration (tier, limits, LM defaults) for debugging."""
    # Build all lines first, then emit them with one logging call each.
    details = [
        "GPU Configuration:",
        f" - GPU Memory: {gpu_config.gpu_memory_gb:.1f} GB",
        f" - Tier: {gpu_config.tier}",
        f" - Max Duration (with LM): {gpu_config.max_duration_with_lm}s ({gpu_config.max_duration_with_lm // 60} min)",
        f" - Max Duration (without LM): {gpu_config.max_duration_without_lm}s ({gpu_config.max_duration_without_lm // 60} min)",
        f" - Max Batch Size (with LM): {gpu_config.max_batch_size_with_lm}",
        f" - Max Batch Size (without LM): {gpu_config.max_batch_size_without_lm}",
        f" - Init LM by Default: {gpu_config.init_lm_default}",
        f" - Available LM Models: {gpu_config.available_lm_models or 'None'}",
    ]
    for line in details:
        logger.info(line)
532
+
533
+
534
# Global GPU config instance; created lazily by get_global_gpu_config() so
# import does not trigger GPU detection.
_global_gpu_config: Optional[GPUConfig] = None
536
+
537
+
538
def get_global_gpu_config() -> GPUConfig:
    """Get the global GPU configuration, initializing it on first access."""
    global _global_gpu_config
    # Lazy singleton: GPU memory detection runs only once per process.
    if _global_gpu_config is None:
        _global_gpu_config = get_gpu_config()
    return _global_gpu_config
544
+
545
+
546
def set_global_gpu_config(config: GPUConfig):
    """Set the global GPU configuration (e.g., to override auto-detection in tests)."""
    global _global_gpu_config
    _global_gpu_config = config
acestep/handler.py ADDED
The diff for this file is too large to render. See raw diff
 
acestep/inference.py ADDED
@@ -0,0 +1,1310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ACE-Step Inference API Module
3
+
4
+ This module provides a standardized inference interface for music generation,
5
+ designed for third-party integration. It offers both a simplified API and
6
+ backward-compatible Gradio UI support.
7
+ """
8
+
9
+ import math
10
+ import os
11
+ import tempfile
12
+ import shutil
13
+ import subprocess
14
+ import sys
15
+ from typing import Optional, Union, List, Dict, Any, Tuple
16
+ from dataclasses import dataclass, field, asdict
17
+ from loguru import logger
18
+
19
+ from acestep.audio_utils import AudioSaver, generate_uuid_from_params, is_audio_silent
20
+ from acestep.constants import TASK_INSTRUCTIONS
21
+ from acestep.gpu_config import get_gpu_config
22
+
23
+ # HuggingFace Space environment detection
24
+ IS_HUGGINGFACE_SPACE = os.environ.get("SPACE_ID") is not None
25
+
26
def _get_spaces_gpu_decorator(duration=180):
    """
    Return the @spaces.GPU decorator when running in a HuggingFace Space,
    otherwise an identity decorator.

    Args:
        duration: GPU allocation duration in seconds passed to spaces.GPU.
    """
    # Outside a Space (or when the package is missing) decoration is a no-op.
    if not IS_HUGGINGFACE_SPACE:
        return lambda func: func
    try:
        import spaces
    except ImportError:
        logger.warning("spaces package not found, GPU decorator disabled")
        return lambda func: func
    return spaces.GPU(duration=duration)
39
+
40
+
41
@dataclass
class GenerationParams:
    """Configuration for music generation parameters.

    Attributes:
        # Text Inputs
        caption: A short text prompt describing the desired music (main prompt). < 512 characters
        lyrics: Lyrics for the music. Use "[Instrumental]" for instrumental songs. < 4096 characters
        instrumental: If True, generate instrumental music regardless of lyrics.

        # Music Metadata
        bpm: BPM (beats per minute), e.g., 120. Set to None for automatic estimation. 30 ~ 300
        keyscale: Musical key (e.g., "C Major", "Am"). Leave empty for auto-detection. A-G, #/♭, major/minor
        timesignature: Time signature (2 for '2/4', 3 for '3/4', 4 for '4/4', 6 for '6/8'). Leave empty for auto-detection.
        vocal_language: Language code for vocals, e.g., "en", "zh", "ja", or "unknown". see acestep/constants.py:VALID_LANGUAGES
        duration: Target audio length in seconds. If <0 or None, model chooses automatically. 10 ~ 600

        # Generation Parameters
        inference_steps: Number of diffusion steps (e.g., 8 for turbo, 32–100 for base model).
        guidance_scale: CFG (classifier-free guidance) strength. Higher means following the prompt more strictly. Only support for non-turbo model.
        seed: Integer seed for reproducibility. -1 means use random seed each time.

        # Advanced DiT Parameters
        use_adg: Whether to use Adaptive Dual Guidance (only works for base model).
        cfg_interval_start: Start ratio (0.0–1.0) to apply CFG.
        cfg_interval_end: End ratio (0.0–1.0) to apply CFG.
        shift: Timestep shift factor (default 1.0). When != 1.0, applies t = shift * t / (1 + (shift - 1) * t) to timesteps.

        # Task-Specific Parameters
        task_type: Type of generation task. One of: "text2music", "cover", "repaint", "lego", "extract", "complete".
        reference_audio: Path to a reference audio file for style transfer or cover tasks.
        src_audio: Path to a source audio file for audio-to-audio tasks.
        audio_codes: Audio semantic codes as a string (advanced use, for code-control generation).
        repainting_start: For repaint/lego tasks: start time in seconds for region to repaint.
        repainting_end: For repaint/lego tasks: end time in seconds for region to repaint (-1 for until end).
        audio_cover_strength: Strength of reference audio/codes influence (range 0.0–1.0). set smaller (0.2) for style transfer tasks.
        instruction: Optional task instruction prompt. If empty, auto-generated by system.

        # 5Hz Language Model Parameters for CoT reasoning
        thinking: If True, enable 5Hz Language Model "Chain-of-Thought" reasoning for semantic/music metadata and codes.
        lm_temperature: Sampling temperature for the LLM (0.0–2.0). Higher = more creative/varied results.
        lm_cfg_scale: Classifier-free guidance scale for the LLM.
        lm_top_k: LLM top-k sampling (0 = disabled).
        lm_top_p: LLM top-p nucleus sampling (1.0 = disabled).
        lm_negative_prompt: Negative prompt to use for LLM (for control).
        use_cot_metas: Whether to let LLM generate music metadata via CoT reasoning.
        use_cot_caption: Whether to let LLM rewrite or format the input caption via CoT reasoning.
        use_cot_language: Whether to let LLM detect vocal language via CoT.
    """
    # Required Inputs
    task_type: str = "text2music"
    instruction: str = "Fill the audio semantic mask based on the given conditions:"

    # Audio Uploads (filesystem paths)
    reference_audio: Optional[str] = None
    src_audio: Optional[str] = None

    # LM Codes Hints (pre-computed semantic codes; skips LM code generation when set)
    audio_codes: str = ""

    # Text Inputs
    caption: str = ""
    lyrics: str = ""
    instrumental: bool = False

    # Metadata (left at sentinel values for auto-detection)
    vocal_language: str = "unknown"
    bpm: Optional[int] = None
    keyscale: str = ""
    timesignature: str = ""
    duration: float = -1.0  # -1 means "model chooses"

    # Advanced Settings
    inference_steps: int = 8
    seed: int = -1  # -1 means random seed per call
    guidance_scale: float = 7.0
    use_adg: bool = False
    cfg_interval_start: float = 0.0
    cfg_interval_end: float = 1.0
    shift: float = 1.0
    infer_method: str = "ode"  # "ode" or "sde" - diffusion inference method
    # Custom timesteps (parsed from string like "0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0")
    # If provided, overrides inference_steps and shift
    timesteps: Optional[List[float]] = None

    repainting_start: float = 0.0
    repainting_end: float = -1  # -1 means "repaint until end"
    audio_cover_strength: float = 1.0

    # 5Hz Language Model Parameters
    thinking: bool = True
    lm_temperature: float = 0.85
    lm_cfg_scale: float = 2.0
    lm_top_k: int = 0
    lm_top_p: float = 0.9
    lm_negative_prompt: str = "NO USER INPUT"
    use_cot_metas: bool = True
    use_cot_caption: bool = True
    use_cot_lyrics: bool = False  # TODO: not used yet
    use_cot_language: bool = True
    use_constrained_decoding: bool = True

    # Overrides for CoT-produced values (take precedence over LM output when set)
    cot_bpm: Optional[int] = None
    cot_keyscale: str = ""
    cot_timesignature: str = ""
    cot_duration: Optional[float] = None
    cot_vocal_language: str = "unknown"
    cot_caption: str = ""
    cot_lyrics: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Convert config to dictionary for JSON serialization."""
        return asdict(self)
155
+
156
@dataclass
class GenerationConfig:
    """Configuration for music generation.

    Attributes:
        batch_size: Number of audio samples to generate
        allow_lm_batch: Whether to allow batch processing in LM
        use_random_seed: Whether to use random seed
        seeds: Seed(s) for batch generation. Can be:
            - None: Use random seeds (when use_random_seed=True) or params.seed (when use_random_seed=False)
            - List[int]: List of seeds, will be padded with random seeds if fewer than batch_size
            - int: Single seed value (will be converted to list and padded)
        lm_batch_chunk_size: Batch chunk size for LM processing
        constrained_decoding_debug: Whether to enable constrained decoding debug
        audio_format: Output audio format, one of "mp3", "wav", "flac". Default: "flac"
    """
    batch_size: int = 2
    allow_lm_batch: bool = False
    use_random_seed: bool = True
    seeds: Optional[List[int]] = None
    lm_batch_chunk_size: int = 8
    constrained_decoding_debug: bool = False
    audio_format: str = "flac"  # Default to FLAC for fast saving

    def to_dict(self) -> Dict[str, Any]:
        """Convert config to dictionary for JSON serialization."""
        return asdict(self)
184
+
185
@dataclass
class GenerationResult:
    """Result of music generation.

    Attributes:
        # Audio Outputs
        audios: List of audio dictionaries with paths, keys, params
        status_message: Status message from generation
        extra_outputs: Extra outputs from generation
        success: Whether generation completed successfully
        error: Error message if generation failed
    """

    # Audio Outputs — one dict per generated sample
    audios: List[Dict[str, Any]] = field(default_factory=list)
    # Generation Information
    status_message: str = ""
    extra_outputs: Dict[str, Any] = field(default_factory=dict)
    # Success Status — error is None on success
    success: bool = True
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert result to dictionary for JSON serialization."""
        return asdict(self)
210
+
211
+
212
@dataclass
class UnderstandResult:
    """Result of music understanding from audio codes.

    Attributes:
        # Metadata Fields
        caption: Generated caption describing the music
        lyrics: Generated or extracted lyrics
        bpm: Beats per minute (None if not detected)
        duration: Duration in seconds (None if not detected)
        keyscale: Musical key (e.g., "C Major")
        language: Vocal language code (e.g., "en", "zh")
        timesignature: Time signature (e.g., "4/4")

        # Status
        status_message: Status message from understanding
        success: Whether understanding completed successfully
        error: Error message if understanding failed
    """
    # Metadata Fields — empty string / None means "not detected"
    caption: str = ""
    lyrics: str = ""
    bpm: Optional[int] = None
    duration: Optional[float] = None
    keyscale: str = ""
    language: str = ""
    timesignature: str = ""

    # Status
    status_message: str = ""
    success: bool = True
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert result to dictionary for JSON serialization."""
        return asdict(self)
248
+
249
+
250
+ def _update_metadata_from_lm(
251
+ metadata: Dict[str, Any],
252
+ bpm: Optional[int],
253
+ key_scale: str,
254
+ time_signature: str,
255
+ audio_duration: Optional[float],
256
+ vocal_language: str,
257
+ caption: str,
258
+ lyrics: str,
259
+ ) -> Tuple[Optional[int], str, str, Optional[float], str, str, str]:
260
+ """Update metadata fields from LM output if not provided by user."""
261
+
262
+ if bpm is None and metadata.get('bpm'):
263
+ bpm_value = metadata.get('bpm')
264
+ if bpm_value not in ["N/A", ""]:
265
+ try:
266
+ bpm = int(bpm_value)
267
+ except (ValueError, TypeError):
268
+ pass
269
+
270
+ if not key_scale and metadata.get('keyscale'):
271
+ key_scale_value = metadata.get('keyscale', metadata.get('key_scale', ""))
272
+ if key_scale_value != "N/A":
273
+ key_scale = key_scale_value
274
+
275
+ if not time_signature and metadata.get('timesignature'):
276
+ time_signature_value = metadata.get('timesignature', metadata.get('time_signature', ""))
277
+ if time_signature_value != "N/A":
278
+ time_signature = time_signature_value
279
+
280
+ if audio_duration is None or audio_duration <= 0:
281
+ audio_duration_value = metadata.get('duration', -1)
282
+ if audio_duration_value not in ["N/A", ""]:
283
+ try:
284
+ audio_duration = float(audio_duration_value)
285
+ except (ValueError, TypeError):
286
+ pass
287
+
288
+ if not vocal_language and metadata.get('vocal_language'):
289
+ vocal_language = metadata.get('vocal_language')
290
+ if not caption and metadata.get('caption'):
291
+ caption = metadata.get('caption')
292
+ if not lyrics and metadata.get('lyrics'):
293
+ lyrics = metadata.get('lyrics')
294
+ return bpm, key_scale, time_signature, audio_duration, vocal_language, caption, lyrics
295
+
296
+
297
+ @_get_spaces_gpu_decorator(duration=180)
298
+ def generate_music(
299
+ dit_handler,
300
+ llm_handler,
301
+ params: GenerationParams,
302
+ config: GenerationConfig,
303
+ save_dir: Optional[str] = None,
304
+ progress=None,
305
+ ) -> GenerationResult:
306
+ """Generate music using ACE-Step model with optional LM reasoning.
307
+
308
+ Args:
309
+ dit_handler: Initialized DiT model handler (AceStepHandler instance)
310
+ llm_handler: Initialized LLM handler (LLMHandler instance)
311
+ params: Generation parameters (GenerationParams instance)
312
+ config: Generation configuration (GenerationConfig instance)
313
+
314
+ Returns:
315
+ GenerationResult with generated audio files and metadata
316
+ """
317
+ try:
318
+ # Phase 1: LM-based metadata and code generation (if enabled)
319
+ audio_code_string_to_use = params.audio_codes
320
+ lm_generated_metadata = None
321
+ lm_generated_audio_codes_list = []
322
+ lm_total_time_costs = {
323
+ "phase1_time": 0.0,
324
+ "phase2_time": 0.0,
325
+ "total_time": 0.0,
326
+ }
327
+
328
+ # Extract mutable copies of metadata (will be updated by LM if needed)
329
+ bpm = params.bpm
330
+ key_scale = params.keyscale
331
+ time_signature = params.timesignature
332
+ audio_duration = params.duration
333
+ dit_input_caption = params.caption
334
+ dit_input_vocal_language = params.vocal_language
335
+ dit_input_lyrics = params.lyrics
336
+ # Determine if we need to generate audio codes
337
+ # If user has provided audio_codes, we don't need to generate them
338
+ # Otherwise, check if we need audio codes (lm_dit mode) or just metas (dit mode)
339
+ user_provided_audio_codes = bool(params.audio_codes and str(params.audio_codes).strip())
340
+
341
+ # Safety: cover task without any source audio or codes produces silence.
342
+ if params.task_type == "cover":
343
+ no_src_audio = not (params.reference_audio or params.src_audio)
344
+ if no_src_audio and not user_provided_audio_codes:
345
+ logger.warning("Cover task requested without source audio or audio codes. Falling back to text2music.")
346
+ params.task_type = "text2music"
347
+ if params.instruction == TASK_INSTRUCTIONS.get("cover"):
348
+ params.instruction = TASK_INSTRUCTIONS.get("text2music", params.instruction)
349
+
350
+ # Determine infer_type: use "llm_dit" if we need audio codes, "dit" if only metas needed
351
+ # For now, we use "llm_dit" if batch mode or if user hasn't provided codes
352
+ # Use "dit" if user has provided codes (only need metas) or if explicitly only need metas
353
+ # Note: This logic can be refined based on specific requirements
354
+ need_audio_codes = not user_provided_audio_codes
355
+
356
+ # Determine if we should use chunk-based LM generation (always use chunks for consistency)
357
+ # Determine actual batch size for chunk processing
358
+ actual_batch_size = config.batch_size if config.batch_size is not None else 1
359
+
360
+ # Prepare seeds for batch generation
361
+ # Use config.seed if provided, otherwise fallback to params.seed
362
+ # Convert config.seed (None, int, or List[int]) to format that prepare_seeds accepts
363
+ seed_for_generation = ""
364
+ # Original code (commented out because it crashes on int seeds):
365
+ # if config.seeds is not None and len(config.seeds) > 0:
366
+ # if isinstance(config.seeds, list):
367
+ # # Convert List[int] to comma-separated string
368
+ # seed_for_generation = ",".join(str(s) for s in config.seeds)
369
+
370
+ if config.seeds is not None:
371
+ if isinstance(config.seeds, list) and len(config.seeds) > 0:
372
+ # Convert List[int] to comma-separated string
373
+ seed_for_generation = ",".join(str(s) for s in config.seeds)
374
+ elif isinstance(config.seeds, int):
375
+ # Fix: Explicitly handle single integer seeds by converting to string.
376
+ # Previously, this would crash because 'len()' was called on an int.
377
+ seed_for_generation = str(config.seeds)
378
+
379
+ # Use dit_handler.prepare_seeds to handle seed list generation and padding
380
+ # This will handle all the logic: padding with random seeds if needed, etc.
381
+ actual_seed_list, _ = dit_handler.prepare_seeds(actual_batch_size, seed_for_generation, config.use_random_seed)
382
+
383
+ # LM-based Chain-of-Thought reasoning
384
+ # Skip LM for cover/repaint tasks - these tasks use reference/src audio directly
385
+ # and don't need LM to generate audio codes
386
+ skip_lm_tasks = {"cover", "repaint"}
387
+
388
+ # Determine if we should use LLM
389
+ # LLM is needed for:
390
+ # 1. thinking=True: generate audio codes via LM
391
+ # 2. use_cot_caption=True: enhance/generate caption via CoT
392
+ # 3. use_cot_language=True: detect vocal language via CoT
393
+ # 4. use_cot_metas=True: fill missing metadata via CoT
394
+ need_lm_for_cot = params.use_cot_caption or params.use_cot_language or params.use_cot_metas
395
+ use_lm = (params.thinking or need_lm_for_cot) and llm_handler is not None and llm_handler.llm_initialized and params.task_type not in skip_lm_tasks
396
+ lm_status = []
397
+
398
+ if params.task_type in skip_lm_tasks:
399
+ logger.info(f"Skipping LM for task_type='{params.task_type}' - using DiT directly")
400
+
401
+ logger.info(f"[generate_music] LLM usage decision: thinking={params.thinking}, "
402
+ f"use_cot_caption={params.use_cot_caption}, use_cot_language={params.use_cot_language}, "
403
+ f"use_cot_metas={params.use_cot_metas}, need_lm_for_cot={need_lm_for_cot}, "
404
+ f"llm_initialized={llm_handler.llm_initialized if llm_handler else False}, use_lm={use_lm}")
405
+
406
+ def _infer_audio_duration_seconds(audio_path: str) -> Optional[float]:
407
+ """Best-effort duration inference for common audio formats."""
408
+ if not audio_path:
409
+ return None
410
+ # Try torchaudio (supports more formats when ffmpeg backend is available)
411
+ try:
412
+ import torchaudio
413
+ info = torchaudio.info(audio_path)
414
+ if info and info.num_frames and info.sample_rate:
415
+ return float(info.num_frames) / float(info.sample_rate)
416
+ except Exception:
417
+ pass
418
+ # Try soundfile (fast for wav/flac)
419
+ try:
420
+ import soundfile as sf
421
+ info = sf.info(audio_path)
422
+ if info and info.frames and info.samplerate:
423
+ return float(info.frames) / float(info.samplerate)
424
+ except Exception:
425
+ pass
426
+ # macOS fallback: use afinfo for m4a/aac
427
+ if sys.platform == "darwin" and shutil.which("afinfo"):
428
+ try:
429
+ result = subprocess.run(
430
+ ["afinfo", audio_path],
431
+ check=False,
432
+ capture_output=True,
433
+ text=True,
434
+ )
435
+ if result.stdout:
436
+ for line in result.stdout.splitlines():
437
+ if "duration:" in line:
438
+ # Example: "duration: 183.165s"
439
+ parts = line.strip().split()
440
+ for p in parts:
441
+ if p.endswith("s"):
442
+ try:
443
+ return float(p.rstrip("s"))
444
+ except ValueError:
445
+ continue
446
+ except Exception:
447
+ pass
448
+ return None
449
+
450
+ # Clamp duration and batch size to GPU limits (applies to non-Gradio callers too)
451
+ try:
452
+ # If duration not provided, try to infer from source audio to enable safe clamping.
453
+ if (audio_duration is None or float(audio_duration) <= 0) and (params.src_audio or params.reference_audio):
454
+ audio_path = params.src_audio or params.reference_audio
455
+ try:
456
+ inferred = _infer_audio_duration_seconds(audio_path)
457
+ if inferred and inferred > 0:
458
+ audio_duration = inferred
459
+ params.duration = inferred
460
+ logger.info(f"[generate_music] Inferred duration from audio file: {inferred:.2f}s")
461
+ except Exception as e:
462
+ logger.warning(f"[generate_music] Failed to infer duration from audio file: {e}")
463
+
464
+ gpu_config = get_gpu_config()
465
+ max_duration = gpu_config.max_duration_with_lm if use_lm else gpu_config.max_duration_without_lm
466
+ if audio_duration is not None and float(audio_duration) > 0 and float(audio_duration) > max_duration:
467
+ logger.warning(f"[generate_music] Duration {audio_duration}s exceeds GPU limit {max_duration}s. Clamping.")
468
+ audio_duration = float(max_duration)
469
+ params.duration = float(max_duration)
470
+
471
+ max_batch = gpu_config.max_batch_size_with_lm if use_lm else gpu_config.max_batch_size_without_lm
472
+ if config.batch_size is not None and config.batch_size > max_batch:
473
+ logger.warning(f"[generate_music] Batch size {config.batch_size} exceeds GPU limit {max_batch}. Clamping.")
474
+ config.batch_size = max_batch
475
+
476
+ # Extra safety for MPS: large durations can OOM with batch > 1
477
+ if (
478
+ hasattr(dit_handler, "device")
479
+ and dit_handler.device == "mps"
480
+ and audio_duration is not None
481
+ and float(audio_duration) > 180
482
+ and config.batch_size is not None
483
+ and config.batch_size > 1
484
+ ):
485
+ logger.warning("[generate_music] MPS with long duration detected; reducing batch size to 1 to avoid OOM.")
486
+ config.batch_size = 1
487
+ except Exception as e:
488
+ logger.warning(f"[generate_music] Failed to clamp duration/batch to GPU limits: {e}")
489
+
490
+ if use_lm:
491
+ # Convert sampling parameters - handle None values safely
492
+ top_k_value = None if not params.lm_top_k or params.lm_top_k == 0 else int(params.lm_top_k)
493
+ top_p_value = None if not params.lm_top_p or params.lm_top_p >= 1.0 else params.lm_top_p
494
+
495
+ # Build user_metadata from user-provided values
496
+ user_metadata = {}
497
+ if bpm is not None:
498
+ try:
499
+ bpm_value = float(bpm)
500
+ if bpm_value > 0:
501
+ user_metadata['bpm'] = int(bpm_value)
502
+ except (ValueError, TypeError):
503
+ pass
504
+
505
+ if key_scale and key_scale.strip():
506
+ key_scale_clean = key_scale.strip()
507
+ if key_scale_clean.lower() not in ["n/a", ""]:
508
+ user_metadata['keyscale'] = key_scale_clean
509
+
510
+ if time_signature and time_signature.strip():
511
+ time_sig_clean = time_signature.strip()
512
+ if time_sig_clean.lower() not in ["n/a", ""]:
513
+ user_metadata['timesignature'] = time_sig_clean
514
+
515
+ if audio_duration is not None:
516
+ try:
517
+ duration_value = float(audio_duration)
518
+ if duration_value > 0:
519
+ user_metadata['duration'] = int(duration_value)
520
+ except (ValueError, TypeError):
521
+ pass
522
+
523
+ user_metadata_to_pass = user_metadata if user_metadata else None
524
+
525
+ # Determine infer_type based on whether we need audio codes
526
+ # - "llm_dit": generates both metas and audio codes (two-phase internally)
527
+ # - "dit": generates only metas (single phase)
528
+ infer_type = "llm_dit" if need_audio_codes and params.thinking else "dit"
529
+
530
+ # Use chunk size from config, or default to batch_size if not set
531
+ max_inference_batch_size = int(config.lm_batch_chunk_size) if config.lm_batch_chunk_size > 0 else actual_batch_size
532
+ num_chunks = math.ceil(actual_batch_size / max_inference_batch_size)
533
+
534
+ all_metadata_list = []
535
+ all_audio_codes_list = []
536
+
537
+ for chunk_idx in range(num_chunks):
538
+ chunk_start = chunk_idx * max_inference_batch_size
539
+ chunk_end = min(chunk_start + max_inference_batch_size, actual_batch_size)
540
+ chunk_size = chunk_end - chunk_start
541
+ chunk_seeds = actual_seed_list[chunk_start:chunk_end] if chunk_start < len(actual_seed_list) else None
542
+
543
+ logger.info(f"LM chunk {chunk_idx+1}/{num_chunks} (infer_type={infer_type}) "
544
+ f"(size: {chunk_size}, seeds: {chunk_seeds})")
545
+
546
+ # Use the determined infer_type
547
+ # - "llm_dit" will internally run two phases (metas + codes)
548
+ # - "dit" will only run phase 1 (metas only)
549
+ result = llm_handler.generate_with_stop_condition(
550
+ caption=params.caption or "",
551
+ lyrics=params.lyrics or "",
552
+ infer_type=infer_type,
553
+ temperature=params.lm_temperature,
554
+ cfg_scale=params.lm_cfg_scale,
555
+ negative_prompt=params.lm_negative_prompt,
556
+ top_k=top_k_value,
557
+ top_p=top_p_value,
558
+ target_duration=audio_duration, # Pass duration to limit audio codes generation
559
+ user_metadata=user_metadata_to_pass,
560
+ use_cot_caption=params.use_cot_caption,
561
+ use_cot_language=params.use_cot_language,
562
+ use_cot_metas=params.use_cot_metas,
563
+ use_constrained_decoding=params.use_constrained_decoding,
564
+ constrained_decoding_debug=config.constrained_decoding_debug,
565
+ batch_size=chunk_size,
566
+ seeds=chunk_seeds,
567
+ progress=progress,
568
+ )
569
+
570
+ # Check if LM generation failed
571
+ if not result.get("success", False):
572
+ error_msg = result.get("error", "Unknown LM error")
573
+ lm_status.append(f"❌ LM Error: {error_msg}")
574
+ # Return early with error
575
+ return GenerationResult(
576
+ audios=[],
577
+ status_message=f"❌ LM generation failed: {error_msg}",
578
+ extra_outputs={},
579
+ success=False,
580
+ error=error_msg,
581
+ )
582
+
583
+ # Extract metadata and audio_codes from result dict
584
+ if chunk_size > 1:
585
+ metadata_list = result.get("metadata", [])
586
+ audio_codes_list = result.get("audio_codes", [])
587
+ all_metadata_list.extend(metadata_list)
588
+ all_audio_codes_list.extend(audio_codes_list)
589
+ else:
590
+ metadata = result.get("metadata", {})
591
+ audio_codes = result.get("audio_codes", "")
592
+ all_metadata_list.append(metadata)
593
+ all_audio_codes_list.append(audio_codes)
594
+
595
+ # Collect time costs from LM extra_outputs
596
+ lm_extra = result.get("extra_outputs", {})
597
+ lm_chunk_time_costs = lm_extra.get("time_costs", {})
598
+ if lm_chunk_time_costs:
599
+ # Accumulate time costs from all chunks
600
+ for key in ["phase1_time", "phase2_time", "total_time"]:
601
+ if key in lm_chunk_time_costs:
602
+ lm_total_time_costs[key] += lm_chunk_time_costs[key]
603
+
604
+ time_str = ", ".join([f"{k}: {v:.2f}s" for k, v in lm_chunk_time_costs.items()])
605
+ lm_status.append(f"✅ LM chunk {chunk_idx+1}: {time_str}")
606
+
607
+ lm_generated_metadata = all_metadata_list[0] if all_metadata_list else None
608
+ lm_generated_audio_codes_list = all_audio_codes_list
609
+
610
+ # Set audio_code_string_to_use based on infer_type
611
+ if infer_type == "llm_dit":
612
+ # If batch mode, use list; otherwise use single string
613
+ if actual_batch_size > 1:
614
+ audio_code_string_to_use = all_audio_codes_list
615
+ else:
616
+ audio_code_string_to_use = all_audio_codes_list[0] if all_audio_codes_list else ""
617
+ else:
618
+ # For "dit" mode, keep user-provided codes or empty
619
+ audio_code_string_to_use = params.audio_codes
620
+
621
+ # Update metadata from LM if not provided by user
622
+ if lm_generated_metadata:
623
+ bpm, key_scale, time_signature, audio_duration, vocal_language, caption, lyrics = _update_metadata_from_lm(
624
+ metadata=lm_generated_metadata,
625
+ bpm=bpm,
626
+ key_scale=key_scale,
627
+ time_signature=time_signature,
628
+ audio_duration=audio_duration,
629
+ vocal_language=dit_input_vocal_language,
630
+ caption=dit_input_caption,
631
+ lyrics=dit_input_lyrics)
632
+ if not params.bpm:
633
+ params.cot_bpm = bpm
634
+ if not params.keyscale:
635
+ params.cot_keyscale = key_scale
636
+ if not params.timesignature:
637
+ params.cot_timesignature = time_signature
638
+ if not params.duration:
639
+ params.cot_duration = audio_duration
640
+ if not params.vocal_language:
641
+ params.cot_vocal_language = vocal_language
642
+ if not params.caption:
643
+ params.cot_caption = caption
644
+ if not params.lyrics:
645
+ params.cot_lyrics = lyrics
646
+ dit_input_lyrics = lyrics
647
+
648
+ # set cot caption and language if needed
649
+ if params.use_cot_caption:
650
+ dit_input_caption = lm_generated_metadata.get("caption", dit_input_caption)
651
+ if params.use_cot_language:
652
+ dit_input_vocal_language = lm_generated_metadata.get("vocal_language", dit_input_vocal_language)
653
+
654
+ # Phase 2: DiT music generation
655
+ # Use seed_for_generation (from config.seed or params.seed) instead of params.seed for actual generation
656
+ result = dit_handler.generate_music(
657
+ captions=dit_input_caption,
658
+ lyrics=dit_input_lyrics,
659
+ bpm=bpm,
660
+ key_scale=key_scale,
661
+ time_signature=time_signature,
662
+ vocal_language=dit_input_vocal_language,
663
+ inference_steps=params.inference_steps,
664
+ guidance_scale=params.guidance_scale,
665
+ use_random_seed=config.use_random_seed,
666
+ seed=seed_for_generation, # Use config.seed (or params.seed fallback) instead of params.seed directly
667
+ reference_audio=params.reference_audio,
668
+ audio_duration=audio_duration,
669
+ batch_size=config.batch_size if config.batch_size is not None else 1,
670
+ src_audio=params.src_audio,
671
+ audio_code_string=audio_code_string_to_use,
672
+ repainting_start=params.repainting_start,
673
+ repainting_end=params.repainting_end,
674
+ instruction=params.instruction,
675
+ audio_cover_strength=params.audio_cover_strength,
676
+ task_type=params.task_type,
677
+ use_adg=params.use_adg,
678
+ cfg_interval_start=params.cfg_interval_start,
679
+ cfg_interval_end=params.cfg_interval_end,
680
+ shift=params.shift,
681
+ infer_method=params.infer_method,
682
+ timesteps=params.timesteps,
683
+ progress=progress,
684
+ )
685
+
686
+ # Check if generation failed
687
+ if not result.get("success", False):
688
+ return GenerationResult(
689
+ audios=[],
690
+ status_message=result.get("status_message", ""),
691
+ extra_outputs={},
692
+ success=False,
693
+ error=result.get("error"),
694
+ )
695
+
696
+ # Extract results from dit_handler.generate_music dict
697
+ dit_audios = result.get("audios", [])
698
+ status_message = result.get("status_message", "")
699
+ dit_extra_outputs = result.get("extra_outputs", {})
700
+
701
+ # Use the seed list already prepared above (from config.seed or params.seed fallback)
702
+ # actual_seed_list was computed earlier using dit_handler.prepare_seeds
703
+ seed_list = actual_seed_list
704
+
705
+ # Get base params dictionary
706
+ base_params_dict = params.to_dict()
707
+
708
+ # Save audio files using AudioSaver (format from config)
709
+ audio_format = config.audio_format if config.audio_format else "flac"
710
+ audio_saver = AudioSaver(default_format=audio_format)
711
+
712
+ # Use handler's temp_dir for saving files
713
+ if save_dir is not None:
714
+ os.makedirs(save_dir, exist_ok=True)
715
+
716
+ # Build audios list for GenerationResult with params and save files
717
+ # Audio saving and UUID generation handled here, outside of handler
718
+ audios = []
719
+ silent_warnings = []
720
+ for idx, dit_audio in enumerate(dit_audios):
721
+ # Create a copy of params dict for this audio
722
+ audio_params = base_params_dict.copy()
723
+
724
+ # Update audio-specific values
725
+ audio_params["seed"] = seed_list[idx] if idx < len(seed_list) else None
726
+
727
+ # Add audio codes if batch mode
728
+ if lm_generated_audio_codes_list and idx < len(lm_generated_audio_codes_list):
729
+ audio_params["audio_codes"] = lm_generated_audio_codes_list[idx]
730
+
731
+ # Get audio tensor and metadata
732
+ audio_tensor = dit_audio.get("tensor")
733
+ sample_rate = dit_audio.get("sample_rate", 48000)
734
+
735
+ # Generate UUID for this audio (moved from handler)
736
+ batch_seed = seed_list[idx] if idx < len(seed_list) else seed_list[0] if seed_list else -1
737
+ audio_code_str = lm_generated_audio_codes_list[idx] if (
738
+ lm_generated_audio_codes_list and idx < len(lm_generated_audio_codes_list)) else audio_code_string_to_use
739
+ if isinstance(audio_code_str, list):
740
+ audio_code_str = audio_code_str[idx] if idx < len(audio_code_str) else ""
741
+
742
+ audio_key = generate_uuid_from_params(audio_params)
743
+
744
+ silent_check = False
745
+ if audio_tensor is not None:
746
+ silent_check, rms_val, peak_val = is_audio_silent(audio_tensor, channels_first=True)
747
+ if silent_check:
748
+ logger.warning(
749
+ f"[generate_music] Silent output detected (idx={idx}, RMS={rms_val:.2e}, peak={peak_val:.2e}). "
750
+ "Likely cause: LLM backend returned empty conditioning, or incompatible torch/triton/flash-attn. "
751
+ "Suggest running with --backend pt."
752
+ )
753
+ silent_warnings.append(
754
+ f"Output {idx + 1}: silent or near-silent (RMS≈{rms_val:.2e}). "
755
+ "Likely causes: LLM backend failure, incompatible torch/triton/flash-attn, or CPU/fallback path. "
756
+ "Try running with --backend pt."
757
+ )
758
+
759
+ audio_path = None
760
+ if audio_tensor is not None and save_dir is not None and not silent_check:
761
+ try:
762
+ audio_file = os.path.join(save_dir, f"{audio_key}.{audio_format}")
763
+ audio_path = audio_saver.save_audio(audio_tensor,
764
+ audio_file,
765
+ sample_rate=sample_rate,
766
+ format=audio_format,
767
+ channels_first=True)
768
+ except Exception as e:
769
+ logger.error(f"[generate_music] Failed to save audio file: {e}")
770
+ audio_path = ""
771
+
772
+ audio_dict = {
773
+ "path": audio_path or "",
774
+ "tensor": audio_tensor,
775
+ "key": audio_key,
776
+ "sample_rate": sample_rate,
777
+ "params": audio_params,
778
+ "silent": silent_check,
779
+ }
780
+
781
+ audios.append(audio_dict)
782
+
783
+ # Merge extra_outputs: include dit_extra_outputs (latents, masks) and add LM metadata
784
+ extra_outputs = dit_extra_outputs.copy()
785
+ extra_outputs["lm_metadata"] = lm_generated_metadata
786
+
787
+ # Merge time_costs from both LM and DiT into a unified dictionary
788
+ unified_time_costs = {}
789
+
790
+ # Add LM time costs (if LM was used)
791
+ if use_lm and lm_total_time_costs:
792
+ for key, value in lm_total_time_costs.items():
793
+ unified_time_costs[f"lm_{key}"] = value
794
+
795
+ # Add DiT time costs (if available)
796
+ dit_time_costs = dit_extra_outputs.get("time_costs", {})
797
+ if dit_time_costs:
798
+ for key, value in dit_time_costs.items():
799
+ unified_time_costs[f"dit_{key}"] = value
800
+
801
+ # Calculate total pipeline time
802
+ if unified_time_costs:
803
+ lm_total = unified_time_costs.get("lm_total_time", 0.0)
804
+ dit_total = unified_time_costs.get("dit_total_time_cost", 0.0)
805
+ unified_time_costs["pipeline_total_time"] = lm_total + dit_total
806
+
807
+ # Update extra_outputs with unified time_costs
808
+ extra_outputs["time_costs"] = unified_time_costs
809
+
810
+ if lm_status:
811
+ status_message = "\n".join(lm_status) + "\n" + status_message
812
+ else:
813
+ status_message = status_message
814
+ if silent_warnings:
815
+ status_message = "⚠️ Silent output detected:\n" + "\n".join(silent_warnings) + "\n\nSuggested fix: try running with --backend pt\n\n" + (status_message or "")
816
+ # Create and return GenerationResult
817
+ return GenerationResult(
818
+ audios=audios,
819
+ status_message=status_message,
820
+ extra_outputs=extra_outputs,
821
+ success=True,
822
+ error=None,
823
+ )
824
+
825
+ except Exception as e:
826
+ logger.exception("Music generation failed")
827
+ return GenerationResult(
828
+ audios=[],
829
+ status_message=f"Error: {str(e)}",
830
+ extra_outputs={},
831
+ success=False,
832
+ error=str(e),
833
+ )
834
+
835
+
836
def understand_music(
    llm_handler,
    audio_codes: str,
    temperature: float = 0.85,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    repetition_penalty: float = 1.0,
    use_constrained_decoding: bool = True,
    constrained_decoding_debug: bool = False,
) -> UnderstandResult:
    """Analyze audio semantic codes with the 5Hz Language Model.

    Given a string of audio code tokens, asks the LM to describe the music:
    caption, lyrics, BPM, duration, key scale, vocal language, and time
    signature. An empty ``audio_codes`` (or "NO USER INPUT") makes the LM
    invent a sample example instead of analyzing existing codes.

    Note: ``cfg_scale`` and ``negative_prompt`` are not supported in
    understand mode.

    Args:
        llm_handler: Initialized LLM handler (LLMHandler instance).
        audio_codes: Audio code token string, e.g.
            ``"<|audio_code_123|><|audio_code_456|>..."``. Empty string or
            "NO USER INPUT" requests a synthetic example.
        temperature: Sampling temperature (0.0-2.0); higher is more creative.
        top_k: Top-K sampling (None or 0 = disabled).
        top_p: Top-P (nucleus) sampling (None or 1.0 = disabled).
        repetition_penalty: Repetition penalty (1.0 = no penalty).
        use_constrained_decoding: Enable FSM-based constrained decoding.
        constrained_decoding_debug: Enable constrained-decoding debug logs.

    Returns:
        UnderstandResult with parsed metadata fields and status.

    Example:
        >>> result = understand_music(llm_handler, audio_codes="<|audio_code_123|>...")
        >>> if result.success:
        ...     print(f"Caption: {result.caption}")
    """
    # The LM must be loaded before any understanding call.
    if not llm_handler.llm_initialized:
        return UnderstandResult(
            status_message="5Hz LM not initialized. Please initialize it first.",
            success=False,
            error="LLM not initialized",
        )

    # Blank/whitespace-only input is rewritten to the sentinel that asks
    # the LM to generate a sample example instead of analyzing codes.
    if not (audio_codes and audio_codes.strip()):
        audio_codes = "NO USER INPUT"

    try:
        metadata, status = llm_handler.understand_audio_from_codes(
            audio_codes=audio_codes,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            use_constrained_decoding=use_constrained_decoding,
            constrained_decoding_debug=constrained_decoding_debug,
        )

        # An empty metadata dict signals an LM-side failure.
        if not metadata:
            return UnderstandResult(
                status_message=status or "Failed to understand audio codes",
                success=False,
                error=status or "Empty metadata returned",
            )

        def _scrub(value):
            # The LM uses the literal 'N/A' for unknown fields; map it to ''.
            return '' if value == 'N/A' else value

        # BPM arrives as a string/number; keep None when missing or invalid.
        bpm = None
        raw_bpm = metadata.get('bpm')
        if raw_bpm is not None and raw_bpm != 'N/A' and raw_bpm != '':
            try:
                bpm = int(raw_bpm)
            except (ValueError, TypeError):
                bpm = None

        # Duration follows the same convention but converts to float.
        duration = None
        raw_duration = metadata.get('duration')
        if raw_duration is not None and raw_duration != 'N/A' and raw_duration != '':
            try:
                duration = float(raw_duration)
            except (ValueError, TypeError):
                duration = None

        return UnderstandResult(
            caption=metadata.get('caption', ''),
            lyrics=metadata.get('lyrics', ''),
            bpm=bpm,
            duration=duration,
            keyscale=_scrub(metadata.get('keyscale', '')),
            language=_scrub(metadata.get('language', metadata.get('vocal_language', ''))),
            timesignature=_scrub(metadata.get('timesignature', '')),
            status_message=status,
            success=True,
            error=None,
        )

    except Exception as e:
        logger.exception("Music understanding failed")
        return UnderstandResult(
            status_message=f"Error: {str(e)}",
            success=False,
            error=str(e),
        )
962
+
963
+
964
@dataclass
class CreateSampleResult:
    """Outcome of "Simple Mode" / "Inspiration Mode" sample creation.

    Produced when a user describes the music they want in natural language
    and the LLM turns that description into a complete sample: a detailed
    caption, lyrics, and structured metadata.

    Attributes:
        caption: Generated detailed music description/caption.
        lyrics: Generated lyrics ("[Instrumental]" for instrumental music).
        bpm: Beats per minute, or None when not generated.
        duration: Length in seconds, or None when not generated.
        keyscale: Musical key, e.g. "C Major".
        language: Vocal language code such as "en" or "zh".
        timesignature: Time signature, e.g. "4".
        instrumental: True when the piece has no vocals.
        status_message: Human-readable status from sample creation.
        success: True when creation finished without error.
        error: Error description when creation failed, else None.
    """
    # Metadata fields (order matters: positional construction is public API)
    caption: str = ""
    lyrics: str = ""
    bpm: Optional[int] = None
    duration: Optional[float] = None
    keyscale: str = ""
    language: str = ""
    timesignature: str = ""
    instrumental: bool = False

    # Status fields
    status_message: str = ""
    success: bool = True
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the result to a plain, JSON-friendly dictionary."""
        return asdict(self)
1006
+
1007
+
1008
def create_sample(
    llm_handler,
    query: str,
    instrumental: bool = False,
    vocal_language: Optional[str] = None,
    temperature: float = 0.85,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    repetition_penalty: float = 1.0,
    use_constrained_decoding: bool = True,
    constrained_decoding_debug: bool = False,
) -> CreateSampleResult:
    """Create a full music sample from a natural-language query ("Simple Mode").

    The 5Hz Language Model expands the user's plain-language description into
    a complete sample: a detailed caption, lyrics (unless instrumental), and
    metadata (BPM, duration, key, language, time signature).

    Note: ``cfg_scale`` and ``negative_prompt`` are not supported in
    create_sample mode.

    Args:
        llm_handler: Initialized LLM handler (LLMHandler instance).
        query: Natural-language music description
            (e.g. "a soft Bengali love song").
        instrumental: Generate instrumental music (no vocals).
        vocal_language: Allowed vocal language for constrained decoding
            (e.g. "en", "zh"). None or "unknown" applies no constraint.
        temperature: Sampling temperature (0.0-2.0); higher is more creative.
        top_k: Top-K sampling (None or 0 = disabled).
        top_p: Top-P (nucleus) sampling (None or 1.0 = disabled).
        repetition_penalty: Repetition penalty (1.0 = no penalty).
        use_constrained_decoding: Enable FSM-based constrained decoding.
        constrained_decoding_debug: Enable constrained-decoding debug logs.

    Returns:
        CreateSampleResult with generated sample fields and status.

    Example:
        >>> result = create_sample(llm_handler, "a soft Bengali love song", vocal_language="bn")
        >>> if result.success:
        ...     print(f"Caption: {result.caption}")
    """
    # The LM must be loaded before any generation call.
    if not llm_handler.llm_initialized:
        return CreateSampleResult(
            status_message="5Hz LM not initialized. Please initialize it first.",
            success=False,
            error="LLM not initialized",
        )

    try:
        metadata, status = llm_handler.create_sample_from_query(
            query=query,
            instrumental=instrumental,
            vocal_language=vocal_language,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            use_constrained_decoding=use_constrained_decoding,
            constrained_decoding_debug=constrained_decoding_debug,
        )

        # An empty metadata dict signals an LM-side failure.
        if not metadata:
            return CreateSampleResult(
                status_message=status or "Failed to create sample",
                success=False,
                error=status or "Empty metadata returned",
            )

        def _scrub(value):
            # The LM uses the literal 'N/A' for unknown fields; map it to ''.
            return '' if value == 'N/A' else value

        # BPM arrives as a string/number; keep None when missing or invalid.
        bpm = None
        raw_bpm = metadata.get('bpm')
        if raw_bpm is not None and raw_bpm != 'N/A' and raw_bpm != '':
            try:
                bpm = int(raw_bpm)
            except (ValueError, TypeError):
                bpm = None

        # Duration follows the same convention but converts to float.
        duration = None
        raw_duration = metadata.get('duration')
        if raw_duration is not None and raw_duration != 'N/A' and raw_duration != '':
            try:
                duration = float(raw_duration)
            except (ValueError, TypeError):
                duration = None

        return CreateSampleResult(
            caption=metadata.get('caption', ''),
            lyrics=metadata.get('lyrics', ''),
            bpm=bpm,
            duration=duration,
            keyscale=_scrub(metadata.get('keyscale', '')),
            language=_scrub(metadata.get('language', metadata.get('vocal_language', ''))),
            timesignature=_scrub(metadata.get('timesignature', '')),
            # Fall back to the caller's flag when the LM omits the field.
            instrumental=metadata.get('instrumental', instrumental),
            status_message=status,
            success=True,
            error=None,
        )

    except Exception as e:
        logger.exception("Sample creation failed")
        return CreateSampleResult(
            status_message=f"Error: {str(e)}",
            success=False,
            error=str(e),
        )
1139
+
1140
+
1141
@dataclass
class FormatSampleResult:
    """Outcome of formatting user-provided caption and lyrics.

    Produced by the "Format" feature: the LLM takes raw user caption and
    lyrics and emits structured music metadata plus an enhanced description.

    Attributes:
        caption: Enhanced/formatted music description/caption.
        lyrics: Formatted lyrics (may equal the input or be restructured).
        bpm: Beats per minute, or None when not detected.
        duration: Length in seconds, or None when not detected.
        keyscale: Musical key, e.g. "C Major".
        language: Vocal language code such as "en" or "zh".
        timesignature: Time signature, e.g. "4".
        status_message: Human-readable status from formatting.
        success: True when formatting finished without error.
        error: Error description when formatting failed, else None.
    """
    # Metadata fields (order matters: positional construction is public API)
    caption: str = ""
    lyrics: str = ""
    bpm: Optional[int] = None
    duration: Optional[float] = None
    keyscale: str = ""
    language: str = ""
    timesignature: str = ""

    # Status fields
    status_message: str = ""
    success: bool = True
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the result to a plain, JSON-friendly dictionary."""
        return asdict(self)
1178
+ """Convert result to dictionary for JSON serialization."""
1179
+ return asdict(self)
1180
+
1181
+
1182
def format_sample(
    llm_handler,
    caption: str,
    lyrics: str,
    user_metadata: Optional[Dict[str, Any]] = None,
    temperature: float = 0.85,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    repetition_penalty: float = 1.0,
    use_constrained_decoding: bool = True,
    constrained_decoding_debug: bool = False,
) -> FormatSampleResult:
    """Format user caption and lyrics into structured metadata via the 5Hz LM.

    Takes raw user input and produces an enhanced caption plus BPM, duration,
    key, language, and time signature. When ``user_metadata`` is supplied,
    its values constrain decoding so the output honors user-specified fields.

    Note: ``cfg_scale`` and ``negative_prompt`` are not supported in
    format mode.

    Args:
        llm_handler: Initialized LLM handler (LLMHandler instance).
        caption: User's caption/description (e.g. "Latin pop, reggaeton").
        lyrics: User's lyrics with structure tags.
        user_metadata: Optional dict constraining decoding. Supported keys:
            bpm, duration, keyscale, timesignature, language.
        temperature: Sampling temperature (0.0-2.0); higher is more creative.
        top_k: Top-K sampling (None or 0 = disabled).
        top_p: Top-P (nucleus) sampling (None or 1.0 = disabled).
        repetition_penalty: Repetition penalty (1.0 = no penalty).
        use_constrained_decoding: Enable FSM-based constrained decoding.
        constrained_decoding_debug: Enable constrained-decoding debug logs.

    Returns:
        FormatSampleResult with formatted metadata fields and status.

    Example:
        >>> result = format_sample(llm_handler, "Latin pop, reggaeton", "[Verse 1]\\nHola mundo...")
        >>> if result.success:
        ...     print(f"Caption: {result.caption}")
    """
    # The LM must be loaded before any formatting call.
    if not llm_handler.llm_initialized:
        return FormatSampleResult(
            status_message="5Hz LM not initialized. Please initialize it first.",
            success=False,
            error="LLM not initialized",
        )

    try:
        metadata, status = llm_handler.format_sample_from_input(
            caption=caption,
            lyrics=lyrics,
            user_metadata=user_metadata,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            use_constrained_decoding=use_constrained_decoding,
            constrained_decoding_debug=constrained_decoding_debug,
        )

        # An empty metadata dict signals an LM-side failure.
        if not metadata:
            return FormatSampleResult(
                status_message=status or "Failed to format input",
                success=False,
                error=status or "Empty metadata returned",
            )

        def _scrub(value):
            # The LM uses the literal 'N/A' for unknown fields; map it to ''.
            return '' if value == 'N/A' else value

        # BPM arrives as a string/number; keep None when missing or invalid.
        bpm = None
        raw_bpm = metadata.get('bpm')
        if raw_bpm is not None and raw_bpm != 'N/A' and raw_bpm != '':
            try:
                bpm = int(raw_bpm)
            except (ValueError, TypeError):
                bpm = None

        # Duration follows the same convention but converts to float.
        duration = None
        raw_duration = metadata.get('duration')
        if raw_duration is not None and raw_duration != 'N/A' and raw_duration != '':
            try:
                duration = float(raw_duration)
            except (ValueError, TypeError):
                duration = None

        return FormatSampleResult(
            caption=metadata.get('caption', ''),
            # Fall back to the caller's lyrics when the LM omits the field.
            lyrics=metadata.get('lyrics', lyrics),
            bpm=bpm,
            duration=duration,
            keyscale=_scrub(metadata.get('keyscale', '')),
            language=_scrub(metadata.get('language', metadata.get('vocal_language', ''))),
            timesignature=_scrub(metadata.get('timesignature', '')),
            status_message=status,
            success=True,
            error=None,
        )

    except Exception as e:
        logger.exception("Format sample failed")
        return FormatSampleResult(
            status_message=f"Error: {str(e)}",
            success=False,
            error=str(e),
        )
acestep/llm_inference.py ADDED
The diff for this file is too large to render. See raw diff
 
acestep/model_downloader.py ADDED
@@ -0,0 +1,634 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ACE-Step Model Downloader
3
+
4
+ This module provides functionality to download models from HuggingFace Hub or ModelScope.
5
+ It supports automatic downloading when models are not found locally,
6
+ with intelligent fallback between download sources.
7
+ """
8
+
9
+ import os
10
+ import sys
11
+ import argparse
12
+ from typing import Optional, List, Dict, Tuple
13
+ from pathlib import Path
14
+
15
+ from loguru import logger
16
+
17
+
18
+ # =============================================================================
19
+ # Network Detection & Smart Download
20
+ # =============================================================================
21
+
22
def _can_access_google(timeout: float = 3.0) -> bool:
    """
    Check if Google is reachable (used to decide whether HuggingFace Hub or
    ModelScope should be the primary download source).

    Args:
        timeout: Connection timeout in seconds.

    Returns:
        True if a TCP connection to www.google.com:443 succeeds, False otherwise.
    """
    import socket
    try:
        # create_connection applies the timeout to both DNS resolution and
        # connect, and falls back across IPv4/IPv6 addresses — the original
        # bare AF_INET socket could hang on DNS and was IPv4-only.
        with socket.create_connection(("www.google.com", 443), timeout=timeout):
            return True
    except OSError:
        # socket.timeout and socket.error are aliases of OSError, so this
        # single clause covers every failure mode the original caught.
        return False
42
+
43
+
44
def _download_from_huggingface_internal(
    repo_id: str,
    local_dir: Path,
    token: Optional[str] = None,
) -> None:
    """
    Internal function to download from HuggingFace Hub.

    Args:
        repo_id: HuggingFace repository ID (e.g., "ACE-Step/Ace-Step1.5")
        local_dir: Local directory to save the model
        token: HuggingFace token for private repos (optional)

    Raises:
        Exception: If download fails
    """
    # Deferred import so the module is usable even when huggingface_hub is
    # not installed (e.g. in ModelScope-only environments).
    from huggingface_hub import snapshot_download

    logger.info(f"[Model Download] Downloading from HuggingFace: {repo_id} -> {local_dir}")

    snapshot_download(
        repo_id=repo_id,
        local_dir=str(local_dir),
        # NOTE(review): local_dir_use_symlinks is deprecated and ignored by
        # recent huggingface_hub releases — confirm against the pinned version.
        local_dir_use_symlinks=False,
        token=token,
    )
70
+
71
+
72
def _download_from_modelscope_internal(
    repo_id: str,
    local_dir: Path,
) -> None:
    """
    Internal function to download from ModelScope.

    Args:
        repo_id: ModelScope repository ID (e.g., "ACE-Step/Ace-Step1.5")
        local_dir: Local directory to save the model

    Raises:
        Exception: If download fails
    """
    # Deferred import so the module is usable even when modelscope is not
    # installed (e.g. in HuggingFace-only environments).
    from modelscope import snapshot_download

    logger.info(f"[Model Download] Downloading from ModelScope: {repo_id} -> {local_dir}")

    # Note: ModelScope's snapshot_download takes `model_id`, not `repo_id`,
    # and has no token parameter (private repos are not supported here).
    snapshot_download(
        model_id=repo_id,
        local_dir=str(local_dir),
    )
94
+
95
+
96
def _smart_download(
    repo_id: str,
    local_dir: Path,
    token: Optional[str] = None,
    prefer_source: Optional[str] = None,
) -> Tuple[bool, str]:
    """
    Smart download with automatic fallback between HuggingFace and ModelScope.

    Automatically detects the network environment and chooses the best download
    source. If the primary source fails, automatically falls back to the
    alternative.

    Args:
        repo_id: Repository ID (same format for both HF and ModelScope)
        local_dir: Local directory to save the model
        token: HuggingFace token for private repos (optional; only used for HF)
        prefer_source: Preferred download source ("huggingface", "modelscope",
            or None for auto-detect)

    Returns:
        Tuple of (success, message)
    """
    # Ensure the destination directory exists before any download starts.
    local_dir.mkdir(parents=True, exist_ok=True)

    # Each source: (log label, short label for the combined error message,
    # name used in result messages, zero-arg downloader). Defining them once
    # removes the mirror-image branch duplication of the original: the
    # preference logic now only decides ordering.
    huggingface = (
        "HuggingFace Hub", "HF", "HuggingFace",
        lambda: _download_from_huggingface_internal(repo_id, local_dir, token),
    )
    modelscope = (
        "ModelScope", "MS", "ModelScope",
        lambda: _download_from_modelscope_internal(repo_id, local_dir),
    )

    if prefer_source == "huggingface":
        logger.info("[Model Download] User preference: HuggingFace Hub")
        attempts = [huggingface, modelscope]
    elif prefer_source == "modelscope":
        logger.info("[Model Download] User preference: ModelScope")
        attempts = [modelscope, huggingface]
    else:
        # Auto-detect: Google reachable -> prefer HuggingFace, else ModelScope.
        google_ok = _can_access_google()
        logger.info(f"[Model Download] Auto-detected: {'HuggingFace Hub' if google_ok else 'ModelScope'}")
        attempts = [huggingface, modelscope] if google_ok else [modelscope, huggingface]

    failures = []  # (short label, message name, exception) per failed attempt
    for position, (log_label, short_label, msg_name, download) in enumerate(attempts):
        logger.info(f"[Model Download] Using {log_label}...")
        try:
            download()
            return True, f"Successfully downloaded from {msg_name}: {repo_id}"
        except Exception as exc:
            failures.append((short_label, msg_name, exc))
            logger.warning(f"[Model Download] {msg_name} download failed: {exc}")
            if position + 1 < len(attempts):
                logger.info(f"[Model Download] Falling back to {attempts[position + 1][0]}...")

    # Both attempts failed: report both errors in the order they were tried.
    (short1, name1, err1), (short2, name2, err2) = failures
    error_msg = f"Both {name1} and {name2} downloads failed. {short1}: {err1}, {short2}: {err2}"
    logger.error(error_msg)
    return False, error_msg
163
+
164
+
165
# =============================================================================
# Model Registry
# =============================================================================
# Main model contains core components (vae, text_encoder, default DiT)
MAIN_MODEL_REPO = "ACE-Step/Ace-Step1.5"

# Sub-models that can be downloaded separately into the checkpoints directory.
# Keys are local folder names under checkpoints/; values are repo IDs, which
# are identical on HuggingFace Hub and ModelScope.
SUBMODEL_REGISTRY: Dict[str, str] = {
    # LM models
    "acestep-5Hz-lm-0.6B": "ACE-Step/acestep-5Hz-lm-0.6B",
    "acestep-5Hz-lm-4B": "ACE-Step/acestep-5Hz-lm-4B",
    # DiT models
    "acestep-v15-turbo-shift3": "ACE-Step/acestep-v15-turbo-shift3",
    "acestep-v15-sft": "ACE-Step/acestep-v15-sft",
    "acestep-v15-base": "ACE-Step/acestep-v15-base",
    "acestep-v15-turbo-shift1": "ACE-Step/acestep-v15-turbo-shift1",
    "acestep-v15-turbo-continuous": "ACE-Step/acestep-v15-turbo-continuous",
}

# Components that come from the main model repo (ACE-Step/Ace-Step1.5).
# check_main_model_exists() requires ALL of these directories to be present.
MAIN_MODEL_COMPONENTS = [
    "acestep-v15-turbo",  # Default DiT model
    "vae",  # VAE for audio encoding/decoding
    "Qwen3-Embedding-0.6B",  # Text encoder
    "acestep-5Hz-lm-1.7B",  # Default LM model (1.7B)
]

# Default LM model (included in main model)
DEFAULT_LM_MODEL = "acestep-5Hz-lm-1.7B"
194
+
195
+
196
def get_project_root() -> Path:
    """Return the repository root (the parent of the package directory)."""
    # __file__ lives at <root>/acestep/model_downloader.py, so the root is
    # two levels up from the resolved module path.
    return Path(__file__).resolve().parents[1]
200
+
201
+
202
def get_checkpoints_dir(custom_dir: Optional[str] = None) -> Path:
    """Resolve the checkpoints directory, honoring an optional override."""
    return Path(custom_dir) if custom_dir else get_project_root() / "checkpoints"
207
+
208
+
209
def check_main_model_exists(checkpoints_dir: Optional[Path] = None) -> bool:
    """
    Check if the main model components exist in the checkpoints directory.

    Args:
        checkpoints_dir: Custom checkpoints directory (optional; defaults to
            the project-level ``checkpoints`` directory).

    Returns:
        True if all main model components exist, False otherwise.
    """
    if checkpoints_dir is None:
        checkpoints_dir = get_checkpoints_dir()

    # All component directories must be present; a partial download counts as
    # missing so callers re-trigger the download. all() with a generator
    # short-circuits on the first missing component, like the original loop.
    return all((checkpoints_dir / component).exists() for component in MAIN_MODEL_COMPONENTS)
224
+
225
+
226
def check_model_exists(model_name: str, checkpoints_dir: Optional[Path] = None) -> bool:
    """
    Check if a specific model exists in the checkpoints directory.

    Args:
        model_name: Name of the model to check
        checkpoints_dir: Custom checkpoints directory (optional)

    Returns:
        True if the model exists, False otherwise.
    """
    base_dir = checkpoints_dir if checkpoints_dir is not None else get_checkpoints_dir()
    return (base_dir / model_name).exists()
242
+
243
+
244
def list_available_models() -> Dict[str, str]:
    """
    List all available models for download.

    Returns:
        Dictionary mapping local names to HuggingFace repo IDs (the "main"
        entry first, followed by every registered sub-model).
    """
    catalog: Dict[str, str] = {"main": MAIN_MODEL_REPO}
    catalog.update(SUBMODEL_REGISTRY)
    return catalog
256
+
257
+
258
def download_main_model(
    checkpoints_dir: Optional[Path] = None,
    force: bool = False,
    token: Optional[str] = None,
    prefer_source: Optional[str] = None,
) -> Tuple[bool, str]:
    """
    Download the main ACE-Step model from HuggingFace or ModelScope.

    The main model bundle contains the default DiT (acestep-v15-turbo), the
    vae, the Qwen3-Embedding-0.6B text encoder, and the default 1.7B LM.

    Args:
        checkpoints_dir: Custom checkpoints directory (optional)
        force: Force re-download even if model exists
        token: HuggingFace token for private repos (optional)
        prefer_source: Preferred download source ("huggingface", "modelscope",
            or None for auto-detect)

    Returns:
        Tuple of (success, message)
    """
    target_dir = checkpoints_dir if checkpoints_dir is not None else get_checkpoints_dir()
    target_dir.mkdir(parents=True, exist_ok=True)

    # Skip the download entirely when everything is already on disk.
    if not force and check_main_model_exists(target_dir):
        return True, f"Main model already exists at {target_dir}"

    for line in (
        f"Downloading main model from {MAIN_MODEL_REPO}...",
        f"Destination: {target_dir}",
        "This may take a while depending on your internet connection...",
    ):
        print(line)

    # Delegate to the smart downloader (handles source choice + fallback).
    return _smart_download(MAIN_MODEL_REPO, target_dir, token, prefer_source)
297
+
298
+
299
def download_submodel(
    model_name: str,
    checkpoints_dir: Optional[Path] = None,
    force: bool = False,
    token: Optional[str] = None,
    prefer_source: Optional[str] = None,
) -> Tuple[bool, str]:
    """
    Download a specific sub-model from HuggingFace or ModelScope.

    Args:
        model_name: Name of the model to download (must be in SUBMODEL_REGISTRY)
        checkpoints_dir: Custom checkpoints directory (optional)
        force: Force re-download even if model exists
        token: HuggingFace token for private repos (optional)
        prefer_source: Preferred download source ("huggingface", "modelscope",
            or None for auto-detect)

    Returns:
        Tuple of (success, message)
    """
    # EAFP lookup: an unknown name yields the same error message as before.
    try:
        repo_id = SUBMODEL_REGISTRY[model_name]
    except KeyError:
        available = ", ".join(SUBMODEL_REGISTRY.keys())
        return False, f"Unknown model '{model_name}'. Available models: {available}"

    base_dir = checkpoints_dir if checkpoints_dir is not None else get_checkpoints_dir()
    base_dir.mkdir(parents=True, exist_ok=True)

    model_path = base_dir / model_name
    if model_path.exists() and not force:
        return True, f"Model '{model_name}' already exists at {model_path}"

    print(f"Downloading {model_name} from {repo_id}...")
    print(f"Destination: {model_path}")

    # Delegate to the smart downloader (handles source choice + fallback).
    return _smart_download(repo_id, model_path, token, prefer_source)
341
+
342
+
343
def download_all_models(
    checkpoints_dir: Optional[Path] = None,
    force: bool = False,
    token: Optional[str] = None,
    prefer_source: Optional[str] = None,
) -> Tuple[bool, List[str]]:
    """
    Download all available models (main model plus every registered sub-model).

    Args:
        checkpoints_dir: Custom checkpoints directory (optional)
        force: Force re-download even if models exist
        token: HuggingFace token for private repos (optional)
        prefer_source: Preferred download source ("huggingface", "modelscope",
            or None for auto-detect). New, defaulted parameter added for
            consistency with the other download helpers; existing callers
            are unaffected.

    Returns:
        Tuple of (all_success, list of messages) — one message per model,
        main model first, then sub-models in registry order.
    """
    if checkpoints_dir is None:
        checkpoints_dir = get_checkpoints_dir()

    messages: List[str] = []
    all_success = True

    # Download main model first: sub-models depend on its shared components.
    success, msg = download_main_model(checkpoints_dir, force, token, prefer_source)
    messages.append(msg)
    all_success = all_success and success

    # Continue through the registry even after a failure so the caller gets a
    # complete per-model status report.
    for model_name in SUBMODEL_REGISTRY:
        success, msg = download_submodel(model_name, checkpoints_dir, force, token, prefer_source)
        messages.append(msg)
        all_success = all_success and success

    return all_success, messages
379
+
380
+
381
def ensure_main_model(
    checkpoints_dir: Optional[Path] = None,
    token: Optional[str] = None,
    prefer_source: Optional[str] = None,
) -> Tuple[bool, str]:
    """
    Ensure the main model is available, downloading it only when missing.

    Intended for use during initialization: a no-op when the model is
    already on disk.

    Args:
        checkpoints_dir: Custom checkpoints directory (optional)
        token: HuggingFace token for private repos (optional)
        prefer_source: Preferred download source ("huggingface", "modelscope",
            or None for auto-detect)

    Returns:
        Tuple of (success, message)
    """
    target_dir = checkpoints_dir if checkpoints_dir is not None else get_checkpoints_dir()

    if check_main_model_exists(target_dir):
        return True, "Main model is available"

    banner = "=" * 60
    print(f"\n{banner}")
    print("Main model not found. Starting automatic download...")
    print(f"{banner}\n")

    return download_main_model(target_dir, token=token, prefer_source=prefer_source)
411
+
412
+
413
def ensure_lm_model(
    model_name: Optional[str] = None,
    checkpoints_dir: Optional[Path] = None,
    token: Optional[str] = None,
    prefer_source: Optional[str] = None,
) -> Tuple[bool, str]:
    """
    Ensure an LM model is available, downloading it only when missing.

    Args:
        model_name: Name of the LM model (defaults to DEFAULT_LM_MODEL)
        checkpoints_dir: Custom checkpoints directory (optional)
        token: HuggingFace token for private repos (optional)
        prefer_source: Preferred download source ("huggingface", "modelscope",
            or None for auto-detect)

    Returns:
        Tuple of (success, message)
    """
    resolved_name = model_name if model_name is not None else DEFAULT_LM_MODEL
    base_dir = checkpoints_dir if checkpoints_dir is not None else get_checkpoints_dir()

    if check_model_exists(resolved_name, base_dir):
        return True, f"LM model '{resolved_name}' is available"

    if resolved_name not in SUBMODEL_REGISTRY:
        # Fuzzy match: accept a variant name that is a substring of a
        # registered LM model (first match in registry order wins).
        needle = resolved_name.lower()
        match = next(
            (known for known in SUBMODEL_REGISTRY
             if "lm" in known.lower() and needle in known.lower()),
            None,
        )
        if match is None:
            return False, f"Unknown LM model: {resolved_name}"
        resolved_name = match

    banner = "=" * 60
    print(f"\n{banner}")
    print(f"LM model '{resolved_name}' not found. Starting automatic download...")
    print(f"{banner}\n")

    return download_submodel(resolved_name, base_dir, token=token, prefer_source=prefer_source)
455
+
456
+
457
def ensure_dit_model(
    model_name: str,
    checkpoints_dir: Optional[Path] = None,
    token: Optional[str] = None,
    prefer_source: Optional[str] = None,
) -> Tuple[bool, str]:
    """
    Ensure a DiT model is available, downloading it only when missing.

    Args:
        model_name: Name of the DiT model
        checkpoints_dir: Custom checkpoints directory (optional)
        token: HuggingFace token for private repos (optional)
        prefer_source: Preferred download source ("huggingface", "modelscope",
            or None for auto-detect)

    Returns:
        Tuple of (success, message)
    """
    base_dir = checkpoints_dir if checkpoints_dir is not None else get_checkpoints_dir()

    if check_model_exists(model_name, base_dir):
        return True, f"DiT model '{model_name}' is available"

    # The default turbo DiT ships inside the main model bundle.
    if model_name == "acestep-v15-turbo":
        return ensure_main_model(base_dir, token, prefer_source)

    # Guard clause: bail out early on names we cannot download.
    if model_name not in SUBMODEL_REGISTRY:
        return False, f"Unknown DiT model: {model_name}"

    banner = "=" * 60
    print(f"\n{banner}")
    print(f"DiT model '{model_name}' not found. Starting automatic download...")
    print(f"{banner}\n")
    return download_submodel(model_name, base_dir, token=token, prefer_source=prefer_source)
493
+
494
+
495
def print_model_list():
    """Print a formatted catalog of all downloadable models."""
    separator = "=" * 60
    # Partition the registry once instead of filtering it twice inline.
    lm_models = {n: r for n, r in SUBMODEL_REGISTRY.items() if "lm" in n.lower()}
    dit_models = {n: r for n, r in SUBMODEL_REGISTRY.items() if "lm" not in n.lower()}

    print("\nAvailable Models for Download:")
    print(separator)
    print("\nSupported Sources: HuggingFace Hub <-> ModelScope (auto-fallback)")

    print("\n[Main Model]")
    print(f" main -> {MAIN_MODEL_REPO}")
    print(" Contains: vae, Qwen3-Embedding-0.6B, acestep-v15-turbo, acestep-5Hz-lm-1.7B")

    print("\n[Optional LM Models]")
    for name, repo in lm_models.items():
        print(f" {name} -> {repo}")

    print("\n[Optional DiT Models]")
    for name, repo in dit_models.items():
        print(f" {name} -> {repo}")

    print("\n" + separator)
516
+
517
+
518
def main():
    """CLI entry point for model downloading.

    Returns:
        Process exit code: 0 on success, 1 on failure.
    """
    parser = argparse.ArgumentParser(
        description="Download ACE-Step models with automatic fallback (HuggingFace <-> ModelScope)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  acestep-download                           # Download main model (includes LM 1.7B)
  acestep-download --all                     # Download all available models
  acestep-download --model acestep-v15-sft   # Download a specific model
  acestep-download --list                    # List all available models

Network Detection:
  Automatically detects network environment and chooses the best download source:
  - Google accessible -> HuggingFace (fallback to ModelScope)
  - Google blocked    -> ModelScope (fallback to HuggingFace)

Alternative using huggingface-cli:
  huggingface-cli download ACE-Step/Ace-Step1.5 --local-dir ./checkpoints
  huggingface-cli download ACE-Step/acestep-5Hz-lm-0.6B --local-dir ./checkpoints/acestep-5Hz-lm-0.6B
"""
    )

    parser.add_argument(
        "--model", "-m",
        type=str,
        help="Specific model to download (use --list to see available models)"
    )
    parser.add_argument(
        "--all", "-a",
        action="store_true",
        help="Download all available models"
    )
    parser.add_argument(
        "--list", "-l",
        action="store_true",
        help="List all available models"
    )
    parser.add_argument(
        "--dir", "-d",
        type=str,
        default=None,
        help="Custom checkpoints directory (default: ./checkpoints)"
    )
    parser.add_argument(
        "--force", "-f",
        action="store_true",
        help="Force re-download even if model exists"
    )
    parser.add_argument(
        "--token", "-t",
        type=str,
        default=None,
        help="HuggingFace token for private repos"
    )
    parser.add_argument(
        "--skip-main",
        action="store_true",
        help="Skip downloading the main model (only download specified sub-model)"
    )
    # New backward-compatible flag exposing the prefer_source API already
    # supported by the download functions; default None keeps auto-detect.
    parser.add_argument(
        "--source", "-s",
        type=str,
        choices=("huggingface", "modelscope"),
        default=None,
        help="Preferred download source (default: auto-detect by network)"
    )

    args = parser.parse_args()

    # Handle --list
    if args.list:
        print_model_list()
        return 0

    # get_checkpoints_dir already handles a None/empty custom dir, so the
    # original `if args.dir else` branching was redundant.
    checkpoints_dir = get_checkpoints_dir(args.dir)
    print(f"Checkpoints directory: {checkpoints_dir}")

    # Handle --all
    if args.all:
        success, messages = download_all_models(checkpoints_dir, args.force, args.token)
        for msg in messages:
            print(msg)
        return 0 if success else 1

    # Handle --model
    if args.model:
        if args.model == "main":
            success, msg = download_main_model(checkpoints_dir, args.force, args.token, args.source)
        elif args.model in SUBMODEL_REGISTRY:
            # Download main model first if needed (unless --skip-main), since
            # sub-models rely on the shared vae/text-encoder components.
            if not args.skip_main and not check_main_model_exists(checkpoints_dir):
                print("Main model not found. Downloading main model first...")
                main_success, main_msg = download_main_model(checkpoints_dir, args.force, args.token, args.source)
                print(main_msg)
                if not main_success:
                    return 1

            success, msg = download_submodel(args.model, checkpoints_dir, args.force, args.token, args.source)
        else:
            print(f"Unknown model: {args.model}")
            print("Use --list to see available models")
            return 1

        print(msg)
        return 0 if success else 1

    # Default: download main model (includes default LM 1.7B)
    print("Downloading main model (includes vae, text encoder, DiT, and LM 1.7B)...")

    success, msg = download_main_model(checkpoints_dir, args.force, args.token, args.source)
    print(msg)

    if success:
        print("\nDownload complete!")
        print(f"Models are available at: {checkpoints_dir}")

    return 0 if success else 1


if __name__ == "__main__":
    sys.exit(main())
handler.py CHANGED
@@ -1,15 +1,13 @@
1
  # handler.py
2
  import base64
3
- import inspect
4
  import io
5
  import os
6
  import traceback
7
- from typing import Any, Dict, Tuple
8
 
9
  import numpy as np
10
  import soundfile as sf
11
 
12
- # Optional torch import for dtype/device handling
13
  try:
14
  import torch
15
  except Exception:
@@ -20,7 +18,7 @@ class EndpointHandler:
20
  """
21
  Hugging Face Inference Endpoints custom handler for ACE-Step 1.5.
22
 
23
- Request body shape:
24
  {
25
  "inputs": {
26
  "prompt": "upbeat pop rap, emotional guitar",
@@ -29,130 +27,144 @@ class EndpointHandler:
29
  "sample_rate": 44100,
30
  "seed": 42,
31
  "guidance_scale": 7.0,
32
- "steps": 50,
33
  "use_lm": true,
34
  "simple_prompt": false,
35
- "model_repo": "ACE-Step/Ace-Step1.5"
 
36
  }
37
  }
38
 
39
- Also supported for simple mode:
40
  {
41
  "inputs": "upbeat pop rap with emotional guitar"
42
  }
43
 
44
- Response:
45
- {
46
- "audio_base64_wav": "...",
47
- "sample_rate": 44100,
48
- "duration_sec": 12,
49
- "used_fallback": false,
50
- "model_loaded": true,
51
- "model_error": null,
52
- "meta": {...}
53
- }
54
  """
55
 
56
  def __init__(self, path: str = ""):
57
  self.path = path
58
- self.model = None
59
- self.model_error = None
60
  self.model_repo = os.getenv("ACE_MODEL_REPO", "ACE-Step/Ace-Step1.5")
 
 
 
 
 
61
  self.default_sr = int(os.getenv("DEFAULT_SAMPLE_RATE", "44100"))
 
 
 
62
 
63
- # Runtime knobs
64
  self.device = "cuda" if (torch is not None and torch.cuda.is_available()) else "cpu"
65
  self.dtype = "float16" if self.device == "cuda" else "float32"
66
 
67
- # Try to initialize ACE-Step pipeline from repo code paths.
68
- # Repo mentions Python API and module path `acestep.acestep_v15_pipeline`.
69
- self._init_model()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
  # --------------------------
72
- # Initialization helpers
73
  # --------------------------
74
  def _init_model(self) -> None:
75
  err_msgs = []
76
 
77
- # Strategy A: class/factory in acestep.acestep_v15_pipeline
78
  try:
79
- from acestep import acestep_v15_pipeline as m # type: ignore
80
-
81
- # Try common factory patterns
82
- if hasattr(m, "from_pretrained"):
83
- self.model = m.from_pretrained(self.model_repo) # type: ignore
84
- elif hasattr(m, "AceStepV15Pipeline"):
85
- cls = getattr(m, "AceStepV15Pipeline")
86
- if hasattr(cls, "from_pretrained"):
87
- self.model = cls.from_pretrained(self.model_repo)
88
- else:
89
- self.model = cls(model_path=self.model_repo)
90
- elif hasattr(m, "Pipeline"):
91
- cls = getattr(m, "Pipeline")
92
- if hasattr(cls, "from_pretrained"):
93
- self.model = cls.from_pretrained(self.model_repo)
94
- else:
95
- self.model = cls(self.model_repo)
96
- else:
97
- raise RuntimeError("No known pipeline class/factory found in acestep_v15_pipeline")
98
 
99
- # Move device if supported
100
- if self.model is not None and hasattr(self.model, "to"):
101
- try:
102
- self.model.to(self.device)
103
- except Exception:
104
- pass
105
 
106
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  except Exception as e:
108
- err_msgs.append(f"Strategy A failed: {type(e).__name__}: {e}")
109
 
110
- # Strategy B: import root `acestep` and find a likely pipeline symbol
111
  try:
112
- import acestep # type: ignore
113
-
114
- candidates = [
115
- "AceStepV15Pipeline",
116
- "AceStepPipeline",
117
- "Pipeline",
118
- "create_pipeline",
119
- "build_pipeline",
120
- "load_pipeline",
121
- ]
122
-
123
- obj = None
124
- for name in candidates:
125
- if hasattr(acestep, name):
126
- obj = getattr(acestep, name)
127
- break
128
-
129
- if obj is None:
130
- raise RuntimeError("No known pipeline symbol found in `acestep` package")
131
-
132
- if callable(obj):
133
- # class or factory
134
- if hasattr(obj, "from_pretrained"):
135
- self.model = obj.from_pretrained(self.model_repo)
136
- else:
137
- # try keyword variants
138
- try:
139
- self.model = obj(model_path=self.model_repo)
140
- except TypeError:
141
- self.model = obj(self.model_repo)
142
- else:
143
- self.model = obj
144
-
145
- if self.model is not None and hasattr(self.model, "to"):
146
- try:
147
- self.model.to(self.device)
148
- except Exception:
149
- pass
150
- return
151
  except Exception as e:
152
- err_msgs.append(f"Strategy B failed: {type(e).__name__}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
- self.model = None
155
- self.model_error = " | ".join(err_msgs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
  # --------------------------
158
  # Audio helpers
@@ -166,7 +178,6 @@ class EndpointHandler:
166
  else:
167
  arr = np.asarray(audio)
168
 
169
- # Convert common tensor shape [channels, samples] to [samples, channels].
170
  if arr.ndim == 2 and arr.shape[0] in (1, 2) and arr.shape[1] > arr.shape[0]:
171
  arr = arr.T
172
 
@@ -188,6 +199,9 @@ class EndpointHandler:
188
  y = (0.07 * np.sin(2 * np.pi * 440 * t) + 0.01 * rng.standard_normal(len(t))).astype(np.float32)
189
  return np.clip(y, -1.0, 1.0)
190
 
 
 
 
191
  @staticmethod
192
  def _to_bool(value: Any, default: bool = False) -> bool:
193
  if value is None:
@@ -242,39 +256,18 @@ class EndpointHandler:
242
  if not lyrics and (instrumental or simple_prompt):
243
  lyrics = "[Instrumental]"
244
 
245
- duration_sec = self._to_int(raw_inputs.get("duration_sec", raw_inputs.get("duration", 10)), 10)
246
- duration_sec = max(1, min(duration_sec, 600))
247
 
248
  sample_rate = self._to_int(raw_inputs.get("sample_rate", self.default_sr), self.default_sr)
249
  sample_rate = max(8000, min(sample_rate, 48000))
250
 
251
  seed = self._to_int(raw_inputs.get("seed", 42), 42)
252
  guidance_scale = self._to_float(raw_inputs.get("guidance_scale", 7.0), 7.0)
253
- steps = self._to_int(raw_inputs.get("steps", raw_inputs.get("inference_steps", 50)), 50)
254
- steps = max(1, min(steps, 500))
255
  use_lm = self._to_bool(raw_inputs.get("use_lm", raw_inputs.get("thinking", True)), True)
256
- task_type = self._pick_text(raw_inputs, "task_type") or "text2music"
257
-
258
- model_repo = raw_inputs.get("model_repo")
259
-
260
- model_kwargs = {
261
- "task_type": task_type,
262
- "prompt": prompt,
263
- "caption": prompt,
264
- "query": prompt,
265
- "lyrics": lyrics,
266
- "duration_sec": duration_sec,
267
- "duration": duration_sec,
268
- "sample_rate": sample_rate,
269
- "seed": seed,
270
- "guidance_scale": guidance_scale,
271
- "steps": steps,
272
- "inference_steps": steps,
273
- "num_inference_steps": steps,
274
- "use_lm": use_lm,
275
- "thinking": use_lm,
276
- "instrumental": instrumental,
277
- }
278
 
279
  return {
280
  "prompt": prompt,
@@ -287,165 +280,144 @@ class EndpointHandler:
287
  "use_lm": use_lm,
288
  "instrumental": instrumental,
289
  "simple_prompt": simple_prompt,
290
- "model_repo": model_repo,
291
- "model_kwargs": model_kwargs,
292
  }
293
 
294
- @staticmethod
295
- def _invoke_with_supported_kwargs(fn: Any, kwargs: Dict[str, Any]) -> Any:
296
- try:
297
- sig = inspect.signature(fn)
298
- has_var_kw = any(p.kind == inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values())
299
- if has_var_kw:
300
- return fn(**kwargs)
301
- accepted = {
302
- name
303
- for name, p in sig.parameters.items()
304
- if p.kind in (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY)
305
- }
306
- filtered = {k: v for k, v in kwargs.items() if k in accepted}
307
- return fn(**filtered)
308
- except Exception:
309
- # Fallback for C-extension callables or dynamic signatures.
310
- return fn(**kwargs)
311
-
312
- def _normalize_model_output(self, out: Any, default_sr: int) -> Tuple[np.ndarray, int]:
313
- if out is None:
314
- raise RuntimeError("Model returned None")
315
-
316
- if hasattr(out, "success") and not getattr(out, "success"):
317
- err = getattr(out, "error", "unknown model error")
318
- raise RuntimeError(str(err))
319
-
320
- if hasattr(out, "audios"):
321
- audios = getattr(out, "audios") or []
322
- if not audios:
323
- raise RuntimeError("Model result has no audios")
324
- first = audios[0]
325
- if isinstance(first, dict):
326
- audio = first.get("tensor", first.get("audio", first.get("waveform", first.get("wav"))))
327
- sr = first.get("sample_rate", default_sr)
328
- else:
329
- audio = getattr(first, "tensor", getattr(first, "audio", None))
330
- sr = getattr(first, "sample_rate", default_sr)
331
- if audio is None:
332
- raise RuntimeError("Model result audio entry is missing tensor/audio")
333
- return self._as_float32(audio), int(sr)
334
-
335
- if isinstance(out, tuple) and len(out) >= 1:
336
- audio = out[0]
337
- sr = int(out[1]) if len(out) > 1 and out[1] is not None else default_sr
338
- return self._as_float32(audio), sr
339
-
340
- if isinstance(out, dict):
341
- if "audios" in out:
342
- audios = out.get("audios") or []
343
- if not audios:
344
- raise RuntimeError("Model output `audios` is empty")
345
- first = audios[0]
346
- if not isinstance(first, dict):
347
- raise RuntimeError("Model output `audios[0]` must be a dict")
348
- audio = first.get("tensor", first.get("audio", first.get("waveform", first.get("wav"))))
349
- sr = first.get("sample_rate", default_sr)
350
- if audio is None:
351
- raise RuntimeError("Model output `audios[0]` missing tensor/audio")
352
- return self._as_float32(audio), int(sr)
353
-
354
- audio = out.get("audio", out.get("waveform", out.get("wav", out.get("tensor"))))
355
- sr = out.get("sample_rate", out.get("sr", default_sr))
356
- if audio is None:
357
- raise RuntimeError("Model dict output missing audio/waveform field")
358
- return self._as_float32(audio), int(sr)
359
-
360
- for name in ("audio", "waveform", "wav", "tensor"):
361
- if hasattr(out, name):
362
- audio = getattr(out, name)
363
- if audio is not None:
364
- sr = getattr(out, "sample_rate", getattr(out, "sr", default_sr))
365
- return self._as_float32(audio), int(sr)
366
-
367
- return self._as_float32(out), default_sr
368
-
369
  # --------------------------
370
- # Inference
371
  # --------------------------
372
- def _call_model(
373
- self,
374
- model_kwargs: Dict[str, Any],
375
- sample_rate: int,
376
- ) -> Tuple[np.ndarray, int]:
377
- """
378
- Tries multiple invocation styles to tolerate minor ACE-Step API differences.
379
- Returns (audio_np, sample_rate).
380
- """
381
- if self.model is None:
382
- raise RuntimeError("Model is not loaded")
383
-
384
- # Common callable entrypoints
385
- methods = [
386
- "__call__",
387
- "generate",
388
- "infer",
389
- "inference",
390
- "text_to_music",
391
- "run",
392
- ]
393
-
394
- last_err = None
395
- for m in methods:
396
- try:
397
- fn = self.model if m == "__call__" else getattr(self.model, m, None)
398
- if fn is None:
399
- continue
400
-
401
- # Try full kwargs
402
- try:
403
- out = self._invoke_with_supported_kwargs(fn, model_kwargs)
404
- except TypeError:
405
- # Narrow payload if signature is strict
406
- skinny = {
407
- "prompt": model_kwargs.get("prompt"),
408
- "caption": model_kwargs.get("caption"),
409
- "lyrics": model_kwargs.get("lyrics"),
410
- "duration": model_kwargs.get("duration"),
411
- "seed": model_kwargs.get("seed"),
412
- }
413
- skinny = {k: v for k, v in skinny.items() if v is not None and (k != "prompt" or str(v).strip())}
414
- out = self._invoke_with_supported_kwargs(fn, skinny)
415
-
416
- return self._normalize_model_output(out, sample_rate)
417
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418
  except Exception as e:
419
- last_err = e
420
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421
 
422
- raise RuntimeError(f"No compatible inference method worked: {last_err}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
423
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424
  def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
425
  try:
426
  req = self._normalize_request(data)
427
 
428
- # Optional override
429
- model_repo = req.get("model_repo")
430
- if model_repo and model_repo != self.model_repo:
431
- # hot-switch model only if user asks
432
- self.model_repo = str(model_repo)
433
- self._init_model()
434
-
435
  used_fallback = False
 
 
 
 
 
 
 
 
436
 
437
- if self.model is not None:
438
- try:
439
- audio, out_sr = self._call_model(
440
- model_kwargs=req["model_kwargs"],
441
- sample_rate=req["sample_rate"],
442
- )
443
- except Exception as e:
444
- used_fallback = True
445
- self.model_error = f"Inference failed: {type(e).__name__}: {e}"
446
- audio = self._fallback_sine(req["duration_sec"], req["sample_rate"], req["seed"])
447
- out_sr = req["sample_rate"]
448
- else:
449
  used_fallback = True
450
  audio = self._fallback_sine(req["duration_sec"], req["sample_rate"], req["seed"])
451
  out_sr = req["sample_rate"]
@@ -455,7 +427,7 @@ class EndpointHandler:
455
  "sample_rate": int(out_sr),
456
  "duration_sec": int(req["duration_sec"]),
457
  "used_fallback": used_fallback,
458
- "model_loaded": self.model is not None,
459
  "model_repo": self.model_repo,
460
  "model_error": self.model_error,
461
  "meta": {
@@ -469,16 +441,34 @@ class EndpointHandler:
469
  "use_lm": req["use_lm"],
470
  "simple_prompt": req["simple_prompt"],
471
  "instrumental": req["instrumental"],
472
- "resolved_prompt": req["prompt"],
473
- "resolved_lyrics": req["lyrics"],
 
 
 
 
 
 
 
 
474
  },
475
  }
476
 
477
  except Exception as e:
478
  return {
479
  "error": f"{type(e).__name__}: {e}",
480
- "traceback": traceback.format_exc(limit=3),
481
  "audio_base64_wav": None,
482
  "sample_rate": None,
483
  "duration_sec": None,
484
- }
 
 
 
 
 
 
 
 
 
 
 
1
  # handler.py
2
  import base64
 
3
  import io
4
  import os
5
  import traceback
6
+ from typing import Any, Dict, Optional, Tuple
7
 
8
  import numpy as np
9
  import soundfile as sf
10
 
 
11
  try:
12
  import torch
13
  except Exception:
 
18
  """
19
  Hugging Face Inference Endpoints custom handler for ACE-Step 1.5.
20
 
21
+ Supported request shapes:
22
  {
23
  "inputs": {
24
  "prompt": "upbeat pop rap, emotional guitar",
 
27
  "sample_rate": 44100,
28
  "seed": 42,
29
  "guidance_scale": 7.0,
30
+ "steps": 8,
31
  "use_lm": true,
32
  "simple_prompt": false,
33
+ "instrumental": false,
34
+ "allow_fallback": false
35
  }
36
  }
37
 
38
+ Or simple mode:
39
  {
40
  "inputs": "upbeat pop rap with emotional guitar"
41
  }
42
 
43
+ Notes:
44
+ - This handler uses ACE-Step's official Python API internally.
45
+ - Fallback sine generation is disabled by default so model failures are explicit.
 
 
 
 
 
 
 
46
  """
47
 
48
  def __init__(self, path: str = ""):
49
  self.path = path
50
+ self.project_root = os.path.dirname(os.path.abspath(__file__))
51
+
52
  self.model_repo = os.getenv("ACE_MODEL_REPO", "ACE-Step/Ace-Step1.5")
53
+ self.config_path = os.getenv("ACE_CONFIG_PATH", "acestep-v15-turbo")
54
+ self.lm_model_path = os.getenv("ACE_LM_MODEL_PATH", "acestep-5Hz-lm-1.7B")
55
+ self.lm_backend = os.getenv("ACE_LM_BACKEND", "pt")
56
+ self.download_source = os.getenv("ACE_DOWNLOAD_SOURCE", "huggingface")
57
+
58
  self.default_sr = int(os.getenv("DEFAULT_SAMPLE_RATE", "44100"))
59
+ self.enable_fallback = self._to_bool(os.getenv("ACE_ENABLE_FALLBACK"), False)
60
+ self.init_lm_on_start = self._to_bool(os.getenv("ACE_INIT_LLM"), False)
61
+ self.skip_init = self._to_bool(os.getenv("ACE_SKIP_INIT"), False)
62
 
 
63
  self.device = "cuda" if (torch is not None and torch.cuda.is_available()) else "cpu"
64
  self.dtype = "float16" if self.device == "cuda" else "float32"
65
 
66
+ self.model_loaded = False
67
+ self.model_error: Optional[str] = None
68
+ self.init_details: Dict[str, Any] = {}
69
+
70
+ self.dit_handler = None
71
+ self.llm_handler = None
72
+ self.llm_initialized = False
73
+ self.llm_error: Optional[str] = None
74
+
75
+ self._GenerationParams = None
76
+ self._GenerationConfig = None
77
+ self._generate_music = None
78
+ self._create_sample = None
79
+
80
+ if self.skip_init:
81
+ self.model_error = "Initialization skipped because ACE_SKIP_INIT=true"
82
+ else:
83
+ self._init_model()
84
 
85
  # --------------------------
86
+ # Initialization
87
  # --------------------------
88
  def _init_model(self) -> None:
89
  err_msgs = []
90
 
 
91
  try:
92
+ from acestep.handler import AceStepHandler
93
+ from acestep.inference import GenerationConfig, GenerationParams, create_sample, generate_music
94
+ from acestep.llm_inference import LLMHandler
95
+ except Exception as e:
96
+ self.model_error = f"ACE-Step import failed: {type(e).__name__}: {e}"
97
+ return
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
+ self._GenerationParams = GenerationParams
100
+ self._GenerationConfig = GenerationConfig
101
+ self._generate_music = generate_music
102
+ self._create_sample = create_sample
 
 
103
 
104
+ try:
105
+ self.dit_handler = AceStepHandler()
106
+ prefer_source = self.download_source if self.download_source in {"huggingface", "modelscope"} else None
107
+ init_status, ok = self.dit_handler.initialize_service(
108
+ project_root=self.project_root,
109
+ config_path=self.config_path,
110
+ device=self.device,
111
+ use_flash_attention=False,
112
+ compile_model=False,
113
+ offload_to_cpu=False,
114
+ offload_dit_to_cpu=False,
115
+ prefer_source=prefer_source,
116
+ )
117
+ self.init_details["dit_status"] = init_status
118
+ if not ok:
119
+ raise RuntimeError(init_status)
120
  except Exception as e:
121
+ err_msgs.append(f"DiT init failed: {type(e).__name__}: {e}")
122
 
 
123
  try:
124
+ self.llm_handler = LLMHandler()
125
+ if self.init_lm_on_start:
126
+ self._ensure_llm_initialized()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  except Exception as e:
128
+ err_msgs.append(f"LLM bootstrap failed: {type(e).__name__}: {e}")
129
+
130
+ if err_msgs:
131
+ self.model_loaded = False
132
+ self.model_error = " | ".join(err_msgs)
133
+ return
134
+
135
+ self.model_loaded = True
136
+ self.model_error = None
137
+
138
+ def _ensure_llm_initialized(self) -> bool:
139
+ if self.llm_handler is None:
140
+ self.llm_error = "LLM handler is not available"
141
+ return False
142
+
143
+ if self.llm_initialized:
144
+ return True
145
 
146
+ try:
147
+ checkpoint_dir = os.path.join(self.project_root, "checkpoints")
148
+ status, ok = self.llm_handler.initialize(
149
+ checkpoint_dir=checkpoint_dir,
150
+ lm_model_path=self.lm_model_path,
151
+ backend=self.lm_backend,
152
+ device=self.device,
153
+ offload_to_cpu=False,
154
+ )
155
+ self.init_details["llm_status"] = status
156
+ if not ok:
157
+ self.llm_error = status
158
+ self.llm_initialized = False
159
+ return False
160
+
161
+ self.llm_error = None
162
+ self.llm_initialized = True
163
+ return True
164
+ except Exception as e:
165
+ self.llm_error = f"LLM init exception: {type(e).__name__}: {e}"
166
+ self.llm_initialized = False
167
+ return False
168
 
169
  # --------------------------
170
  # Audio helpers
 
178
  else:
179
  arr = np.asarray(audio)
180
 
 
181
  if arr.ndim == 2 and arr.shape[0] in (1, 2) and arr.shape[1] > arr.shape[0]:
182
  arr = arr.T
183
 
 
199
  y = (0.07 * np.sin(2 * np.pi * 440 * t) + 0.01 * rng.standard_normal(len(t))).astype(np.float32)
200
  return np.clip(y, -1.0, 1.0)
201
 
202
+ # --------------------------
203
+ # Request normalization
204
+ # --------------------------
205
  @staticmethod
206
  def _to_bool(value: Any, default: bool = False) -> bool:
207
  if value is None:
 
256
  if not lyrics and (instrumental or simple_prompt):
257
  lyrics = "[Instrumental]"
258
 
259
+ duration_sec = self._to_int(raw_inputs.get("duration_sec", raw_inputs.get("duration", 12)), 12)
260
+ duration_sec = max(10, min(duration_sec, 600))
261
 
262
  sample_rate = self._to_int(raw_inputs.get("sample_rate", self.default_sr), self.default_sr)
263
  sample_rate = max(8000, min(sample_rate, 48000))
264
 
265
  seed = self._to_int(raw_inputs.get("seed", 42), 42)
266
  guidance_scale = self._to_float(raw_inputs.get("guidance_scale", 7.0), 7.0)
267
+ steps = self._to_int(raw_inputs.get("steps", raw_inputs.get("inference_steps", 8)), 8)
268
+ steps = max(1, min(steps, 200))
269
  use_lm = self._to_bool(raw_inputs.get("use_lm", raw_inputs.get("thinking", True)), True)
270
+ allow_fallback = self._to_bool(raw_inputs.get("allow_fallback"), self.enable_fallback)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
  return {
273
  "prompt": prompt,
 
280
  "use_lm": use_lm,
281
  "instrumental": instrumental,
282
  "simple_prompt": simple_prompt,
283
+ "allow_fallback": allow_fallback,
 
284
  }
285
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
  # --------------------------
287
+ # ACE-Step invocation
288
  # --------------------------
289
+ def _build_generation_inputs(self, req: Dict[str, Any], llm_ready: bool) -> Tuple[Dict[str, Any], Dict[str, Any]]:
290
+ caption = req["prompt"]
291
+ lyrics = req["lyrics"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
293
+ extras: Dict[str, Any] = {
294
+ "simple_expansion_used": False,
295
+ "simple_expansion_error": None,
296
+ }
297
+
298
+ bpm = None
299
+ keyscale = ""
300
+ timesignature = ""
301
+ vocal_language = "unknown"
302
+ duration = float(req["duration_sec"])
303
+
304
+ if req["simple_prompt"] and req["use_lm"] and llm_ready and caption:
305
+ try:
306
+ sample = self._create_sample(
307
+ llm_handler=self.llm_handler,
308
+ query=caption,
309
+ instrumental=req["instrumental"],
310
+ )
311
+ if getattr(sample, "success", False):
312
+ caption = getattr(sample, "caption", "") or caption
313
+ lyrics = getattr(sample, "lyrics", "") or lyrics
314
+ bpm = getattr(sample, "bpm", None)
315
+ keyscale = getattr(sample, "keyscale", "") or ""
316
+ timesignature = getattr(sample, "timesignature", "") or ""
317
+ vocal_language = getattr(sample, "language", "") or "unknown"
318
+ sample_duration = getattr(sample, "duration", None)
319
+ if sample_duration:
320
+ duration = float(sample_duration)
321
+ extras["simple_expansion_used"] = True
322
+ else:
323
+ extras["simple_expansion_error"] = getattr(sample, "error", "create_sample failed")
324
  except Exception as e:
325
+ extras["simple_expansion_error"] = f"{type(e).__name__}: {e}"
326
+
327
+ params = self._GenerationParams(
328
+ task_type="text2music",
329
+ caption=caption,
330
+ lyrics=lyrics,
331
+ instrumental=req["instrumental"],
332
+ duration=duration,
333
+ inference_steps=req["steps"],
334
+ guidance_scale=req["guidance_scale"],
335
+ seed=req["seed"],
336
+ bpm=bpm,
337
+ keyscale=keyscale,
338
+ timesignature=timesignature,
339
+ vocal_language=vocal_language,
340
+ thinking=bool(req["use_lm"] and llm_ready),
341
+ use_cot_metas=bool(req["use_lm"] and llm_ready),
342
+ use_cot_caption=bool(req["use_lm"] and llm_ready and not req["simple_prompt"]),
343
+ use_cot_language=bool(req["use_lm"] and llm_ready),
344
+ )
345
+
346
+ config = self._GenerationConfig(
347
+ batch_size=1,
348
+ allow_lm_batch=False,
349
+ use_random_seed=False,
350
+ seeds=[req["seed"]],
351
+ audio_format="wav",
352
+ )
353
+
354
+ extras["resolved_prompt"] = caption
355
+ extras["resolved_lyrics"] = lyrics
356
+ extras["resolved_duration"] = duration
357
 
358
+ return {"params": params, "config": config}, extras
359
+
360
+ def _call_model(self, req: Dict[str, Any]) -> Tuple[np.ndarray, int, Dict[str, Any]]:
361
+ if not self.model_loaded or self.dit_handler is None:
362
+ raise RuntimeError(self.model_error or "Model is not loaded")
363
+
364
+ llm_ready = False
365
+ if req["use_lm"]:
366
+ llm_ready = self._ensure_llm_initialized()
367
+
368
+ generation_inputs, extras = self._build_generation_inputs(req, llm_ready)
369
+
370
+ result = self._generate_music(
371
+ self.dit_handler,
372
+ self.llm_handler if llm_ready else None,
373
+ generation_inputs["params"],
374
+ generation_inputs["config"],
375
+ save_dir=None,
376
+ progress=None,
377
+ )
378
 
379
+ if not getattr(result, "success", False):
380
+ raise RuntimeError(getattr(result, "error", "generation failed"))
381
+
382
+ audios = getattr(result, "audios", None) or []
383
+ if not audios:
384
+ raise RuntimeError("generation succeeded but no audio was returned")
385
+
386
+ first = audios[0]
387
+ audio_tensor = first.get("tensor") if isinstance(first, dict) else None
388
+ if audio_tensor is None:
389
+ raise RuntimeError("generated audio tensor is missing")
390
+
391
+ sample_rate = int(first.get("sample_rate", req["sample_rate"]))
392
+ status_message = getattr(result, "status_message", "")
393
+
394
+ meta = {
395
+ "llm_requested": req["use_lm"],
396
+ "llm_initialized": llm_ready,
397
+ "llm_error": self.llm_error,
398
+ "status_message": status_message,
399
+ }
400
+ meta.update(extras)
401
+
402
+ return self._as_float32(audio_tensor), sample_rate, meta
403
+
404
+ # --------------------------
405
+ # Endpoint entry
406
+ # --------------------------
407
  def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
408
  try:
409
  req = self._normalize_request(data)
410
 
 
 
 
 
 
 
 
411
  used_fallback = False
412
+ runtime_meta: Dict[str, Any] = {}
413
+
414
+ try:
415
+ audio, out_sr, runtime_meta = self._call_model(req)
416
+ except Exception as model_exc:
417
+ self.model_error = f"Inference failed: {type(model_exc).__name__}: {model_exc}"
418
+ if not req["allow_fallback"]:
419
+ raise RuntimeError(self.model_error)
420
 
 
 
 
 
 
 
 
 
 
 
 
 
421
  used_fallback = True
422
  audio = self._fallback_sine(req["duration_sec"], req["sample_rate"], req["seed"])
423
  out_sr = req["sample_rate"]
 
427
  "sample_rate": int(out_sr),
428
  "duration_sec": int(req["duration_sec"]),
429
  "used_fallback": used_fallback,
430
+ "model_loaded": self.model_loaded,
431
  "model_repo": self.model_repo,
432
  "model_error": self.model_error,
433
  "meta": {
 
441
  "use_lm": req["use_lm"],
442
  "simple_prompt": req["simple_prompt"],
443
  "instrumental": req["instrumental"],
444
+ "allow_fallback": req["allow_fallback"],
445
+ "resolved_prompt": runtime_meta.get("resolved_prompt", req["prompt"]),
446
+ "resolved_lyrics": runtime_meta.get("resolved_lyrics", req["lyrics"]),
447
+ "simple_expansion_used": runtime_meta.get("simple_expansion_used", False),
448
+ "simple_expansion_error": runtime_meta.get("simple_expansion_error"),
449
+ "llm_requested": runtime_meta.get("llm_requested", False),
450
+ "llm_initialized": runtime_meta.get("llm_initialized", False),
451
+ "llm_error": runtime_meta.get("llm_error"),
452
+ "status_message": runtime_meta.get("status_message", ""),
453
+ "init_details": self.init_details,
454
  },
455
  }
456
 
457
  except Exception as e:
458
  return {
459
  "error": f"{type(e).__name__}: {e}",
460
+ "traceback": traceback.format_exc(limit=4),
461
  "audio_base64_wav": None,
462
  "sample_rate": None,
463
  "duration_sec": None,
464
+ "used_fallback": False,
465
+ "model_loaded": self.model_loaded,
466
+ "model_repo": self.model_repo,
467
+ "model_error": self.model_error,
468
+ "meta": {
469
+ "device": self.device,
470
+ "dtype": self.dtype,
471
+ "init_details": self.init_details,
472
+ "llm_error": self.llm_error,
473
+ },
474
+ }
requirements.txt CHANGED
@@ -2,7 +2,13 @@ numpy
2
  soundfile
3
  torch
4
  torchaudio
5
- transformers
6
  accelerate
7
  huggingface_hub
8
- git+https://github.com/ace-step/ACE-Step-1.5.git@f30aee4c186c33b7b8a6ea59a4b7fc36f795b49f
 
 
 
 
 
 
 
2
  soundfile
3
  torch
4
  torchaudio
5
+ transformers>=4.51.0,<4.58.0
6
  accelerate
7
  huggingface_hub
8
+ diffusers
9
+ loguru
10
+ tqdm
11
+ numba>=0.63.1
12
+ PyYAML
13
+ modelscope
14
+ filelock>=3.13.0