ChuxiJ committed
Commit 11860f1 · 1 Parent(s): 24f370e

add inference code and doc

acestep/api_server.py CHANGED
@@ -868,7 +868,7 @@ def create_app() -> FastAPI:
             if s in {"", "N/A"}:
                 return None
             return s
-        first, second, paths, gen_info, status_msg, seed_value, *_ = h.generate_music(
+        result = h.generate_music(
            captions=req.caption,
            lyrics=req.lyrics,
            bpm=bpm_val,
@@ -896,10 +896,20 @@ def create_app() -> FastAPI:
            use_tiled_decode=req.use_tiled_decode,
            progress=None,
        )
+
+        # Extract values from new dict structure
+        audios = result.get("audios", [])
+        audio_paths = [audio.get("path") for audio in audios]
+        first = audio_paths[0] if len(audio_paths) > 0 else None
+        second = audio_paths[1] if len(audio_paths) > 1 else None
+        gen_info = result.get("generation_info", "")
+        status_msg = result.get("status_message", "")
+        seed_value = result.get("extra_outputs", {}).get("seed_value", "")
+
        return {
            "first_audio_path": _path_to_audio_url(first) if first else None,
            "second_audio_path": _path_to_audio_url(second) if second else None,
-            "audio_paths": [_path_to_audio_url(p) for p in (paths or [])],
+            "audio_paths": [_path_to_audio_url(p) for p in (audio_paths or [])],
            "generation_info": gen_info,
            "status_message": status_msg,
            "seed_value": seed_value,
acestep/audio_utils.py ADDED
@@ -0,0 +1,396 @@
+"""
+Audio saving and transcoding utility module
+
+Independent audio file operations outside of handler, supporting:
+- Save audio tensor/numpy to files (default FLAC format, fast)
+- Format conversion (FLAC/WAV/MP3)
+- Batch processing
+"""
+
+import os
+import hashlib
+import json
+from pathlib import Path
+from typing import Union, Optional, List, Tuple
+import torch
+import numpy as np
+import torchaudio
+from loguru import logger
+
+
+class AudioSaver:
+    """Audio saving and transcoding utility class"""
+
+    def __init__(self, default_format: str = "flac"):
+        """
+        Initialize audio saver
+
+        Args:
+            default_format: Default save format ('flac', 'wav', 'mp3')
+        """
+        self.default_format = default_format.lower()
+        if self.default_format not in ["flac", "wav", "mp3"]:
+            logger.warning(f"Unsupported format {default_format}, using 'flac'")
+            self.default_format = "flac"
+
+    def save_audio(
+        self,
+        audio_data: Union[torch.Tensor, np.ndarray],
+        output_path: Union[str, Path],
+        sample_rate: int = 48000,
+        format: Optional[str] = None,
+        channels_first: bool = True,
+    ) -> str:
+        """
+        Save audio data to file
+
+        Args:
+            audio_data: Audio data, torch.Tensor [channels, samples] or numpy.ndarray
+            output_path: Output file path (extension can be omitted)
+            sample_rate: Sample rate
+            format: Audio format ('flac', 'wav', 'mp3'), defaults to default_format
+            channels_first: If True, tensor format is [channels, samples], else [samples, channels]
+
+        Returns:
+            Actual saved file path
+        """
+        format = (format or self.default_format).lower()
+        if format not in ["flac", "wav", "mp3"]:
+            logger.warning(f"Unsupported format {format}, using {self.default_format}")
+            format = self.default_format
+
+        # Ensure output path has correct extension
+        output_path = Path(output_path)
+        if output_path.suffix.lower() not in ['.flac', '.wav', '.mp3']:
+            output_path = output_path.with_suffix(f'.{format}')
+
+        # Convert to torch tensor
+        if isinstance(audio_data, np.ndarray):
+            if channels_first:
+                # numpy [samples, channels] -> tensor [channels, samples]
+                audio_tensor = torch.from_numpy(audio_data.T).float()
+            else:
+                # numpy [samples, channels] -> tensor [samples, channels] -> [channels, samples]
+                audio_tensor = torch.from_numpy(audio_data).float()
+                if audio_tensor.dim() == 2 and audio_tensor.shape[0] < audio_tensor.shape[1]:
+                    audio_tensor = audio_tensor.T
+        else:
+            # torch tensor
+            audio_tensor = audio_data.cpu().float()
+            if not channels_first and audio_tensor.dim() == 2:
+                # [samples, channels] -> [channels, samples]
+                if audio_tensor.shape[0] > audio_tensor.shape[1]:
+                    audio_tensor = audio_tensor.T
+
+        # Ensure memory is contiguous
+        audio_tensor = audio_tensor.contiguous()
+
+        # Select backend and save
+        try:
+            if format == "mp3":
+                # MP3 uses ffmpeg backend
+                from torchaudio.io import CodecConfig
+                config = CodecConfig(bit_rate=192000, compression_level=1)
+                torchaudio.save(
+                    str(output_path),
+                    audio_tensor,
+                    sample_rate,
+                    channels_first=True,
+                    backend='ffmpeg',
+                    compression=config,
+                    buffer_size=65536
+                )
+            elif format in ["flac", "wav"]:
+                # FLAC and WAV use soundfile backend (fastest)
+                torchaudio.save(
+                    str(output_path),
+                    audio_tensor,
+                    sample_rate,
+                    channels_first=True,
+                    backend='soundfile',
+                    buffer_size=65536
+                )
+            else:
+                # Other formats use default backend
+                torchaudio.save(
+                    str(output_path),
+                    audio_tensor,
+                    sample_rate,
+                    channels_first=True,
+                    buffer_size=65536
+                )
+
+            logger.debug(f"[AudioSaver] Saved audio to {output_path} ({format}, {sample_rate}Hz)")
+            return str(output_path)
+
+        except Exception as e:
+            logger.error(f"[AudioSaver] Failed to save audio: {e}")
+            raise
+
+    def convert_audio(
+        self,
+        input_path: Union[str, Path],
+        output_path: Union[str, Path],
+        output_format: str,
+        remove_input: bool = False,
+    ) -> str:
+        """
+        Convert audio format
+
+        Args:
+            input_path: Input audio file path
+            output_path: Output audio file path
+            output_format: Target format ('flac', 'wav', 'mp3')
+            remove_input: Whether to delete input file
+
+        Returns:
+            Output file path
+        """
+        input_path = Path(input_path)
+        output_path = Path(output_path)
+
+        if not input_path.exists():
+            raise FileNotFoundError(f"Input file not found: {input_path}")
+
+        # Load audio
+        audio_tensor, sample_rate = torchaudio.load(str(input_path))
+
+        # Save as new format
+        output_path = self.save_audio(
+            audio_tensor,
+            output_path,
+            sample_rate=sample_rate,
+            format=output_format,
+            channels_first=True
+        )
+
+        # Delete input file if needed
+        if remove_input:
+            input_path.unlink()
+            logger.debug(f"[AudioSaver] Removed input file: {input_path}")
+
+        return output_path
+
+    def save_batch(
+        self,
+        audio_batch: Union[List[torch.Tensor], torch.Tensor],
+        output_dir: Union[str, Path],
+        file_prefix: str = "audio",
+        sample_rate: int = 48000,
+        format: Optional[str] = None,
+        channels_first: bool = True,
+    ) -> List[str]:
+        """
+        Save audio batch
+
+        Args:
+            audio_batch: Audio batch, List[tensor] or tensor [batch, channels, samples]
+            output_dir: Output directory
+            file_prefix: File prefix
+            sample_rate: Sample rate
+            format: Audio format
+            channels_first: Tensor format flag
+
+        Returns:
+            List of saved file paths
+        """
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        # Process batch
+        if isinstance(audio_batch, torch.Tensor) and audio_batch.dim() == 3:
+            # [batch, channels, samples]
+            audio_list = [audio_batch[i] for i in range(audio_batch.shape[0])]
+        elif isinstance(audio_batch, list):
+            audio_list = audio_batch
+        else:
+            audio_list = [audio_batch]
+
+        saved_paths = []
+        for i, audio in enumerate(audio_list):
+            output_path = output_dir / f"{file_prefix}_{i:04d}"
+            saved_path = self.save_audio(
+                audio,
+                output_path,
+                sample_rate=sample_rate,
+                format=format,
+                channels_first=channels_first
+            )
+            saved_paths.append(saved_path)
+
+        return saved_paths
+
+
+def get_audio_file_hash(audio_file) -> str:
+    """
+    Get hash identifier for an audio file.
+
+    Args:
+        audio_file: Path to audio file (str) or file-like object
+
+    Returns:
+        Hash string or empty string
+    """
+    if audio_file is None:
+        return ""
+
+    try:
+        if isinstance(audio_file, str):
+            if os.path.exists(audio_file):
+                with open(audio_file, 'rb') as f:
+                    return hashlib.md5(f.read()).hexdigest()
+            return hashlib.md5(audio_file.encode('utf-8')).hexdigest()
+        elif hasattr(audio_file, 'name'):
+            return hashlib.md5(str(audio_file.name).encode('utf-8')).hexdigest()
+        return hashlib.md5(str(audio_file).encode('utf-8')).hexdigest()
+    except Exception:
+        return hashlib.md5(str(audio_file).encode('utf-8')).hexdigest()
+
+
+def generate_uuid_from_params(
+    captions: str,
+    lyrics: str,
+    bpm: Optional[int],
+    key_scale: str,
+    time_signature: str,
+    vocal_language: str,
+    inference_steps: int,
+    guidance_scale: float,
+    seed: Union[str, float, int],
+    audio_duration: Optional[float],
+    audio_code_string: Union[str, List[str]],
+    repainting_start: float,
+    repainting_end: Optional[float],
+    instruction: str,
+    audio_cover_strength: float,
+    task_type: str,
+    use_adg: bool,
+    cfg_interval_start: float,
+    cfg_interval_end: float,
+    audio_format: str,
+    reference_audio=None,
+    src_audio=None,
+    batch_index: int = 0,
+) -> str:
+    """
+    Generate deterministic UUID from generation parameters.
+    Same parameters will always generate the same UUID.
+
+    Args:
+        captions: Music caption
+        lyrics: Lyrics text
+        bpm: BPM value
+        key_scale: Musical key and scale
+        time_signature: Time signature
+        vocal_language: Vocal language code
+        inference_steps: Number of inference steps
+        guidance_scale: Guidance scale
+        seed: Random seed
+        audio_duration: Audio duration in seconds
+        audio_code_string: Audio code string or list
+        repainting_start: Repainting start time
+        repainting_end: Repainting end time
+        instruction: Task instruction
+        audio_cover_strength: Audio cover strength
+        task_type: Task type
+        use_adg: Whether to use ADG
+        cfg_interval_start: CFG interval start
+        cfg_interval_end: CFG interval end
+        audio_format: Audio format
+        reference_audio: Reference audio file path
+        src_audio: Source audio file path
+        batch_index: Index in batch (for audio_code_string list access)
+
+    Returns:
+        UUID string
+    """
+    params_dict = {
+        "captions": captions or "",
+        "lyrics": lyrics or "",
+        "bpm": bpm,
+        "key_scale": key_scale or "",
+        "time_signature": time_signature or "",
+        "vocal_language": vocal_language or "",
+        "inference_steps": inference_steps,
+        "guidance_scale": guidance_scale,
+        "seed": seed,
+        "audio_duration": audio_duration,
+        "audio_code_string": audio_code_string if isinstance(audio_code_string, str) else (audio_code_string[batch_index] if isinstance(audio_code_string, list) and batch_index < len(audio_code_string) else ""),
+        "repainting_start": repainting_start,
+        "repainting_end": repainting_end,
+        "instruction": instruction or "",
+        "audio_cover_strength": audio_cover_strength,
+        "task_type": task_type or "",
+        "use_adg": use_adg,
+        "cfg_interval_start": cfg_interval_start,
+        "cfg_interval_end": cfg_interval_end,
+        "audio_format": audio_format or "",
+        "reference_audio_hash": get_audio_file_hash(reference_audio),
+        "src_audio_hash": get_audio_file_hash(src_audio),
+    }
+
+    params_json = json.dumps(params_dict, sort_keys=True, ensure_ascii=False)
+    hash_obj = hashlib.sha256(params_json.encode('utf-8'))
+    hash_hex = hash_obj.hexdigest()
+    uuid_str = f"{hash_hex[0:8]}-{hash_hex[8:12]}-{hash_hex[12:16]}-{hash_hex[16:20]}-{hash_hex[20:32]}"
+    return uuid_str
+
+
+def generate_uuid_from_audio_data(
+    audio_data: Union[torch.Tensor, np.ndarray],
+    seed: Optional[int] = None
+) -> str:
+    """
+    Generate UUID from audio data (for caching/deduplication)
+
+    Args:
+        audio_data: Audio data
+        seed: Optional seed value
+
+    Returns:
+        UUID string
+    """
+    if isinstance(audio_data, torch.Tensor):
+        # Convert to numpy and calculate hash
+        audio_np = audio_data.cpu().numpy()
+    else:
+        audio_np = audio_data
+
+    # Calculate data hash
+    data_hash = hashlib.md5(audio_np.tobytes()).hexdigest()
+
+    if seed is not None:
+        combined = f"{data_hash}_{seed}"
+        return hashlib.md5(combined.encode()).hexdigest()
+
+    return data_hash
+
+
+# Global default instance
+_default_saver = AudioSaver(default_format="flac")
+
+
+def save_audio(
+    audio_data: Union[torch.Tensor, np.ndarray],
+    output_path: Union[str, Path],
+    sample_rate: int = 48000,
+    format: Optional[str] = None,
+    channels_first: bool = True,
+) -> str:
+    """
+    Convenience function: save audio (using default configuration)
+
+    Args:
+        audio_data: Audio data
+        output_path: Output path
+        sample_rate: Sample rate
+        format: Format (default flac)
+        channels_first: Tensor format flag
+
+    Returns:
+        Saved file path
+    """
+    return _default_saver.save_audio(
+        audio_data, output_path, sample_rate, format, channels_first
+    )
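A brief usage sketch for the new helpers in acestep/audio_utils.py, assuming torchaudio's soundfile backend is available (and an ffmpeg backend for MP3); the output paths are placeholders, not paths the repo produces:

```python
import torch
from acestep.audio_utils import AudioSaver, save_audio, generate_uuid_from_audio_data

# 2-channel, 1-second silent signal at 48 kHz, shaped [channels, samples]
audio = torch.zeros(2, 48000)

# Module-level convenience wrapper (defaults to FLAC, appends the extension)
flac_path = save_audio(audio, "/tmp/example_audio", sample_rate=48000)

# Explicit saver with a different default format, plus a format conversion
saver = AudioSaver(default_format="wav")
wav_path = saver.save_audio(audio, "/tmp/example_audio_wav", sample_rate=48000)
mp3_path = saver.convert_audio(wav_path, "/tmp/example_audio_mp3", output_format="mp3")

# Deterministic hash-based ID for caching/deduplication
uid = generate_uuid_from_audio_data(audio, seed=42)
print(flac_path, wav_path, mp3_path, uid)
```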
acestep/constrained_logits_processor.py CHANGED
@@ -571,6 +571,33 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
         if self.debug:
             logger.debug(f"Built audio code masks for {len(self.audio_code_token_ids)} tokens")
 
+    def _apply_whitelist_inplace(self, scores: torch.Tensor, allowed_tokens: List[int]) -> None:
+        """
+        Apply whitelist constraint inplace: only allow specified tokens, block all others.
+
+        This is more efficient than creating a mask tensor because:
+        1. No memory allocation for mask
+        2. No tensor addition operation
+
+        Args:
+            scores: [1, vocab_size] scores tensor to modify inplace
+            allowed_tokens: List of token IDs to allow (all others will be set to -inf)
+        """
+        if not allowed_tokens:
+            # No tokens allowed, set all to -inf
+            scores.fill_(float('-inf'))
+            return
+
+        # Save the original values of allowed tokens
+        allowed_indices = torch.tensor(allowed_tokens, device=scores.device, dtype=torch.long)
+        saved_values = scores[0, allowed_indices].clone()
+
+        # Set all scores to -inf
+        scores.fill_(float('-inf'))
+
+        # Restore allowed token values
+        scores[0, allowed_indices] = saved_values
+
     def _build_keyscale_prefix_tree(self) -> Dict[Tuple[int, ...], Set[int]]:
         """
         Build keyscale prefix to allowed tokens mapping based on ACTUAL tokenization.
@@ -1484,10 +1511,10 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
                 if self.debug:
                     logger.debug(f"Codes generation: {self.codes_count}/{self.target_codes}, blocking EOS")
             else:
-                # Force EOS token when target codes count is reached
-                mask = torch.full_like(scores, float('-inf'))
-                mask[:, self.eos_token_id] = 0
-                scores = scores + mask
+                # Force EOS token when target codes count is reached - inplace
+                eos_scores = scores[:, self.eos_token_id].clone()
+                scores.fill_(float('-inf'))
+                scores[:, self.eos_token_id] = eos_scores
                 if self.debug:
                     logger.debug(f"Codes generation: {self.codes_count}/{self.target_codes}, forcing EOS")
         return self._apply_temperature_scaling(scores)
@@ -1609,20 +1636,15 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
         input_ids: torch.LongTensor,
         scores: torch.FloatTensor,
     ) -> torch.FloatTensor:
-        """Process a single sequence and return modified scores."""
+        """Process a single sequence and return modified scores (inplace when possible)."""
 
         # Check if we have tokens in queue for user-provided field
         # If so, inject the next token directly
         if self.user_field_token_queue:
-            mask = torch.full_like(scores, float('-inf'))
             next_token = self.user_field_token_queue[0]
-            mask[0, next_token] = 0
-            scores = scores + mask
+            self._apply_whitelist_inplace(scores, [next_token])
            return scores
 
-        # Create mask (all -inf initially)
-        mask = torch.full_like(scores, float('-inf'))
-
         if self.state in self.fixed_strings:
             # Fixed string state: force specific tokens
             fixed_str = self.fixed_strings[self.state]
@@ -1633,28 +1655,18 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
                 # This happens when we're about to complete the </think> tag
                 if self.state == FSMState.THINK_END_TAG and self.stop_at_reasoning:
                     # Check if the next token would complete the fixed string
-                    # We check if position_in_state + length of next token would complete it
-                    # Since we don't know which token will be selected, we check if we're close to completion
-                    # Actually, a better approach: check if this is the last character(s) of the fixed string
                     remaining_chars = len(fixed_str) - self.position_in_state
                     # If remaining is small (<= 10 chars, which is typically 1-2 tokens), force EOS
                     if remaining_chars <= 10:
                         # Force EOS token to stop generation
                         if self.eos_token_id is not None:
-                            mask[0, self.eos_token_id] = 0
-                            scores = scores + mask
+                            self._apply_whitelist_inplace(scores, [self.eos_token_id])
                             if self.debug:
                                 logger.debug(f"stop_at_reasoning=True: forcing EOS near end of </think> tag (remaining: {remaining_chars} chars)")
                             return scores
 
-                for t in allowed:
-                    mask[0, t] = 0
-                # Apply mask
-                scores = scores + mask
-
-                # Update position tracking
-                # We need to check if the selected token completes the fixed string
-                # This will be done in update_state() after token selection
+                # Apply whitelist constraint inplace
+                self._apply_whitelist_inplace(scores, allowed)
             else:
                 # Position exceeds string, move to next state
                 # If stop_at_reasoning is True and we're transitioning from THINK_END_TAG,
@@ -1662,8 +1674,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
                 if self.state == FSMState.THINK_END_TAG and self.stop_at_reasoning:
                     # Force EOS token to stop generation
                     if self.eos_token_id is not None:
-                        mask[0, self.eos_token_id] = 0
-                        scores = scores + mask
+                        self._apply_whitelist_inplace(scores, [self.eos_token_id])
                         if self.debug:
                             logger.debug(f"stop_at_reasoning=True: forcing EOS after completing </think> tag")
                         return scores
@@ -1676,7 +1687,9 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
                     if self.debug:
                         logger.warning(f"State transition from {old_state.name} to {self.state.name} still in fixed_strings, avoiding recursion")
                     return scores
-                return self._process_single_sequence(input_ids, torch.zeros_like(scores))
+                # For recursion, reset scores to zero (no constraints from previous state)
+                scores.zero_()
+                return self._process_single_sequence(input_ids, scores)
 
         elif self.state == FSMState.BPM_VALUE:
             # Check if field is user-provided and we haven't started injecting yet
@@ -1690,22 +1703,18 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
                 self.user_field_token_queue = value_tokens
                 self.current_user_field = "bpm"
                 # Inject first token
-                mask[0, value_tokens[0]] = 0
-                scores = scores + mask
+                self._apply_whitelist_inplace(scores, [value_tokens[0]])
                 return scores
 
             # Allow valid numeric tokens using prefix tree (supports multi-digit tokens like "120")
             allowed = self._get_allowed_numeric_tokens(self.bpm_prefix_tree)
-            for t in allowed:
-                mask[0, t] = 0
 
             # Also allow newline if current token sequence prefix allows it
-            # Check if current token sequence is in prefix tree and allows newline
             token_prefix = tuple(self.accumulated_token_ids)
             if token_prefix in self.bpm_prefix_tree and self.newline_token in self.bpm_prefix_tree[token_prefix]:
-                mask[0, self.newline_token] = 0
+                allowed = allowed + [self.newline_token]
 
-            scores = scores + mask
+            self._apply_whitelist_inplace(scores, allowed)
 
         elif self.state == FSMState.CAPTION_VALUE:
             # Caption field generation with YAML format support:
@@ -1724,8 +1733,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
                 self.user_field_token_queue = value_tokens
                 self.current_user_field = "caption"
                 # Inject first token
-                mask[0, value_tokens[0]] = 0
-                scores = scores + mask
+                self._apply_whitelist_inplace(scores, [value_tokens[0]])
                 return scores
 
            # Check if we should transition after a newline (non-indented line = new field)
@@ -1757,7 +1765,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
                # The field name detection will happen in update_state()
                return scores
 
-            # Block backticks (code blocks)
+            # Block backticks (code blocks) - inplace
            if self.backtick_token is not None:
                scores[0, self.backtick_token] = float('-inf')
 
@@ -1773,8 +1781,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
            if self.caption_token_count >= 512:
                # Force end by only allowing newline
                if self.newline_token is not None:
-                    mask[0, self.newline_token] = 0
-                    scores = scores + mask
+                    self._apply_whitelist_inplace(scores, [self.newline_token])
                return scores
 
            # Allow natural generation (with blocked audio codes and backticks)
@@ -1791,8 +1798,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
                self.user_field_token_queue = value_tokens
                self.current_user_field = "duration"
                # Inject first token
-                mask[0, value_tokens[0]] = 0
-                scores = scores + mask
+                self._apply_whitelist_inplace(scores, [value_tokens[0]])
                return scores
 
            # If target_duration is set, force generate that exact value
@@ -1804,26 +1810,22 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
                    # Force the next digit
                    next_digit = int(target_str[current_pos])
                    if next_digit in self.digit_tokens:
-                        mask[0, self.digit_tokens[next_digit]] = 0
+                        self._apply_whitelist_inplace(scores, [self.digit_tokens[next_digit]])
                else:
                    # All digits generated, force newline
                    if self.newline_token:
-                        mask[0, self.newline_token] = 0
-
-                scores = scores + mask
+                        self._apply_whitelist_inplace(scores, [self.newline_token])
            else:
                # Normal duration generation with range constraint
                # Allow valid numeric tokens using prefix tree (supports multi-digit tokens like "60", "120")
                allowed = self._get_allowed_numeric_tokens(self.duration_prefix_tree)
-                for t in allowed:
-                    mask[0, t] = 0
 
                # Also allow newline if current token sequence prefix allows it
                token_prefix = tuple(self.accumulated_token_ids)
                if token_prefix in self.duration_prefix_tree and self.newline_token in self.duration_prefix_tree[token_prefix]:
-                    mask[0, self.newline_token] = 0
+                    allowed = allowed + [self.newline_token]
 
-                scores = scores + mask
+                self._apply_whitelist_inplace(scores, allowed)
 
        elif self.state == FSMState.GENRES_VALUE:
            # Check if field is user-provided and we haven't started injecting yet
@@ -1836,8 +1838,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
                self.user_field_token_queue = value_tokens
                self.current_user_field = "genres"
                # Inject first token
-                mask[0, value_tokens[0]] = 0
-                scores = scores + mask
+                self._apply_whitelist_inplace(scores, [value_tokens[0]])
                return scores
 
            # Try to hot-reload genres vocab if file has changed
@@ -1848,24 +1849,20 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
 
            if allowed:
                # Use vocabulary-constrained decoding
-                for t in allowed:
-                    mask[0, t] = 0
-                scores = scores + mask
+                self._apply_whitelist_inplace(scores, allowed)
            elif self.genres_vocab:
                # Vocab is loaded but no valid continuation found
                # Force newline to end the field
                if self.newline_token:
-                    mask[0, self.newline_token] = 0
                    if self.debug:
                        logger.debug(f"No valid genre continuation for '{self.accumulated_value}', forcing newline")
-                scores = scores + mask
+                    self._apply_whitelist_inplace(scores, [self.newline_token])
            else:
                # Fallback: no vocab loaded, use probability-based ending
                if self._should_end_text_field(scores):
                    if self.newline_token:
-                        mask[0, self.newline_token] = 0
+                        self._apply_whitelist_inplace(scores, [self.newline_token])
                    self._transition_to_next_state()
-                scores = scores + mask
                else:
                    # Allow any token except newline if we don't have content yet
                    if not self.accumulated_value.strip():
@@ -1884,8 +1881,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
                self.user_field_token_queue = value_tokens
                self.current_user_field = "keyscale"
                # Inject first token
-                mask[0, value_tokens[0]] = 0
-                scores = scores + mask
+                self._apply_whitelist_inplace(scores, [value_tokens[0]])
                return scores
 
            # Check if current token sequence is complete (allows newline)
@@ -1893,21 +1889,17 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
            if token_prefix in self.keyscale_prefix_tree and self.newline_token in self.keyscale_prefix_tree[token_prefix]:
                # Complete keyscale, allow newline
                if self.newline_token:
-                    mask[0, self.newline_token] = 0
-                    scores = scores + mask
+                    self._apply_whitelist_inplace(scores, [self.newline_token])
            else:
                # Not complete, allow valid continuation tokens
                allowed = self._get_allowed_keyscale_tokens()
                if allowed:
-                    for t in allowed:
-                        mask[0, t] = 0
-                    scores = scores + mask
+                    self._apply_whitelist_inplace(scores, allowed)
                else:
                    # No valid tokens found - force newline to end field
                    # This handles edge cases where keyscale format is unexpected
                    if self.newline_token:
-                        mask[0, self.newline_token] = 0
-                        scores = scores + mask
+                        self._apply_whitelist_inplace(scores, [self.newline_token])
 
        elif self.state == FSMState.LANGUAGE_VALUE:
            # Language field: Use top-1 probability language (greedy selection)
@@ -1925,8 +1917,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
                self.user_field_token_queue = value_tokens
                self.current_user_field = "language"
                # Inject first token
-                mask[0, value_tokens[0]] = 0
-                scores = scores + mask
+                self._apply_whitelist_inplace(scores, [value_tokens[0]])
                return scores
 
            # If we haven't started generating language yet (empty accumulated_token_ids),
@@ -1938,19 +1929,17 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
                    candidate_tokens = list(self.language_prefix_tree[empty_prefix])
 
                    if candidate_tokens:
-                        # Find the token with highest probability (top-1)
-                        # Create a mask that blocks all tokens except candidates
-                        temp_mask = torch.full_like(scores, float('-inf'))
-                        for t in candidate_tokens:
-                            temp_mask[0, t] = 0
-                        temp_scores = scores + temp_mask
+                        # Find the token with highest probability (top-1) among candidates
+                        # Use tensor indexing to get scores of candidate tokens directly
+                        candidate_indices = torch.tensor(candidate_tokens, device=scores.device, dtype=torch.long)
+                        candidate_scores = scores[0, candidate_indices]
 
                        # Get the highest probability token among candidates
-                        top_token_id = torch.argmax(temp_scores[0]).item()
+                        best_idx = torch.argmax(candidate_scores).item()
+                        top_token_id = candidate_tokens[best_idx]
 
-                        # Only allow this top-1 token, block all others (including other language tokens)
-                        mask[0, top_token_id] = 0
-                        scores = scores + mask
+                        # Only allow this top-1 token, block all others
+                        self._apply_whitelist_inplace(scores, [top_token_id])
 
                        if self.debug:
                            top_token_text = self.tokenizer.decode([top_token_id])
@@ -1958,13 +1947,11 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
                    else:
                        # No valid first tokens found - force newline
                        if self.newline_token:
-                            mask[0, self.newline_token] = 0
-                            scores = scores + mask
+                            self._apply_whitelist_inplace(scores, [self.newline_token])
                else:
                    # Empty prefix not in tree - force newline
                    if self.newline_token:
-                        mask[0, self.newline_token] = 0
-                        scores = scores + mask
+                        self._apply_whitelist_inplace(scores, [self.newline_token])
            else:
                # We've started generating a language, continue with prefix tree constraints
                # Check if current token sequence is complete (allows newline)
@@ -1972,20 +1959,16 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
                if token_prefix in self.language_prefix_tree and self.newline_token in self.language_prefix_tree[token_prefix]:
                    # Complete language, allow newline
                    if self.newline_token:
-                        mask[0, self.newline_token] = 0
-                        scores = scores + mask
+                        self._apply_whitelist_inplace(scores, [self.newline_token])
                else:
                    # Not complete, allow valid continuation tokens
                    allowed = self._get_allowed_language_tokens()
                    if allowed:
-                        for t in allowed:
-                            mask[0, t] = 0
-                        scores = scores + mask
+                        self._apply_whitelist_inplace(scores, allowed)
                    else:
                        # No valid tokens found - force newline to end field
                        if self.newline_token:
-                            mask[0, self.newline_token] = 0
-                            scores = scores + mask
+                            self._apply_whitelist_inplace(scores, [self.newline_token])
 
        elif self.state == FSMState.TIMESIG_VALUE:
            # Check if field is user-provided and we haven't started injecting yet
@@ -1998,8 +1981,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
                self.user_field_token_queue = value_tokens
                self.current_user_field = "timesignature"
                # Inject first token
-                mask[0, value_tokens[0]] = 0
-                scores = scores + mask
+                self._apply_whitelist_inplace(scores, [value_tokens[0]])
                return scores
 
            # Check if current token sequence is complete (allows newline)
@@ -2007,14 +1989,11 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
            if token_prefix in self.timesig_prefix_tree and self.newline_token in self.timesig_prefix_tree[token_prefix]:
                # Complete value, allow newline
                if self.newline_token:
-                    mask[0, self.newline_token] = 0
-                    scores = scores + mask
+                    self._apply_whitelist_inplace(scores, [self.newline_token])
            else:
                # Not complete, allow valid continuation tokens
                allowed = self._get_allowed_timesig_tokens()
-                for t in allowed:
-                    mask[0, t] = 0
-                scores = scores + mask
+                self._apply_whitelist_inplace(scores, allowed)
 
        return scores
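The core of this change is replacing per-step `-inf` mask tensors with an inplace whitelist. A standalone sketch of the pattern, detached from the class and using a dummy vocabulary size:

```python
import torch

def apply_whitelist_inplace(scores: torch.Tensor, allowed_tokens: list) -> None:
    """Keep only the allowed tokens' logits in a [1, vocab_size] tensor; set the rest to -inf."""
    if not allowed_tokens:
        scores.fill_(float('-inf'))   # nothing allowed
        return
    idx = torch.tensor(allowed_tokens, device=scores.device, dtype=torch.long)
    saved = scores[0, idx].clone()    # remember original logits of allowed tokens
    scores.fill_(float('-inf'))       # block everything
    scores[0, idx] = saved            # restore the whitelist

scores = torch.randn(1, 10)
apply_whitelist_inplace(scores, [2, 5])
print(scores)  # only positions 2 and 5 remain finite
```

Compared with building a `torch.full_like(scores, -inf)` mask and adding it, this avoids one vocab-sized allocation and one tensor addition per decoding step.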
 
acestep/gradio_ui/event.py DELETED
The diff for this file is too large to render. See raw diff
 
acestep/gradio_ui/events/results_handlers.py CHANGED
@@ -332,10 +332,9 @@ def generate_with_progress(
        logger.info(f"Generating LM batch chunk {chunk_idx+1}/{num_chunks} (size: {chunk_size}, seeds: {chunk_seeds})...")
 
        # Generate batch
-        metadata_list, audio_codes_list, status = llm_handler.generate_with_stop_condition_batch(
+        metadata_list, audio_codes_list, status = llm_handler.generate_with_stop_condition(
            caption=captions or "",
            lyrics=lyrics or "",
-            batch_size=chunk_size,
            infer_type="llm_dit",
            temperature=lm_temperature,
            cfg_scale=lm_cfg_scale,
@@ -347,6 +346,7 @@
            use_cot_language=use_cot_language,
            is_format_caption=is_format_caption,
            constrained_decoding_debug=constrained_decoding_debug,
+            batch_size=chunk_size,
            seeds=chunk_seeds,
        )
 
@@ -474,9 +474,26 @@
        progress=progress
    )
 
-    # Extract results
-    first_audio, second_audio, all_audio_paths, generation_info, status_message, seed_value_for_ui, \
-        align_score_1, align_text_1, align_plot_1, align_score_2, align_text_2, align_plot_2 = result
+    # Extract results from new dict structure
+    if not isinstance(result, dict):
+        # Fallback for old tuple format (should not happen)
+        first_audio, second_audio, all_audio_paths, generation_info, status_message, seed_value_for_ui, \
+            align_score_1, align_text_1, align_plot_1, align_score_2, align_text_2, align_plot_2 = result
+    else:
+        audios = result.get("audios", [])
+        all_audio_paths = [audio.get("path") for audio in audios]
+        first_audio = all_audio_paths[0] if len(all_audio_paths) > 0 else None
+        second_audio = all_audio_paths[1] if len(all_audio_paths) > 1 else None
+        generation_info = result.get("generation_info", "")
+        status_message = result.get("status_message", "")
+        seed_value_for_ui = result.get("extra_outputs", {}).get("seed_value", "")
+        # Legacy alignment fields (no longer used)
+        align_score_1 = ""
+        align_text_1 = ""
+        align_plot_1 = None
+        align_score_2 = ""
+        align_text_2 = ""
+        align_plot_2 = None
 
    # Extract LM timing from status if available and prepend to generation_info
    if status:
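Both this UI handler and the API server above now consume the same dictionary returned by generate_music(). A minimal sketch of that structure, inferred only from the keys read in these hunks ("audios", "generation_info", "status_message", "extra_outputs"); the concrete values are placeholders, not real handler output:

```python
# Hypothetical shape of the new generate_music() return value.
result = {
    "audios": [
        {"path": "/tmp/out_0000.flac"},
        {"path": "/tmp/out_0001.flac"},
    ],
    "generation_info": "2 audios generated",
    "status_message": "success",
    "extra_outputs": {"seed_value": "1234, 5678"},
}

audio_paths = [a.get("path") for a in result.get("audios", [])]
first = audio_paths[0] if audio_paths else None
second = audio_paths[1] if len(audio_paths) > 1 else None
seed_value = result.get("extra_outputs", {}).get("seed_value", "")
print(first, second, seed_value)
```
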
acestep/handler.py CHANGED
@@ -10,6 +10,8 @@ import traceback
 import re
 import random
 import uuid
 from contextlib import contextmanager
 from typing import Optional, Dict, Any, Tuple, List, Union
 
@@ -37,16 +39,12 @@ warnings.filterwarnings("ignore")
 class AceStepHandler:
     """ACE-Step Business Logic Handler"""
 
-    def __init__(self, save_root = None):
         self.model = None
         self.config = None
         self.device = "cpu"
         self.dtype = torch.float32  # Will be set based on device in initialize_service
-        if save_root is None:
-            self.temp_dir = tempfile.mkdtemp()
-        else:
-            self.temp_dir = save_root
-
         # VAE for audio encoding/decoding
         self.vae = None
 
@@ -81,8 +79,7 @@
     def get_available_checkpoints(self) -> str:
         """Return project root directory path"""
         # Get project root (handler.py is in acestep/, so go up two levels to project root)
-        current_file = os.path.abspath(__file__)
-        project_root = os.path.dirname(os.path.dirname(current_file))
         # default checkpoints
         checkpoint_dir = os.path.join(project_root, "checkpoints")
         if os.path.exists(checkpoint_dir):
@@ -93,8 +90,7 @@
     def get_available_acestep_v15_models(self) -> List[str]:
         """Scan and return all model directory names starting with 'acestep-v15-'"""
         # Get project root
-        current_file = os.path.abspath(__file__)
-        project_root = os.path.dirname(os.path.dirname(current_file))
         checkpoint_dir = os.path.join(project_root, "checkpoints")
 
         models = []
@@ -171,8 +167,7 @@
 
 
         # Auto-detect project root (independent of passed project_root parameter)
-        current_file = os.path.abspath(__file__)
-        actual_project_root = os.path.dirname(os.path.dirname(current_file))
         checkpoint_dir = os.path.join(actual_project_root, "checkpoints")
 
         # 1. Load main model
@@ -187,7 +182,7 @@
             attn_implementation = "sdpa"
 
         try:
-            logger.info(f"Attempting to load model with attention implementation: {attn_implementation}")
             self.model = AutoModel.from_pretrained(
                 acestep_v15_checkpoint_path,
                 trust_remote_code=True,
@@ -195,9 +190,9 @@
                 dtype="bfloat16"
             )
         except Exception as e:
-            logger.warning(f"Failed to load model with {attn_implementation}: {e}")
             if attn_implementation == "sdpa":
-                logger.info("Falling back to eager attention")
                 attn_implementation = "eager"
                 self.model = AutoModel.from_pretrained(
                     acestep_v15_checkpoint_path,
@@ -215,7 +210,7 @@
         else:
             # If offload_to_cpu is True, check if we should keep DiT on GPU
             if not self.offload_dit_to_cpu:
-                logger.info(f"Keeping main model on {device} (persistent)")
                 self.model = self.model.to(device).to(self.dtype)
             else:
                 self.model = self.model.to("cpu").to(self.dtype)
@@ -239,7 +234,7 @@
                 raise ValueError(f"Unsupported quantization type: {self.quantization}")
 
             quantize_(self.model, quant_config)
-            logger.info(f"DiT quantized with: {self.quantization}")
 
 
         silence_latent_path = os.path.join(acestep_v15_checkpoint_path, "silence_latent.pt")
@@ -260,7 +255,7 @@
         if os.path.exists(vae_checkpoint_path):
             self.vae = AutoencoderOobleck.from_pretrained(vae_checkpoint_path)
             # Use bfloat16 for VAE on GPU, otherwise use self.dtype (float32 on CPU)
-            vae_dtype = torch.bfloat16 if device in ["cuda", "xpu"] else self.dtype
             if not self.offload_to_cpu:
                 self.vae = self.vae.to(device).to(vae_dtype)
             else:
@@ -302,6 +297,7 @@
 
         except Exception as e:
             error_msg = f"❌ Error initializing model: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
             return error_msg, False
 
     @contextmanager
@@ -326,7 +322,7 @@
             try:
                 param = next(model.parameters())
                 if param.device.type == "cpu":
-                    logger.info(f"Moving {model_name} to {self.device} (persistent)")
                     model.to(self.device).to(self.dtype)
                     if hasattr(self, "silence_latent"):
                         self.silence_latent = self.silence_latent.to(self.device).to(self.dtype)
@@ -341,10 +337,10 @@
             return
 
         # Load to GPU
-        logger.info(f"Loading {model_name} to {self.device}")
         start_time = time.time()
         if model_name == "vae":
-            vae_dtype = torch.bfloat16 if self.device in ["cuda", "xpu"] else self.dtype
             model.to(self.device).to(vae_dtype)
         else:
             model.to(self.device).to(self.dtype)
@@ -354,13 +350,13 @@
 
         load_time = time.time() - start_time
         self.current_offload_cost += load_time
-        logger.info(f"Loaded {model_name} to {self.device} in {load_time:.4f}s")
 
         try:
             yield
         finally:
             # Offload to CPU
-            logger.info(f"Offloading {model_name} to CPU")
             start_time = time.time()
             model.to("cpu")
 
@@ -370,7 +366,7 @@
                 torch.cuda.empty_cache()
             offload_time = time.time() - start_time
             self.current_offload_cost += offload_time
-            logger.info(f"Offloaded {model_name} to CPU in {offload_time:.4f}s")
 
     def process_target_audio(self, audio_file) -> Optional[torch.Tensor]:
         """Process target audio"""
@@ -386,23 +382,12 @@
             else:
                 audio = torch.from_numpy(audio_np.T)
 
-            if audio.shape[0] == 1:
-                audio = torch.cat([audio, audio], dim=0)
-
-            audio = audio[:2]
-
-            # Resample if needed
-            if sr != 48000:
-                import torch.nn.functional as F
-                ratio = 48000 / sr
-                new_length = int(audio.shape[-1] * ratio)
-                audio = F.interpolate(audio.unsqueeze(0), size=new_length, mode='linear', align_corners=False).squeeze(0)
-
-            audio = torch.clamp(audio, -1.0, 1.0)
 
             return audio
         except Exception as e:
-            logger.error(f"Error processing target audio: {e}")
             return None
 
     def _parse_audio_code_string(self, code_str: str) -> List[int]:
@@ -411,7 +396,8 @@
             return []
         try:
             return [int(x) for x in re.findall(r"<\|audio_code_(\d+)\|>", code_str)]
-        except Exception:
             return []
 
     def _decode_audio_codes_to_latents(self, code_str: str) -> Optional[torch.Tensor]:
@@ -538,9 +524,7 @@
         )
         """
         # Align instruction formatting with _prepare_batch
-        final_instruction = instruction or DEFAULT_DIT_INSTRUCTION
-        if not final_instruction.endswith(":"):
-            final_instruction = final_instruction + ":"
 
         # Extract caption and language from metas if available (from LM CoT output)
         # Fallback to user-provided values if not in metas
@@ -571,7 +555,7 @@
 
         parsed_meta = self._parse_metas([metas])[0]
         caption_input = SFT_GEN_PROMPT.format(final_instruction, actual_caption, parsed_meta)
-        lyrics_input = f"# Languages\n{actual_language}\n\n# Lyric\n{lyrics}<|endoftext|>"
         return caption_input, lyrics_input
 
     def _get_text_hidden_states(self, text_prompt: str) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -614,7 +598,7 @@
                 return match.group(1).strip()
             return caption
         except Exception as e:
-            logger.error(f"Error extracting caption: {e}")
             return caption
 
     def prepare_seeds(self, actual_batch_size, seed, use_random_seed):
@@ -638,7 +622,8 @@
                 else:
                     try:
                         seed_list.append(int(float(s)))
-                    except (ValueError, TypeError):
                         seed_list.append(-1)
         elif seed is None or (isinstance(seed, (int, float)) and seed < 0):
             # If seed is None or negative, use -1 for all items
@@ -679,7 +664,176 @@
         return actual_seed_list, seed_value_for_ui
 
     def prepare_metadata(self, bpm, key_scale, time_signature):
-        # Build metadata dict - use "N/A" as default for empty fields
         metadata_dict = {}
         if bpm:
             metadata_dict["bpm"] = bpm
@@ -695,10 +849,12 @@
             metadata_dict["timesignature"] = time_signature
         else:
             metadata_dict["timesignature"] = "N/A"
         return metadata_dict
-
-    def is_silence(self, audio):
-        return torch.all(audio.abs() < 1e-6)
 
     def generate_instruction(
         self,
@@ -745,23 +901,12 @@
             # Load audio file
             audio, sr = torchaudio.load(audio_file)
 
-            logger.info(f"Reference audio shape: {audio.shape}")
-            logger.info(f"Reference audio sample rate: {sr}")
-            logger.info(f"Reference audio duration: {audio.shape[-1] / 48000.0} seconds")
-
-            # Convert to stereo (duplicate channel if mono)
-            if audio.shape[0] == 1:
-                audio = torch.cat([audio, audio], dim=0)
 
-            # Keep only first 2 channels
-            audio = audio[:2]
-
-            # Resample to 48kHz if needed
-            if sr != 48000:
-                audio = torchaudio.transforms.Resample(sr, 48000)(audio)
-
-            # Clamp values to [-1.0, 1.0]
-            audio = torch.clamp(audio, -1.0, 1.0)
 
             is_silence = self.is_silence(audio)
             if is_silence:
@@ -800,7 +945,7 @@
             return audio
 
         except Exception as e:
-            logger.error(f"Error processing reference audio: {e}")
             return None
 
     def process_src_audio(self, audio_file) -> Optional[torch.Tensor]:
@@ -811,24 +956,13 @@
             # Load audio file
             audio, sr = torchaudio.load(audio_file)
 
-            # Convert to stereo (duplicate channel if mono)
-            if audio.shape[0] == 1:
-                audio = torch.cat([audio, audio], dim=0)
-
-            # Keep only first 2 channels
-            audio = audio[:2]
-
-            # Resample to 48kHz if needed
-            if sr != 48000:
-                audio = torchaudio.transforms.Resample(sr, 48000)(audio)
-
-            # Clamp values to [-1.0, 1.0]
-            audio = torch.clamp(audio, -1.0, 1.0)
 
             return audio
 
         except Exception as e:
-            logger.error(f"Error processing target audio: {e}")
             return None
 
     def convert_src_audio_to_codes(self, audio_file) -> str:
@@ -856,19 +990,12 @@
             # Encode audio to latents using VAE
             with torch.no_grad():
                 with self._load_model_context("vae"):
-                    # Prepare audio for VAE: [channels, samples] -> [1, channels, samples]
-                    vae_input = processed_audio.unsqueeze(0).to(self.device).to(self.vae.dtype)
-
                     # Check if audio is silence
-                    if self.is_silence(vae_input):
                         return "❌ Audio file appears to be silent"
 
-                    # Encode to latents
-                    latents = self.vae.encode(vae_input).latent_dist.sample()
-                    # Cast back to model dtype
-                    latents = latents.to(self.dtype)
-                    # Transpose: [1, d, T] -> [1, T, d] -> [T, d]
-                    latents = latents.squeeze(0).transpose(0, 1)  # [T, d]
 
                     # Create attention mask for latents
                     attention_mask = torch.ones(latents.shape[0], dtype=torch.bool, device=self.device)
@@ -893,7 +1020,7 @@
 
         except Exception as e:
             error_msg = f"❌ Error converting audio to codes: {str(e)}\n{traceback.format_exc()}"
-            logger.error(error_msg)
             return error_msg
 
     def prepare_batch_data(
@@ -922,26 +1049,7 @@
             calculated_duration = audio_duration
 
         # Build metadata dict - use "N/A" as default for empty fields
-        metadata_dict = {}
-        if bpm:
-            metadata_dict["bpm"] = bpm
-        else:
-            metadata_dict["bpm"] = "N/A"
-
-        if key_scale.strip():
-            metadata_dict["keyscale"] = key_scale
-        else:
-            metadata_dict["keyscale"] = "N/A"
-
-        if time_signature.strip() and time_signature != "N/A" and time_signature:
-            metadata_dict["timesignature"] = time_signature
-        else:
-            metadata_dict["timesignature"] = "N/A"
-
-        # Add duration to metadata if available (inference service format: "30 seconds")
-        if calculated_duration is not None:
-            metadata_dict["duration"] = f"{int(calculated_duration)} seconds"
-        # If duration not set, inference service will use default (30 seconds)
 
         # Format metadata - inference service accepts dict and will convert to string
         # Create a copy for each batch item (in case we modify it)
@@ -977,7 +1085,7 @@
                 target_wavs = torch.zeros(2, frames)
             return target_wavs
         except Exception as e:
-            logger.error(f"Error creating target audio: {e}")
            # Fallback to 30 seconds if error
            return torch.zeros(2, 30 * 48000)
 
@@ -1158,16 +1266,8 @@
         """
         batch_size = len(captions)
 
-        # Ensure audio_code_hints is a list of the correct length
-        if audio_code_hints is None:
-            audio_code_hints = [None] * batch_size
-        elif len(audio_code_hints) != batch_size:
-            if len(audio_code_hints) == 1:
-                audio_code_hints = audio_code_hints * batch_size
-            else:
-                audio_code_hints = audio_code_hints[:batch_size]
-                while len(audio_code_hints) < batch_size:
-                    audio_code_hints.append(None)
 
         for ii, refer_audio_list in enumerate(refer_audios):
             if isinstance(refer_audio_list, list):
@@ -1179,17 +1279,6 @@
         if vocal_languages is None:
             vocal_languages = self._create_fallback_vocal_languages(batch_size)
 
-        # Normalize audio_code_hints to batch list
-        if audio_code_hints is None:
-            audio_code_hints = [None] * batch_size
-        elif not isinstance(audio_code_hints, list):
-            audio_code_hints = [audio_code_hints] * batch_size
-        elif len(audio_code_hints) == 1 and batch_size > 1:
-            audio_code_hints = audio_code_hints * batch_size
-        else:
-            audio_code_hints = (audio_code_hints + [None] * batch_size)[:batch_size]
-        audio_code_hints = [hint if isinstance(hint, str) and hint.strip() else None for hint in audio_code_hints]
-
         # Parse metas with fallbacks
         parsed_metas = self._parse_metas(metas)
@@ -1223,13 +1312,9 @@
                 expected_latent_length = current_wav.shape[-1] // 1920
                 target_latent = self.silence_latent[0, :expected_latent_length, :]
             else:
-                # Ensure input is in VAE's dtype
                 logger.info(f"[generate_music] Encoding target audio to latents for item {i}...")
-                vae_input = current_wav.to(self.device).to(self.vae.dtype)
-                target_latent = self.vae.encode(vae_input).latent_dist.sample()
-                # Cast back to model dtype
-                target_latent = target_latent.to(self.dtype)
-                target_latent = target_latent.squeeze(0).transpose(0, 1)
             target_latents_list.append(target_latent)
             latent_lengths.append(target_latent.shape[0])
1235
 
@@ -1268,18 +1353,7 @@ class AceStepHandler:
1268
 
1269
  # Process instructions early so we can use them for task type detection
1270
  # Use custom instructions if provided, otherwise use default
1271
- if instructions is None:
1272
- instructions = [DEFAULT_DIT_INSTRUCTION] * batch_size
1273
-
1274
- # Ensure instructions list has the same length as batch_size
1275
- if len(instructions) != batch_size:
1276
- if len(instructions) == 1:
1277
- instructions = instructions * batch_size
1278
- else:
1279
- # Pad or truncate to match batch_size
1280
- instructions = instructions[:batch_size]
1281
- while len(instructions) < batch_size:
1282
- instructions.append(DEFAULT_DIT_INSTRUCTION)
1283
 
1284
  # Generate chunk_masks and spans based on repainting parameters
1285
  # Also determine if this is a cover task (target audio provided without repainting)
@@ -1428,6 +1502,10 @@ class AceStepHandler:
1428
  else:
1429
  precomputed_lm_hints_25Hz = None
1430
 
 
 
 
 
1431
  # Format text_inputs
1432
  text_inputs = []
1433
  text_token_idss = []
@@ -1437,26 +1515,10 @@ class AceStepHandler:
1437
 
1438
  for i in range(batch_size):
1439
  # Use custom instruction for this batch item
1440
- instruction = instructions[i] if i < len(instructions) else DEFAULT_DIT_INSTRUCTION
1441
- # Ensure instruction ends with ":"
1442
- if not instruction.endswith(":"):
1443
- instruction = instruction + ":"
1444
-
1445
- # Extract caption and language from metas if available (from LM CoT output)
1446
- # Fallback to user-provided values if not in metas
1447
- actual_caption = captions[i]
1448
- actual_language = vocal_languages[i]
1449
-
1450
- # Check if metas contains caption/language from LM CoT
1451
- if i < len(parsed_metas) and parsed_metas[i]:
1452
- meta_dict = parsed_metas[i]
1453
- if isinstance(meta_dict, dict):
1454
- # Extract caption from metas if available
1455
- if 'caption' in meta_dict and meta_dict['caption']:
1456
- actual_caption = str(meta_dict['caption'])
1457
- # Extract language from metas if available
1458
- if 'language' in meta_dict and meta_dict['language']:
1459
- actual_language = str(meta_dict['language'])
1460
 
1461
  # Format text prompt with custom instruction (using LM-generated caption if available)
1462
  text_prompt = SFT_GEN_PROMPT.format(instruction, actual_caption, parsed_metas[i])
@@ -1473,7 +1535,7 @@ class AceStepHandler:
1473
  text_attention_mask = text_inputs_dict.attention_mask[0].bool()
1474
 
1475
  # Format and tokenize lyrics (using LM-generated language if available)
1476
- lyrics_text = f"# Languages\n{actual_language}\n\n# Lyric\n{lyrics[i]}<|endoftext|>"
1477
  lyrics_inputs_dict = self.text_tokenizer(
1478
  lyrics_text,
1479
  padding="longest",
@@ -1495,36 +1557,12 @@ class AceStepHandler:
1495
 
1496
  # Pad tokenized sequences
1497
  max_text_length = max(len(seq) for seq in text_token_idss)
1498
- padded_text_token_idss = torch.stack([
1499
- torch.nn.functional.pad(
1500
- seq, (0, max_text_length - len(seq)), 'constant',
1501
- self.text_tokenizer.pad_token_id
1502
- )
1503
- for seq in text_token_idss
1504
- ])
1505
-
1506
- padded_text_attention_masks = torch.stack([
1507
- torch.nn.functional.pad(
1508
- seq, (0, max_text_length - len(seq)), 'constant', 0
1509
- )
1510
- for seq in text_attention_masks
1511
- ])
1512
 
1513
  max_lyric_length = max(len(seq) for seq in lyric_token_idss)
1514
- padded_lyric_token_idss = torch.stack([
1515
- torch.nn.functional.pad(
1516
- seq, (0, max_lyric_length - len(seq)), 'constant',
1517
- self.text_tokenizer.pad_token_id
1518
- )
1519
- for seq in lyric_token_idss
1520
- ])
1521
-
1522
- padded_lyric_attention_masks = torch.stack([
1523
- torch.nn.functional.pad(
1524
- seq, (0, max_lyric_length - len(seq)), 'constant', 0
1525
- )
1526
- for seq in lyric_attention_masks
1527
- ])
1528
 
1529
  padded_non_cover_text_input_ids = None
1530
  padded_non_cover_text_attention_masks = None
@@ -1533,14 +1571,10 @@ class AceStepHandler:
1533
  non_cover_text_attention_masks = []
1534
  for i in range(batch_size):
1535
  # Use custom instruction for this batch item
1536
- instruction = DEFAULT_DIT_INSTRUCTION
1537
 
1538
  # Extract caption from metas if available (from LM CoT output)
1539
- actual_caption = captions[i]
1540
- if i < len(parsed_metas) and parsed_metas[i]:
1541
- meta_dict = parsed_metas[i]
1542
- if isinstance(meta_dict, dict) and 'caption' in meta_dict and meta_dict['caption']:
1543
- actual_caption = str(meta_dict['caption'])
1544
 
1545
  # Format text prompt with custom instruction (using LM-generated caption if available)
1546
  text_prompt = SFT_GEN_PROMPT.format(instruction, actual_caption, parsed_metas[i])
@@ -1558,19 +1592,8 @@ class AceStepHandler:
1558
  non_cover_text_input_ids.append(text_token_ids)
1559
  non_cover_text_attention_masks.append(non_cover_text_attention_mask)
1560
 
1561
- padded_non_cover_text_input_ids = torch.stack([
1562
- torch.nn.functional.pad(
1563
- seq, (0, max_text_length - len(seq)), 'constant',
1564
- self.text_tokenizer.pad_token_id
1565
- )
1566
- for seq in non_cover_text_input_ids
1567
- ])
1568
- padded_non_cover_text_attention_masks = torch.stack([
1569
- torch.nn.functional.pad(
1570
- seq, (0, max_text_length - len(seq)), 'constant', 0
1571
- )
1572
- for seq in non_cover_text_attention_masks
1573
- ])
1574
 
1575
  if audio_cover_strength < 1.0:
1576
  assert padded_non_cover_text_input_ids is not None, "When audio_cover_strength < 1.0, padded_non_cover_text_input_ids must not be None"
@@ -1804,7 +1827,7 @@ class AceStepHandler:
1804
  if self.config.is_turbo:
1805
  # Limit inference steps to maximum 8
1806
  if infer_steps > 8:
1807
- logger.warning(f"dmd_gan version: infer_steps {infer_steps} exceeds maximum 8, clamping to 8")
1808
  infer_steps = 8
1809
  # CFG parameters are not adjustable for dmd_gan (they will be ignored)
1810
  # Note: guidance_scale, cfg_interval_start, cfg_interval_end are still passed but may be ignored by the model
@@ -1827,30 +1850,12 @@ class AceStepHandler:
1827
  if isinstance(repainting_end, (int, float)):
1828
  repainting_end = [repainting_end]
1829
 
1830
- # Convert instructions to list
1831
- if isinstance(instructions, str):
1832
- instructions = [instructions]
1833
- elif instructions is None:
1834
- instructions = None
1835
-
1836
- # Convert audio_code_hints to list
1837
- if isinstance(audio_code_hints, str):
1838
- audio_code_hints = [audio_code_hints]
1839
- elif audio_code_hints is None:
1840
- audio_code_hints = None
1841
-
1842
  # Get batch size from captions
1843
  batch_size = len(captions)
1844
 
1845
- # Ensure audio_code_hints matches batch size
1846
- if audio_code_hints is not None:
1847
- if len(audio_code_hints) != batch_size:
1848
- if len(audio_code_hints) == 1:
1849
- audio_code_hints = audio_code_hints * batch_size
1850
- else:
1851
- audio_code_hints = audio_code_hints[:batch_size]
1852
- while len(audio_code_hints) < batch_size:
1853
- audio_code_hints.append(None)
1854
 
1855
  # Convert seed to list format
1856
  if seed is None:
@@ -1947,6 +1952,14 @@ class AceStepHandler:
1947
  logger.info("[service_generate] Generating audio...")
1948
  with self._load_model_context("model"):
1949
  outputs = self.model.generate_audio(**generate_kwargs)
 
 
 
 
 
 
 
 
1950
  return outputs
1951
 
1952
  def tiled_decode(self, latents, chunk_size=512, overlap=64):
@@ -2042,25 +2055,34 @@ class AceStepHandler:
2042
  use_adg: bool = False,
2043
  cfg_interval_start: float = 0.0,
2044
  cfg_interval_end: float = 1.0,
2045
- audio_format: str = "mp3",
2046
- lm_temperature: float = 0.6,
2047
  use_tiled_decode: bool = True,
2048
  progress=None
2049
- ) -> Tuple[Optional[str], Optional[str], List[str], str, str, str, str, str, Optional[Any], str, str, Optional[Any]]:
2050
  """
2051
  Main interface for music generation
2052
 
2053
  Returns:
2054
- (first_audio, second_audio, all_audio_paths, generation_info, status_message,
2055
- seed_value_for_ui, align_score_1, align_text_1, align_plot_1,
2056
- align_score_2, align_text_2, align_plot_2)
 
 
 
 
2057
  """
2058
  if progress is None:
2059
  def progress(*args, **kwargs):
2060
  pass
2061
 
2062
  if self.model is None or self.vae is None or self.text_tokenizer is None or self.text_encoder is None:
2063
- return None, None, [], "", "❌ Model not fully initialized. Please initialize all components first.", "-1", "", "", None, "", "", None
 
 
 
 
 
 
 
2064
 
2065
  def _has_audio_codes(v: Union[str, List[str]]) -> bool:
2066
  if isinstance(v, list):
@@ -2191,8 +2213,8 @@ class AceStepHandler:
2191
  pred_latents = outputs["target_latents"] # [batch, latent_length, latent_dim]
2192
  time_costs = outputs["time_costs"]
2193
  time_costs["offload_time_cost"] = self.current_offload_cost
2194
- logger.info(f" - pred_latents: {pred_latents.shape}, dtype={pred_latents.dtype} {pred_latents.min()=}, {pred_latents.max()=}, {pred_latents.mean()=} {pred_latents.std()=}")
2195
- logger.info(f" - time_costs: {time_costs}")
2196
  if progress:
2197
  progress(0.8, desc="Decoding audio...")
2198
  logger.info("[generate_music] Decoding latents with VAE...")
@@ -2221,30 +2243,19 @@ class AceStepHandler:
2221
  # Update offload cost one last time to include VAE offloading
2222
  time_costs["offload_time_cost"] = self.current_offload_cost
2223
 
2224
- logger.info("[generate_music] VAE decode completed. Saving audio files...")
2225
  if progress:
2226
- progress(0.9, desc="Saving audio files...")
2227
 
2228
- # Save audio files using soundfile (supports wav, flac, mp3 via format param)
2229
- audio_format_lower = audio_format.lower() if audio_format else "wav"
2230
- if audio_format_lower not in ["wav", "flac", "mp3"]:
2231
- audio_format_lower = "wav"
2232
 
2233
- saved_files = []
2234
- saved_uuids = [] # Store UUIDs for each file
2235
  for i in range(actual_batch_size):
2236
- # Generate unique UUID for each audio file
2237
- file_uuid = str(uuid.uuid4())
2238
- audio_file = os.path.join(self.temp_dir, f"{file_uuid}.{audio_format_lower}")
2239
- # Convert to numpy: [channels, samples] -> [samples, channels]
2240
- audio_np = pred_wavs[i].cpu().float().numpy().T
2241
- sf.write(audio_file, audio_np, self.sample_rate)
2242
- saved_files.append(audio_file)
2243
- saved_uuids.append(file_uuid)
2244
-
2245
- # Prepare return values
2246
- first_audio = saved_files[0] if len(saved_files) > 0 else None
2247
- second_audio = saved_files[1] if len(saved_files) > 1 else None
2248
 
2249
  # Format time costs if available
2250
  time_costs_str = ""
@@ -2262,34 +2273,55 @@ class AceStepHandler:
2262
 
2263
  **Seeds:** {seed_value_for_ui}
2264
  **Steps:** {inference_steps}
2265
- **Files:** {len(saved_files)} audio(s){time_costs_str}"""
2266
  status_message = f"✅ Generation completed successfully!"
2267
- logger.info(f"[generate_music] Done! Generated {len(saved_files)} audio files.")
2268
-
2269
- # Alignment scores and plots (placeholder for now)
2270
- align_score_1 = ""
2271
- align_text_1 = ""
2272
- align_plot_1 = None
2273
- align_score_2 = ""
2274
- align_text_2 = ""
2275
- align_plot_2 = None
2276
-
2277
- return (
2278
- first_audio,
2279
- second_audio,
2280
- saved_files,
2281
- generation_info,
2282
- status_message,
2283
- seed_value_for_ui,
2284
- align_score_1,
2285
- align_text_1,
2286
- align_plot_1,
2287
- align_score_2,
2288
- align_text_2,
2289
- align_plot_2,
2290
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2291
 
2292
  except Exception as e:
2293
  error_msg = f"❌ Error: {str(e)}\n{traceback.format_exc()}"
2294
- return None, None, [], "", error_msg, seed_value_for_ui, "", "", None, "", "", None
2295
-
 
 
 
 
 
 
 
 
10
  import re
11
  import random
12
  import uuid
13
+ import hashlib
14
+ import json
15
  from contextlib import contextmanager
16
  from typing import Optional, Dict, Any, Tuple, List, Union
17
 
 
39
  class AceStepHandler:
40
  """ACE-Step Business Logic Handler"""
41
 
42
+ def __init__(self):
43
  self.model = None
44
  self.config = None
45
  self.device = "cpu"
46
  self.dtype = torch.float32 # Will be set based on device in initialize_service
47
+
 
 
 
 
48
  # VAE for audio encoding/decoding
49
  self.vae = None
50
 
 
79
  def get_available_checkpoints(self) -> str:
80
  """Return project root directory path"""
81
  # Get project root (handler.py is in acestep/, so go up two levels to project root)
82
+ project_root = self._get_project_root()
 
83
  # default checkpoints
84
  checkpoint_dir = os.path.join(project_root, "checkpoints")
85
  if os.path.exists(checkpoint_dir):
 
90
  def get_available_acestep_v15_models(self) -> List[str]:
91
  """Scan and return all model directory names starting with 'acestep-v15-'"""
92
  # Get project root
93
+ project_root = self._get_project_root()
 
94
  checkpoint_dir = os.path.join(project_root, "checkpoints")
95
 
96
  models = []
 
167
 
168
 
169
  # Auto-detect project root (independent of passed project_root parameter)
170
+ actual_project_root = self._get_project_root()
 
171
  checkpoint_dir = os.path.join(actual_project_root, "checkpoints")
172
 
173
  # 1. Load main model
 
182
  attn_implementation = "sdpa"
183
 
184
  try:
185
+ logger.info(f"[initialize_service] Attempting to load model with attention implementation: {attn_implementation}")
186
  self.model = AutoModel.from_pretrained(
187
  acestep_v15_checkpoint_path,
188
  trust_remote_code=True,
 
190
  dtype="bfloat16"
191
  )
192
  except Exception as e:
193
+ logger.warning(f"[initialize_service] Failed to load model with {attn_implementation}: {e}")
194
  if attn_implementation == "sdpa":
195
+ logger.info("[initialize_service] Falling back to eager attention")
196
  attn_implementation = "eager"
197
  self.model = AutoModel.from_pretrained(
198
  acestep_v15_checkpoint_path,
 
210
  else:
211
  # If offload_to_cpu is True, check if we should keep DiT on GPU
212
  if not self.offload_dit_to_cpu:
213
+ logger.info(f"[initialize_service] Keeping main model on {device} (persistent)")
214
  self.model = self.model.to(device).to(self.dtype)
215
  else:
216
  self.model = self.model.to("cpu").to(self.dtype)
 
234
  raise ValueError(f"Unsupported quantization type: {self.quantization}")
235
 
236
  quantize_(self.model, quant_config)
237
+ logger.info(f"[initialize_service] DiT quantized with: {self.quantization}")
238
 
239
 
240
  silence_latent_path = os.path.join(acestep_v15_checkpoint_path, "silence_latent.pt")
 
255
  if os.path.exists(vae_checkpoint_path):
256
  self.vae = AutoencoderOobleck.from_pretrained(vae_checkpoint_path)
257
  # Use bfloat16 for VAE on GPU, otherwise use self.dtype (float32 on CPU)
258
+ vae_dtype = self._get_vae_dtype(device)
259
  if not self.offload_to_cpu:
260
  self.vae = self.vae.to(device).to(vae_dtype)
261
  else:
 
297
 
298
  except Exception as e:
299
  error_msg = f"❌ Error initializing model: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
300
+ logger.exception("[initialize_service] Error initializing model")
301
  return error_msg, False
302
 
303
  @contextmanager
 
322
  try:
323
  param = next(model.parameters())
324
  if param.device.type == "cpu":
325
+ logger.info(f"[_load_model_context] Moving {model_name} to {self.device} (persistent)")
326
  model.to(self.device).to(self.dtype)
327
  if hasattr(self, "silence_latent"):
328
  self.silence_latent = self.silence_latent.to(self.device).to(self.dtype)
 
337
  return
338
 
339
  # Load to GPU
340
+ logger.info(f"[_load_model_context] Loading {model_name} to {self.device}")
341
  start_time = time.time()
342
  if model_name == "vae":
343
+ vae_dtype = self._get_vae_dtype()
344
  model.to(self.device).to(vae_dtype)
345
  else:
346
  model.to(self.device).to(self.dtype)
 
350
 
351
  load_time = time.time() - start_time
352
  self.current_offload_cost += load_time
353
+ logger.info(f"[_load_model_context] Loaded {model_name} to {self.device} in {load_time:.4f}s")
354
 
355
  try:
356
  yield
357
  finally:
358
  # Offload to CPU
359
+ logger.info(f"[_load_model_context] Offloading {model_name} to CPU")
360
  start_time = time.time()
361
  model.to("cpu")
362
 
 
366
  torch.cuda.empty_cache()
367
  offload_time = time.time() - start_time
368
  self.current_offload_cost += offload_time
369
+ logger.info(f"[_load_model_context] Offloaded {model_name} to CPU in {offload_time:.4f}s")
370
 
371
  def process_target_audio(self, audio_file) -> Optional[torch.Tensor]:
372
  """Process target audio"""
 
382
  else:
383
  audio = torch.from_numpy(audio_np.T)
384
 
385
+ # Normalize to stereo 48kHz
386
+ audio = self._normalize_audio_to_stereo_48k(audio, sr)
 
 
 
 
 
 
 
 
 
 
 
387
 
388
  return audio
389
  except Exception as e:
390
+ logger.exception("[process_target_audio] Error processing target audio")
391
  return None
392
 
393
  def _parse_audio_code_string(self, code_str: str) -> List[int]:
 
396
  return []
397
  try:
398
  return [int(x) for x in re.findall(r"<\|audio_code_(\d+)\|>", code_str)]
399
+ except Exception as e:
400
+ logger.debug(f"[_parse_audio_code_string] Failed to parse audio code string: {e}")
401
  return []
402
 
403
  def _decode_audio_codes_to_latents(self, code_str: str) -> Optional[torch.Tensor]:
 
524
  )
525
  """
526
  # Align instruction formatting with _prepare_batch
527
+ final_instruction = self._format_instruction(instruction or DEFAULT_DIT_INSTRUCTION)
 
 
528
 
529
  # Extract caption and language from metas if available (from LM CoT output)
530
  # Fallback to user-provided values if not in metas
 
555
 
556
  parsed_meta = self._parse_metas([metas])[0]
557
  caption_input = SFT_GEN_PROMPT.format(final_instruction, actual_caption, parsed_meta)
558
+ lyrics_input = self._format_lyrics(lyrics, actual_language)
559
  return caption_input, lyrics_input
560
 
561
  def _get_text_hidden_states(self, text_prompt: str) -> Tuple[torch.Tensor, torch.Tensor]:
 
598
  return match.group(1).strip()
599
  return caption
600
  except Exception as e:
601
+ logger.exception("[extract_caption_from_sft_format] Error extracting caption")
602
  return caption
603
 
604
  def prepare_seeds(self, actual_batch_size, seed, use_random_seed):
 
622
  else:
623
  try:
624
  seed_list.append(int(float(s)))
625
+ except (ValueError, TypeError) as e:
626
+ logger.debug(f"[prepare_seeds] Failed to parse seed value '{s}': {e}")
627
  seed_list.append(-1)
628
  elif seed is None or (isinstance(seed, (int, float)) and seed < 0):
629
  # If seed is None or negative, use -1 for all items
 
664
  return actual_seed_list, seed_value_for_ui
665
 
666
  def prepare_metadata(self, bpm, key_scale, time_signature):
667
+ """Build metadata dict - use "N/A" as default for empty fields."""
668
+ return self._build_metadata_dict(bpm, key_scale, time_signature)
669
+
670
+ def is_silence(self, audio):
671
+ return torch.all(audio.abs() < 1e-6)
672
+
673
+ def _get_project_root(self) -> str:
674
+ """Get project root directory path."""
675
+ current_file = os.path.abspath(__file__)
676
+ return os.path.dirname(os.path.dirname(current_file))
677
+
678
+ def _get_vae_dtype(self, device: Optional[str] = None) -> torch.dtype:
679
+ """Get VAE dtype based on device."""
680
+ device = device or self.device
681
+ return torch.bfloat16 if device in ["cuda", "xpu"] else self.dtype
682
+
683
+ def _format_instruction(self, instruction: str) -> str:
684
+ """Format instruction to ensure it ends with colon."""
685
+ if not instruction.endswith(":"):
686
+ instruction = instruction + ":"
687
+ return instruction
688
+
689
+ def _normalize_audio_to_stereo_48k(self, audio: torch.Tensor, sr: int) -> torch.Tensor:
690
+ """
691
+ Normalize audio to stereo 48kHz format.
692
+
693
+ Args:
694
+ audio: Audio tensor [channels, samples] or [samples]
695
+ sr: Sample rate
696
+
697
+ Returns:
698
+ Normalized audio tensor [2, samples] at 48kHz
699
+ """
700
+ # Convert to stereo (duplicate channel if mono)
701
+ if audio.shape[0] == 1:
702
+ audio = torch.cat([audio, audio], dim=0)
703
+
704
+ # Keep only first 2 channels
705
+ audio = audio[:2]
706
+
707
+ # Resample to 48kHz if needed
708
+ if sr != 48000:
709
+ audio = torchaudio.transforms.Resample(sr, 48000)(audio)
710
+
711
+ # Clamp values to [-1.0, 1.0]
712
+ audio = torch.clamp(audio, -1.0, 1.0)
713
+
714
+ return audio
715
+
716
+ def _normalize_audio_code_hints(self, audio_code_hints: Optional[Union[str, List[str]]], batch_size: int) -> List[Optional[str]]:
717
+ """Normalize audio_code_hints to list of correct length."""
718
+ if audio_code_hints is None:
719
+ normalized = [None] * batch_size
720
+ elif isinstance(audio_code_hints, str):
721
+ normalized = [audio_code_hints] * batch_size
722
+ elif len(audio_code_hints) == 1 and batch_size > 1:
723
+ normalized = audio_code_hints * batch_size
724
+ elif len(audio_code_hints) != batch_size:
725
+ # Pad or truncate to match batch_size
726
+ normalized = list(audio_code_hints[:batch_size])
727
+ while len(normalized) < batch_size:
728
+ normalized.append(None)
729
+ else:
730
+ normalized = list(audio_code_hints)
731
+
732
+ # Clean up: convert empty strings to None
733
+ normalized = [hint if isinstance(hint, str) and hint.strip() else None for hint in normalized]
734
+ return normalized
735
+
736
+ def _normalize_instructions(self, instructions: Optional[Union[str, List[str]]], batch_size: int, default: Optional[str] = None) -> List[str]:
737
+ """Normalize instructions to list of correct length."""
738
+ if instructions is None:
739
+ default_instruction = default or DEFAULT_DIT_INSTRUCTION
740
+ return [default_instruction] * batch_size
741
+ elif isinstance(instructions, str):
742
+ return [instructions] * batch_size
743
+ elif len(instructions) == 1:
744
+ return instructions * batch_size
745
+ elif len(instructions) != batch_size:
746
+ # Pad or truncate to match batch_size
747
+ normalized = list(instructions[:batch_size])
748
+ default_instruction = default or DEFAULT_DIT_INSTRUCTION
749
+ while len(normalized) < batch_size:
750
+ normalized.append(default_instruction)
751
+ return normalized
752
+ else:
753
+ return list(instructions)
754
+
755
+ def _format_lyrics(self, lyrics: str, language: str) -> str:
756
+ """Format lyrics text with language header."""
757
+ return f"# Languages\n{language}\n\n# Lyric\n{lyrics}<|endoftext|>"
758
+
759
+ def _pad_sequences(self, sequences: List[torch.Tensor], max_length: int, pad_value: int = 0) -> torch.Tensor:
760
+ """Pad sequences to same length."""
761
+ return torch.stack([
762
+ torch.nn.functional.pad(seq, (0, max_length - len(seq)), 'constant', pad_value)
763
+ for seq in sequences
764
+ ])
765
+
766
+ def _extract_caption_and_language(self, metas: List[Union[str, Dict[str, Any]]], captions: List[str], vocal_languages: List[str]) -> Tuple[List[str], List[str]]:
767
+ """Extract caption and language from metas with fallback to provided values."""
768
+ actual_captions = list(captions)
769
+ actual_languages = list(vocal_languages)
770
+
771
+ for i, meta in enumerate(metas):
772
+ if i >= len(actual_captions):
773
+ break
774
+
775
+ meta_dict = None
776
+ if isinstance(meta, str):
777
+ parsed = self._parse_metas([meta])
778
+ if parsed and isinstance(parsed[0], dict):
779
+ meta_dict = parsed[0]
780
+ elif isinstance(meta, dict):
781
+ meta_dict = meta
782
+
783
+ if meta_dict:
784
+ if 'caption' in meta_dict and meta_dict['caption']:
785
+ actual_captions[i] = str(meta_dict['caption'])
786
+ if 'language' in meta_dict and meta_dict['language']:
787
+ actual_languages[i] = str(meta_dict['language'])
788
+
789
+ return actual_captions, actual_languages
790
+
791
+ def _encode_audio_to_latents(self, audio: torch.Tensor) -> torch.Tensor:
792
+ """
793
+ Encode audio to latents using VAE.
794
+
795
+ Args:
796
+ audio: Audio tensor [channels, samples] or [batch, channels, samples]
797
+
798
+ Returns:
799
+ Latents tensor [T, D] or [batch, T, D]
800
+ """
801
+ # Ensure batch dimension
802
+ squeeze_output = audio.dim() == 2  # remember whether the input lacked a batch dim
803
+ audio = audio.unsqueeze(0) if squeeze_output else audio
804
+
805
+ # Ensure input is in VAE's dtype
806
+ vae_input = audio.to(self.device).to(self.vae.dtype)
807
+
808
+ # Encode to latents
809
+ with torch.no_grad():
810
+ latents = self.vae.encode(vae_input).latent_dist.sample()
811
+
812
+ # Cast back to model dtype
813
+ latents = latents.to(self.dtype)
814
+
815
+ # Transpose: [batch, d, T] -> [batch, T, d]
816
+ latents = latents.transpose(1, 2)
817
+
818
+ # Remove batch dimension if input didn't have it
819
+ if squeeze_output:
820
+ latents = latents.squeeze(0)
821
+
822
+ return latents
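# Illustrative shape walk-through (assumes a loaded VAE; not part of handler.py):
#   audio   = torch.zeros(2, 48000 * 10)                  # 10 s stereo at 48 kHz
#   latents = handler._encode_audio_to_latents(audio)     # -> [T, d], batch dim squeezed away
#   batch   = torch.zeros(4, 2, 48000 * 10)
#   latents = handler._encode_audio_to_latents(batch)     # -> [4, T, d], batch dim preserved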
823
+
824
+ def _build_metadata_dict(self, bpm: Optional[Union[int, str]], key_scale: str, time_signature: str, duration: Optional[float] = None) -> Dict[str, Any]:
825
+ """
826
+ Build metadata dictionary with default values.
827
+
828
+ Args:
829
+ bpm: BPM value (optional)
830
+ key_scale: Key/scale string
831
+ time_signature: Time signature string
832
+ duration: Duration in seconds (optional)
833
+
834
+ Returns:
835
+ Metadata dictionary
836
+ """
837
  metadata_dict = {}
838
  if bpm:
839
  metadata_dict["bpm"] = bpm
 
849
  metadata_dict["timesignature"] = time_signature
850
  else:
851
  metadata_dict["timesignature"] = "N/A"
852
+
853
+ # Add duration if provided
854
+ if duration is not None:
855
+ metadata_dict["duration"] = f"{int(duration)} seconds"
856
+
857
  return metadata_dict
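# Illustrative sketch (not part of handler.py): the padding behaviour of the
# _pad_sequences helper above, reproduced with standalone tensors.
import torch
import torch.nn.functional as F

pad_id = 0  # assumed pad token id for the example
seqs = [torch.tensor([5, 9, 12]), torch.tensor([7, 3])]
max_len = max(len(s) for s in seqs)
padded = torch.stack([F.pad(s, (0, max_len - len(s)), "constant", pad_id) for s in seqs])
# padded -> tensor([[ 5,  9, 12], [ 7,  3,  0]]); attention masks are padded the same way with pad value 0.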
 
 
 
858
 
859
  def generate_instruction(
860
  self,
 
901
  # Load audio file
902
  audio, sr = torchaudio.load(audio_file)
903
 
904
+ logger.debug(f"[process_reference_audio] Reference audio shape: {audio.shape}")
905
+ logger.debug(f"[process_reference_audio] Reference audio sample rate: {sr}")
906
+ logger.debug(f"[process_reference_audio] Reference audio duration: {audio.shape[-1] / 48000.0} seconds")
 
 
 
 
907
 
908
+ # Normalize to stereo 48kHz
909
+ audio = self._normalize_audio_to_stereo_48k(audio, sr)
 
 
 
 
 
 
 
910
 
911
  is_silence = self.is_silence(audio)
912
  if is_silence:
 
945
  return audio
946
 
947
  except Exception as e:
948
+ logger.exception("[process_reference_audio] Error processing reference audio")
949
  return None
950
 
951
  def process_src_audio(self, audio_file) -> Optional[torch.Tensor]:
 
956
  # Load audio file
957
  audio, sr = torchaudio.load(audio_file)
958
 
959
+ # Normalize to stereo 48kHz
960
+ audio = self._normalize_audio_to_stereo_48k(audio, sr)
 
 
 
 
 
 
 
 
 
 
 
961
 
962
  return audio
963
 
964
  except Exception as e:
965
+ logger.exception("[process_src_audio] Error processing source audio")
966
  return None
967
 
968
  def convert_src_audio_to_codes(self, audio_file) -> str:
 
990
  # Encode audio to latents using VAE
991
  with torch.no_grad():
992
  with self._load_model_context("vae"):
 
 
 
993
  # Check if audio is silence
994
+ if self.is_silence(processed_audio.unsqueeze(0)):
995
  return "❌ Audio file appears to be silent"
996
 
997
+ # Encode to latents using helper method
998
+ latents = self._encode_audio_to_latents(processed_audio) # [T, d]
 
 
 
 
999
 
1000
  # Create attention mask for latents
1001
  attention_mask = torch.ones(latents.shape[0], dtype=torch.bool, device=self.device)
 
1020
 
1021
  except Exception as e:
1022
  error_msg = f"❌ Error converting audio to codes: {str(e)}\n{traceback.format_exc()}"
1023
+ logger.exception("[convert_src_audio_to_codes] Error converting audio to codes")
1024
  return error_msg
1025
 
1026
  def prepare_batch_data(
 
1049
  calculated_duration = audio_duration
1050
 
1051
  # Build metadata dict - use "N/A" as default for empty fields
1052
+ metadata_dict = self._build_metadata_dict(bpm, key_scale, time_signature, calculated_duration)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1053
 
1054
  # Format metadata - inference service accepts dict and will convert to string
1055
  # Create a copy for each batch item (in case we modify it)
 
1085
  target_wavs = torch.zeros(2, frames)
1086
  return target_wavs
1087
  except Exception as e:
1088
+ logger.exception("[create_target_wavs] Error creating target audio")
1089
  # Fallback to 30 seconds if error
1090
  return torch.zeros(2, 30 * 48000)
1091
 
 
1266
  """
1267
  batch_size = len(captions)
1268
 
1269
+ # Normalize audio_code_hints to batch list
1270
+ audio_code_hints = self._normalize_audio_code_hints(audio_code_hints, batch_size)
 
 
 
 
 
 
 
 
1271
 
1272
  for ii, refer_audio_list in enumerate(refer_audios):
1273
  if isinstance(refer_audio_list, list):
 
1279
  if vocal_languages is None:
1280
  vocal_languages = self._create_fallback_vocal_languages(batch_size)
1281
 
 
 
 
 
 
 
 
 
 
 
 
1282
  # Parse metas with fallbacks
1283
  parsed_metas = self._parse_metas(metas)
1284
 
 
1312
  expected_latent_length = current_wav.shape[-1] // 1920
1313
  target_latent = self.silence_latent[0, :expected_latent_length, :]
1314
  else:
1315
+ # Encode using helper method
1316
  logger.info(f"[generate_music] Encoding target audio to latents for item {i}...")
1317
+ target_latent = self._encode_audio_to_latents(current_wav.squeeze(0)) # Remove batch dim for helper
 
 
 
 
1318
  target_latents_list.append(target_latent)
1319
  latent_lengths.append(target_latent.shape[0])
1320
 
 
1353
 
1354
  # Process instructions early so we can use them for task type detection
1355
  # Use custom instructions if provided, otherwise use default
1356
+ instructions = self._normalize_instructions(instructions, batch_size, DEFAULT_DIT_INSTRUCTION)
 
 
 
 
 
 
 
 
 
 
 
1357
 
1358
  # Generate chunk_masks and spans based on repainting parameters
1359
  # Also determine if this is a cover task (target audio provided without repainting)
 
1502
  else:
1503
  precomputed_lm_hints_25Hz = None
1504
 
1505
+ # Extract caption and language from metas if available (from LM CoT output)
1506
+ # Fallback to user-provided values if not in metas
1507
+ actual_captions, actual_languages = self._extract_caption_and_language(parsed_metas, captions, vocal_languages)
1508
+
1509
  # Format text_inputs
1510
  text_inputs = []
1511
  text_token_idss = []
 
1515
 
1516
  for i in range(batch_size):
1517
  # Use custom instruction for this batch item
1518
+ instruction = self._format_instruction(instructions[i] if i < len(instructions) else DEFAULT_DIT_INSTRUCTION)
1519
+
1520
+ actual_caption = actual_captions[i]
1521
+ actual_language = actual_languages[i]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1522
 
1523
  # Format text prompt with custom instruction (using LM-generated caption if available)
1524
  text_prompt = SFT_GEN_PROMPT.format(instruction, actual_caption, parsed_metas[i])
 
1535
  text_attention_mask = text_inputs_dict.attention_mask[0].bool()
1536
 
1537
  # Format and tokenize lyrics (using LM-generated language if available)
1538
+ lyrics_text = self._format_lyrics(lyrics[i], actual_language)
1539
  lyrics_inputs_dict = self.text_tokenizer(
1540
  lyrics_text,
1541
  padding="longest",
 
1557
 
1558
  # Pad tokenized sequences
1559
  max_text_length = max(len(seq) for seq in text_token_idss)
1560
+ padded_text_token_idss = self._pad_sequences(text_token_idss, max_text_length, self.text_tokenizer.pad_token_id)
1561
+ padded_text_attention_masks = self._pad_sequences(text_attention_masks, max_text_length, 0)
 
 
 
 
 
 
 
 
 
 
 
 
1562
 
1563
  max_lyric_length = max(len(seq) for seq in lyric_token_idss)
1564
+ padded_lyric_token_idss = self._pad_sequences(lyric_token_idss, max_lyric_length, self.text_tokenizer.pad_token_id)
1565
+ padded_lyric_attention_masks = self._pad_sequences(lyric_attention_masks, max_lyric_length, 0)
 
 
 
 
 
 
 
 
 
 
 
 
1566
 
1567
  padded_non_cover_text_input_ids = None
1568
  padded_non_cover_text_attention_masks = None
 
1571
  non_cover_text_attention_masks = []
1572
  for i in range(batch_size):
1573
  # Use custom instruction for this batch item
1574
+ instruction = self._format_instruction(DEFAULT_DIT_INSTRUCTION)
1575
 
1576
  # Extract caption from metas if available (from LM CoT output)
1577
+ actual_caption = actual_captions[i]
 
 
 
 
1578
 
1579
  # Format text prompt with custom instruction (using LM-generated caption if available)
1580
  text_prompt = SFT_GEN_PROMPT.format(instruction, actual_caption, parsed_metas[i])
 
1592
  non_cover_text_input_ids.append(text_token_ids)
1593
  non_cover_text_attention_masks.append(non_cover_text_attention_mask)
1594
 
1595
+ padded_non_cover_text_input_ids = self._pad_sequences(non_cover_text_input_ids, max_text_length, self.text_tokenizer.pad_token_id)
1596
+ padded_non_cover_text_attention_masks = self._pad_sequences(non_cover_text_attention_masks, max_text_length, 0)
 
 
 
 
 
 
 
 
 
 
 
1597
 
1598
  if audio_cover_strength < 1.0:
1599
  assert padded_non_cover_text_input_ids is not None, "When audio_cover_strength < 1.0, padded_non_cover_text_input_ids must not be None"
 
1827
  if self.config.is_turbo:
1828
  # Limit inference steps to maximum 8
1829
  if infer_steps > 8:
1830
+ logger.warning(f"[service_generate] dmd_gan version: infer_steps {infer_steps} exceeds maximum 8, clamping to 8")
1831
  infer_steps = 8
1832
  # CFG parameters are not adjustable for dmd_gan (they will be ignored)
1833
  # Note: guidance_scale, cfg_interval_start, cfg_interval_end are still passed but may be ignored by the model
 
1850
  if isinstance(repainting_end, (int, float)):
1851
  repainting_end = [repainting_end]
1852
 
 
 
 
 
 
 
 
 
 
 
 
 
1853
  # Get batch size from captions
1854
  batch_size = len(captions)
1855
 
1856
+ # Normalize instructions and audio_code_hints to match batch size
1857
+ instructions = self._normalize_instructions(instructions, batch_size, DEFAULT_DIT_INSTRUCTION) if instructions is not None else None
1858
+ audio_code_hints = self._normalize_audio_code_hints(audio_code_hints, batch_size) if audio_code_hints is not None else None
 
 
 
 
 
 
1859
 
1860
  # Convert seed to list format
1861
  if seed is None:
 
1952
  logger.info("[service_generate] Generating audio...")
1953
  with self._load_model_context("model"):
1954
  outputs = self.model.generate_audio(**generate_kwargs)
1955
+
1956
+ # Add intermediate information to outputs for extra_outputs
1957
+ outputs["src_latents"] = src_latents
1958
+ outputs["target_latents_input"] = target_latents # Input target latents (before generation)
1959
+ outputs["chunk_masks"] = chunk_mask
1960
+ outputs["spans"] = spans
1961
+ outputs["latent_masks"] = batch.get("latent_masks") # Latent masks for valid length
1962
+
1963
  return outputs
1964
 
1965
  def tiled_decode(self, latents, chunk_size=512, overlap=64):
 
2055
  use_adg: bool = False,
2056
  cfg_interval_start: float = 0.0,
2057
  cfg_interval_end: float = 1.0,
 
 
2058
  use_tiled_decode: bool = True,
2059
  progress=None
2060
+ ) -> Dict[str, Any]:
2061
  """
2062
  Main interface for music generation
2063
 
2064
  Returns:
2065
+ Dictionary containing:
2066
+ - audios: List of audio dictionaries, each holding the audio tensor and its sample rate
2067
+ - generation_info: Markdown-formatted generation information
2068
+ - status_message: Status message
2069
+ - extra_outputs: Dictionary with latents, masks, time_costs, etc.
2070
+ - success: Whether generation completed successfully
2071
+ - error: Error message if generation failed
2072
  """
2073
  if progress is None:
2074
  def progress(*args, **kwargs):
2075
  pass
2076
 
2077
  if self.model is None or self.vae is None or self.text_tokenizer is None or self.text_encoder is None:
2078
+ return {
2079
+ "audios": [],
2080
+ "generation_info": "",
2081
+ "status_message": "❌ Model not fully initialized. Please initialize all components first.",
2082
+ "extra_outputs": {},
2083
+ "success": False,
2084
+ "error": "Model not fully initialized",
2085
+ }
2086
 
2087
  def _has_audio_codes(v: Union[str, List[str]]) -> bool:
2088
  if isinstance(v, list):
 
2213
  pred_latents = outputs["target_latents"] # [batch, latent_length, latent_dim]
2214
  time_costs = outputs["time_costs"]
2215
  time_costs["offload_time_cost"] = self.current_offload_cost
2216
+ logger.debug(f"[generate_music] pred_latents: {pred_latents.shape}, dtype={pred_latents.dtype} {pred_latents.min()=}, {pred_latents.max()=}, {pred_latents.mean()=} {pred_latents.std()=}")
2217
+ logger.debug(f"[generate_music] time_costs: {time_costs}")
2218
  if progress:
2219
  progress(0.8, desc="Decoding audio...")
2220
  logger.info("[generate_music] Decoding latents with VAE...")
 
2243
  # Update offload cost one last time to include VAE offloading
2244
  time_costs["offload_time_cost"] = self.current_offload_cost
2245
 
2246
+ logger.info("[generate_music] VAE decode completed. Preparing audio tensors...")
2247
  if progress:
2248
+ progress(0.9, desc="Preparing audio data...")
2249
 
2250
+ # Prepare audio tensors (no file I/O here, no UUID generation)
2251
+ # pred_wavs is already [batch, channels, samples] format
2252
+ # Move to CPU and convert to float32 for return
2253
+ audio_tensors = []
2254
 
 
 
2255
  for i in range(actual_batch_size):
2256
+ # Extract audio tensor: [channels, samples] format, CPU, float32
2257
+ audio_tensor = pred_wavs[i].cpu().float()
2258
+ audio_tensors.append(audio_tensor)
 
 
 
 
 
 
 
 
 
2259
 
2260
  # Format time costs if available
2261
  time_costs_str = ""
 
2273
 
2274
  **Seeds:** {seed_value_for_ui}
2275
  **Steps:** {inference_steps}
2276
+ **Audio Count:** {len(audio_tensors)} audio(s){time_costs_str}"""
2277
  status_message = f"✅ Generation completed successfully!"
2278
+ logger.info(f"[generate_music] Done! Generated {len(audio_tensors)} audio tensors.")
2279
+
2280
+ # Extract intermediate information from outputs
2281
+ src_latents = outputs.get("src_latents") # [batch, T, D]
2282
+ target_latents_input = outputs.get("target_latents_input") # [batch, T, D]
2283
+ chunk_masks = outputs.get("chunk_masks") # [batch, T]
2284
+ spans = outputs.get("spans", []) # List of tuples
2285
+ latent_masks = outputs.get("latent_masks") # [batch, T]
2286
+
2287
+ # Move latents to CPU to save memory (they can be large)
2288
+ extra_outputs = {
2289
+ "pred_latents": pred_latents.cpu() if pred_latents is not None else None,
2290
+ "target_latents": target_latents_input.cpu() if target_latents_input is not None else None,
2291
+ "src_latents": src_latents.cpu() if src_latents is not None else None,
2292
+ "chunk_masks": chunk_masks.cpu() if chunk_masks is not None else None,
2293
+ "latent_masks": latent_masks.cpu() if latent_masks is not None else None,
2294
+ "spans": spans,
2295
+ "time_costs": time_costs,
2296
+ "seed_value": seed_value_for_ui,
2297
+ }
2298
+
2299
+ # Build audios list with tensor data (no file paths, no UUIDs, handled outside)
2300
+ audios = []
2301
+ for idx, audio_tensor in enumerate(audio_tensors):
2302
+ audio_dict = {
2303
+ "tensor": audio_tensor, # torch.Tensor [channels, samples], CPU, float32
2304
+ "sample_rate": self.sample_rate,
2305
+ }
2306
+ audios.append(audio_dict)
2307
+
2308
+ return {
2309
+ "audios": audios,
2310
+ "generation_info": generation_info,
2311
+ "status_message": status_message,
2312
+ "extra_outputs": extra_outputs,
2313
+ "success": True,
2314
+ "error": None,
2315
+ }
2316
 
2317
  except Exception as e:
2318
  error_msg = f"❌ Error: {str(e)}\n{traceback.format_exc()}"
2319
+ logger.exception("[generate_music] Generation failed")
2320
+ return {
2321
+ "audios": [],
2322
+ "generation_info": "",
2323
+ "status_message": error_msg,
2324
+ "extra_outputs": {},
2325
+ "success": False,
2326
+ "error": str(e),
2327
+ }
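With this refactor, AceStepHandler.generate_music no longer writes files itself; it hands back CPU float32 tensors and leaves persistence to the caller. A minimal sketch of consuming the new dict (the save_generation helper, its file names, and the torchaudio-based writing are illustrative, not part of this commit):

import torchaudio

def save_generation(result, prefix="output"):
    """Persist the audio tensors returned by AceStepHandler.generate_music()."""
    if not result["success"]:
        raise RuntimeError(result["status_message"])
    paths = []
    for idx, audio in enumerate(result["audios"]):
        # Each entry carries a CPU float32 tensor [channels, samples] and its sample rate.
        path = f"{prefix}_{idx}.flac"
        torchaudio.save(path, audio["tensor"], audio["sample_rate"])
        paths.append(path)
    return paths

The seed actually used and the per-stage timings remain available under result["extra_outputs"]["seed_value"] and result["extra_outputs"]["time_costs"], alongside the CPU copies of pred_latents, src_latents, chunk_masks, latent_masks and spans.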
acestep/inference.py CHANGED
@@ -7,105 +7,100 @@ backward-compatible Gradio UI support.
7
  """
8
 
9
  import math
 
 
10
  from typing import Optional, Union, List, Dict, Any, Tuple
11
  from dataclasses import dataclass, field, asdict
12
  from loguru import logger
13
- import time as time_module
 
14
 
15
 
16
  @dataclass
17
- class GenerationConfig:
18
- """Configuration for music generation.
19
 
20
  Attributes:
21
  # Text Inputs
22
- caption: Text description of the desired music
23
- lyrics: Lyrics text for vocal music (use "[Instrumental]" for instrumental)
 
24
 
25
  # Music Metadata
26
- bpm: Beats per minute (e.g., 120). None for auto-detection
27
- key_scale: Musical key (e.g., "C Major", "Am"). Empty for auto-detection
28
- time_signature: Time signature (e.g., "4/4", "3/4"). Empty for auto-detection
29
- vocal_language: Language code for vocals (e.g., "en", "zh", "ja")
30
- audio_duration: Duration in seconds. None for auto-detection
31
 
32
  # Generation Parameters
33
- inference_steps: Number of denoising steps (8 for turbo, 32-100 for base)
34
- guidance_scale: Classifier-free guidance scale (higher = more adherence to prompt)
35
- use_random_seed: Whether to use random seed (True) or fixed seed
36
- seed: Random seed for reproducibility (-1 for random)
37
- batch_size: Number of samples to generate (1-8)
38
 
39
  # Advanced DiT Parameters
40
- use_adg: Use Adaptive Dual Guidance (base model only)
41
- cfg_interval_start: CFG application start ratio (0.0-1.0)
42
- cfg_interval_end: CFG application end ratio (0.0-1.0)
43
- audio_format: Output audio format ("mp3", "wav", "flac")
44
 
45
  # Task-Specific Parameters
46
- task_type: Generation task type ("text2music", "cover", "repaint", "lego", "extract", "complete")
47
- reference_audio: Path to reference audio file (for style transfer)
48
- src_audio: Path to source audio file (for audio-to-audio tasks)
49
- audio_code_string: Pre-extracted audio codes (advanced use)
50
- repainting_start: Repainting start time in seconds (for repaint/lego tasks)
51
- repainting_end: Repainting end time in seconds (-1 for end of audio)
52
- audio_cover_strength: Strength of audio cover/codes influence (0.0-1.0)
53
- instruction: Task-specific instruction prompt (auto-generated if empty)
54
 
55
- # 5Hz Language Model Parameters (CoT Reasoning)
56
- use_llm_thinking: Enable LM-based Chain-of-Thought reasoning for metadata/codes
57
- lm_temperature: LM sampling temperature (0.0-2.0, higher = more creative)
58
- lm_cfg_scale: LM classifier-free guidance scale
59
- lm_top_k: LM top-k sampling (0 = disabled)
60
- lm_top_p: LM nucleus sampling (1.0 = disabled)
61
- lm_negative_prompt: Negative prompt for LM guidance
62
- use_cot_metas: Generate metadata using LM CoT
63
- use_cot_caption: Refine caption using LM CoT
64
- use_cot_language: Detect language using LM CoT
65
- is_format_caption: Whether caption is already formatted
66
- constrained_decoding_debug: Enable debug logging for constrained decoding
67
-
68
- # Batch LM Generation
69
- allow_lm_batch: Allow batch LM code generation (faster for batch_size >= 2)
70
- lm_batch_chunk_size: Maximum batch size per LM inference chunk (GPU memory constraint)
71
  """
 
 
 
 
 
 
 
 
 
 
72
 
73
  # Text Inputs
74
  caption: str = ""
75
  lyrics: str = ""
 
76
 
77
- # Music Metadata
78
- bpm: Optional[int] = None
79
- key_scale: str = ""
80
- time_signature: str = ""
81
  vocal_language: str = "unknown"
82
- audio_duration: Optional[float] = None
83
-
84
- # Generation Parameters
 
 
 
85
  inference_steps: int = 8
86
- guidance_scale: float = 7.0
87
- use_random_seed: bool = True
88
  seed: int = -1
89
- batch_size: int = 1
90
-
91
- # Advanced DiT Parameters
92
  use_adg: bool = False
93
  cfg_interval_start: float = 0.0
94
  cfg_interval_end: float = 1.0
95
- audio_format: str = "mp3"
96
-
97
- # Task-Specific Parameters
98
- task_type: str = "text2music"
99
- reference_audio: Optional[str] = None
100
- src_audio: Optional[str] = None
101
- audio_code_string: Union[str, List[str]] = ""
102
  repainting_start: float = 0.0
103
  repainting_end: float = -1
104
  audio_cover_strength: float = 1.0
105
- instruction: str = ""
106
 
107
  # 5Hz Language Model Parameters
108
- use_llm_thinking: bool = False
109
  lm_temperature: float = 0.85
110
  lm_cfg_scale: float = 2.0
111
  lm_top_k: int = 0
@@ -114,66 +109,59 @@ class GenerationConfig:
114
  use_cot_metas: bool = True
115
  use_cot_caption: bool = True
116
  use_cot_language: bool = True
117
- is_format_caption: bool = False
118
- constrained_decoding_debug: bool = False
119
 
120
- # Batch LM Generation
121
- allow_lm_batch: bool = False
122
- lm_batch_chunk_size: int = 4
123
 
124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  @dataclass
126
  class GenerationResult:
127
  """Result of music generation.
128
 
129
  Attributes:
130
  # Audio Outputs
131
- audio_paths: List of paths to generated audio files
132
- first_audio: Path to first generated audio (backward compatibility)
133
- second_audio: Path to second generated audio (backward compatibility)
134
-
135
- # Generation Information
136
  generation_info: Markdown-formatted generation information
137
  status_message: Status message from generation
138
- seed_value: Actual seed value used for generation
139
-
140
- # LM-Generated Metadata (if applicable)
141
- lm_metadata: Metadata generated by language model (dict or None)
142
-
143
- # Audio-Text Alignment Scores (if available)
144
- align_score_1: First alignment score
145
- align_text_1: First alignment text description
146
- align_plot_1: First alignment plot image
147
- align_score_2: Second alignment score
148
- align_text_2: Second alignment text description
149
- align_plot_2: Second alignment plot image
150
-
151
- # Success Status
152
  success: Whether generation completed successfully
153
  error: Error message if generation failed
154
  """
155
 
156
  # Audio Outputs
157
- audio_paths: List[str] = field(default_factory=list)
158
- first_audio: Optional[str] = None
159
- second_audio: Optional[str] = None
160
-
161
  # Generation Information
162
  generation_info: str = ""
163
  status_message: str = ""
164
- seed_value: str = ""
165
-
166
- # LM-Generated Metadata
167
- lm_metadata: Optional[Dict[str, Any]] = None
168
-
169
- # Audio-Text Alignment Scores
170
- align_score_1: Optional[float] = None
171
- align_text_1: Optional[str] = None
172
- align_plot_1: Optional[Any] = None
173
- align_score_2: Optional[float] = None
174
- align_text_2: Optional[str] = None
175
- align_plot_2: Optional[Any] = None
176
-
177
  # Success Status
178
  success: bool = True
179
  error: Optional[str] = None
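# Illustrative sketch (not part of inference.py): the entry point keeps the same shape.
# Build a GenerationConfig, call generate_music, inspect the GenerationResult. Only fields
# that survive this refactor are used here; dit_handler and llm_handler are assumed to be
# already-initialized AceStepHandler / LLMHandler instances.
from acestep.inference import GenerationConfig, generate_music

config = GenerationConfig(
    caption="upbeat electronic dance music",
    lyrics="[Instrumental]",
    vocal_language="en",
    inference_steps=8,
    seed=42,
)
result = generate_music(dit_handler, llm_handler, config)
if result.success:
    print(result.generation_info)
else:
    print(result.error or result.status_message)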
@@ -186,75 +174,71 @@ class GenerationResult:
186
  def generate_music(
187
  dit_handler,
188
  llm_handler,
 
189
  config: GenerationConfig,
 
190
  ) -> GenerationResult:
191
  """Generate music using ACE-Step model with optional LM reasoning.
192
 
193
- This is the main inference API for music generation. It supports various task types
194
- (text2music, cover, repaint, etc.) and can optionally use a 5Hz Language Model for
195
- Chain-of-Thought reasoning to generate metadata and audio codes.
196
-
197
  Args:
198
  dit_handler: Initialized DiT model handler (AceStepHandler instance)
199
  llm_handler: Initialized LLM handler (LLMHandler instance)
 
200
  config: Generation configuration (GenerationConfig instance)
201
 
202
  Returns:
203
- GenerationResult: Generation result containing audio paths and metadata
204
-
205
- Example:
206
- >>> from acestep.handler import AceStepHandler
207
- >>> from acestep.llm_inference import LLMHandler
208
- >>> from acestep.inference import GenerationConfig, generate_music
209
- >>>
210
- >>> # Initialize handlers
211
- >>> dit_handler = AceStepHandler()
212
- >>> llm_handler = LLMHandler()
213
- >>> dit_handler.initialize_service(...)
214
- >>> llm_handler.initialize(...)
215
- >>>
216
- >>> # Configure generation
217
- >>> config = GenerationConfig(
218
- ... caption="upbeat electronic dance music",
219
- ... bpm=128,
220
- ... audio_duration=30,
221
- ... batch_size=2,
222
- ... )
223
- >>>
224
- >>> # Generate music
225
- >>> result = generate_music(dit_handler, llm_handler, config)
226
- >>> print(f"Generated {len(result.audio_paths)} audio files")
227
- >>> for path in result.audio_paths:
228
- ... print(f"Audio: {path}")
229
  """
230
-
231
  try:
232
  # Phase 1: LM-based metadata and code generation (if enabled)
233
- audio_code_string_to_use = config.audio_code_string
234
  lm_generated_metadata = None
235
- lm_generated_audio_codes = None
236
  lm_generated_audio_codes_list = []
237
 
238
  # Extract mutable copies of metadata (will be updated by LM if needed)
239
- bpm = config.bpm
240
- key_scale = config.key_scale
241
- time_signature = config.time_signature
242
- audio_duration = config.audio_duration
243
 
244
- # Determine if we should use batch LM generation
245
- should_use_lm_batch = (
246
- config.use_llm_thinking
247
- and llm_handler.llm_initialized
248
- and config.use_cot_metas
249
- and config.allow_lm_batch
250
- and config.batch_size >= 2
251
- )
 
 
252
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  # LM-based Chain-of-Thought reasoning
254
- if config.use_llm_thinking and llm_handler.llm_initialized and config.use_cot_metas:
255
  # Convert sampling parameters
256
- top_k_value = None if config.lm_top_k == 0 else int(config.lm_top_k)
257
- top_p_value = None if config.lm_top_p >= 1.0 else config.lm_top_p
258
 
259
  # Build user_metadata from user-provided values
260
  user_metadata = {}
@@ -286,165 +270,231 @@ def generate_music(
286
 
287
  user_metadata_to_pass = user_metadata if user_metadata else None
288
 
289
- # Batch LM generation (faster for multiple samples)
290
- if should_use_lm_batch:
291
- actual_seed_list, _ = dit_handler.prepare_seeds(
292
- config.batch_size, config.seed, config.use_random_seed
293
- )
294
-
295
- max_inference_batch_size = int(config.lm_batch_chunk_size)
296
- num_chunks = math.ceil(config.batch_size / max_inference_batch_size)
297
-
298
- all_metadata_list = []
299
- all_audio_codes_list = []
300
-
301
- for chunk_idx in range(num_chunks):
302
- chunk_start = chunk_idx * max_inference_batch_size
303
- chunk_end = min(chunk_start + max_inference_batch_size, config.batch_size)
304
- chunk_size = chunk_end - chunk_start
305
- chunk_seeds = actual_seed_list[chunk_start:chunk_end]
306
-
307
- logger.info(
308
- f"LM batch chunk {chunk_idx+1}/{num_chunks} "
309
- f"(size: {chunk_size}, seeds: {chunk_seeds})"
310
- )
311
-
312
- metadata_list, audio_codes_list, status = llm_handler.generate_with_stop_condition_batch(
313
- caption=config.caption or "",
314
- lyrics=config.lyrics or "",
315
- batch_size=chunk_size,
316
- infer_type="llm_dit",
317
- temperature=config.lm_temperature,
318
- cfg_scale=config.lm_cfg_scale,
319
- negative_prompt=config.lm_negative_prompt,
320
- top_k=top_k_value,
321
- top_p=top_p_value,
322
- user_metadata=user_metadata_to_pass,
323
- use_cot_caption=config.use_cot_caption,
324
- use_cot_language=config.use_cot_language,
325
- is_format_caption=config.is_format_caption,
326
- constrained_decoding_debug=config.constrained_decoding_debug,
327
- seeds=chunk_seeds,
328
- )
329
-
330
- all_metadata_list.extend(metadata_list)
331
- all_audio_codes_list.extend(audio_codes_list)
332
-
333
- lm_generated_metadata = all_metadata_list[0] if all_metadata_list else None
334
- lm_generated_audio_codes_list = all_audio_codes_list
335
- audio_code_string_to_use = all_audio_codes_list
336
 
337
- # Update metadata from LM if not provided by user
338
- if lm_generated_metadata:
339
- bpm, key_scale, time_signature, audio_duration = _update_metadata_from_lm(
340
- lm_generated_metadata, bpm, key_scale, time_signature, audio_duration
341
- )
342
-
343
- else:
344
- # Sequential LM generation (current behavior)
345
- # Phase 1: Generate CoT metadata
346
- phase1_start = time_module.time()
347
- metadata, _, status = llm_handler.generate_with_stop_condition(
348
- caption=config.caption or "",
349
- lyrics=config.lyrics or "",
350
- infer_type="dit",
351
- temperature=config.lm_temperature,
352
- cfg_scale=config.lm_cfg_scale,
353
- negative_prompt=config.lm_negative_prompt,
354
- top_k=top_k_value,
355
- top_p=top_p_value,
356
- user_metadata=user_metadata_to_pass,
357
- use_cot_caption=config.use_cot_caption,
358
- use_cot_language=config.use_cot_language,
359
- is_format_caption=config.is_format_caption,
360
- constrained_decoding_debug=config.constrained_decoding_debug,
361
  )
362
- lm_phase1_time = time_module.time() - phase1_start
363
- logger.info(f"LM Phase 1 (CoT) completed in {lm_phase1_time:.2f}s")
364
 
365
- # Phase 2: Generate audio codes
366
- phase2_start = time_module.time()
367
- metadata, audio_codes, status = llm_handler.generate_with_stop_condition(
368
- caption=config.caption or "",
369
- lyrics=config.lyrics or "",
370
- infer_type="llm_dit",
371
- temperature=config.lm_temperature,
372
- cfg_scale=config.lm_cfg_scale,
373
- negative_prompt=config.lm_negative_prompt,
 
374
  top_k=top_k_value,
375
  top_p=top_p_value,
376
  user_metadata=user_metadata_to_pass,
377
- use_cot_caption=config.use_cot_caption,
378
- use_cot_language=config.use_cot_language,
379
  is_format_caption=config.is_format_caption,
 
380
  constrained_decoding_debug=config.constrained_decoding_debug,
 
 
381
  )
382
- lm_phase2_time = time_module.time() - phase2_start
383
- logger.info(f"LM Phase 2 (Codes) completed in {lm_phase2_time:.2f}s")
384
 
385
- lm_generated_metadata = metadata
386
- if audio_codes:
387
- audio_code_string_to_use = audio_codes
388
- lm_generated_audio_codes = audio_codes
389
-
390
- # Update metadata from LM if not provided by user
391
- bpm, key_scale, time_signature, audio_duration = _update_metadata_from_lm(
392
- metadata, bpm, key_scale, time_signature, audio_duration
393
- )
394
 
395
  # Phase 2: DiT music generation
 
396
  result = dit_handler.generate_music(
397
- captions=config.caption,
398
- lyrics=config.lyrics,
399
  bpm=bpm,
400
  key_scale=key_scale,
401
  time_signature=time_signature,
402
- vocal_language=config.vocal_language,
403
- inference_steps=config.inference_steps,
404
- guidance_scale=config.guidance_scale,
405
  use_random_seed=config.use_random_seed,
406
- seed=config.seed,
407
- reference_audio=config.reference_audio,
408
  audio_duration=audio_duration,
409
- batch_size=config.batch_size,
410
- src_audio=config.src_audio,
411
  audio_code_string=audio_code_string_to_use,
412
- repainting_start=config.repainting_start,
413
- repainting_end=config.repainting_end,
414
- instruction=config.instruction,
415
- audio_cover_strength=config.audio_cover_strength,
416
- task_type=config.task_type,
417
- use_adg=config.use_adg,
418
- cfg_interval_start=config.cfg_interval_start,
419
- cfg_interval_end=config.cfg_interval_end,
420
- audio_format=config.audio_format,
421
- lm_temperature=config.lm_temperature,
422
  )
423
 
424
- # Extract results
425
- (first_audio, second_audio, all_audio_paths, generation_info, status_message,
426
- seed_value, align_score_1, align_text_1, align_plot_1,
427
- align_score_2, align_text_2, align_plot_2) = result
 
 
 
 
 
 
 
 
 
 
 
 
428
 
429
  # Append LM metadata to generation info
430
  if lm_generated_metadata:
431
  generation_info = _append_lm_metadata_to_info(generation_info, lm_generated_metadata)
432
 
433
- # Create result object
434
  return GenerationResult(
435
- audio_paths=all_audio_paths or [],
436
- first_audio=first_audio,
437
- second_audio=second_audio,
438
  generation_info=generation_info,
439
  status_message=status_message,
440
- seed_value=seed_value,
441
- lm_metadata=lm_generated_metadata,
442
- align_score_1=align_score_1,
443
- align_text_1=align_text_1,
444
- align_plot_1=align_plot_1,
445
- align_score_2=align_score_2,
446
- align_text_2=align_text_2,
447
- align_plot_2=align_plot_2,
448
  success=True,
449
  error=None,
450
  )
@@ -452,10 +502,12 @@ def generate_music(
452
  except Exception as e:
453
  logger.exception("Music generation failed")
454
  return GenerationResult(
455
- success=False,
456
- error=str(e),
457
  generation_info=f"❌ Generation failed: {str(e)}",
458
  status_message=f"Error: {str(e)}",
 
 
 
459
  )
460
 
461
 
@@ -525,7 +577,7 @@ def _append_lm_metadata_to_info(generation_info: str, metadata: Dict[str, Any])
525
  # LEGACY GRADIO UI COMPATIBILITY LAYER
526
  # ============================================================================
527
 
528
- def generate(
529
  dit_handler,
530
  llm_handler,
531
  captions,
@@ -575,20 +627,19 @@ def generate(
575
  Tuple with 28 elements for Gradio UI component updates
576
  """
577
 
578
- # Convert legacy parameters to new config
579
- config = GenerationConfig(
580
  caption=captions,
581
  lyrics=lyrics,
582
  bpm=bpm,
583
- key_scale=key_scale,
584
- time_signature=time_signature,
585
  vocal_language=vocal_language,
586
- audio_duration=audio_duration,
 
587
  inference_steps=inference_steps,
588
  guidance_scale=guidance_scale,
589
- use_random_seed=random_seed_checkbox,
590
  seed=seed,
591
- batch_size=batch_size_input,
592
  use_adg=use_adg,
593
  cfg_interval_start=cfg_interval_start,
594
  cfg_interval_end=cfg_interval_end,
@@ -596,12 +647,11 @@ def generate(
596
  task_type=task_type,
597
  reference_audio=reference_audio,
598
  src_audio=src_audio,
599
- audio_code_string=text2music_audio_code_string,
600
  repainting_start=repainting_start,
601
  repainting_end=repainting_end,
602
  audio_cover_strength=audio_cover_strength,
603
  instruction=instruction_display_gen,
604
- use_llm_thinking=think_checkbox,
605
  lm_temperature=lm_temperature,
606
  lm_cfg_scale=lm_cfg_scale,
607
  lm_top_k=lm_top_k,
@@ -610,29 +660,49 @@ def generate(
610
  use_cot_metas=use_cot_metas,
611
  use_cot_caption=use_cot_caption,
612
  use_cot_language=use_cot_language,
613
- is_format_caption=is_format_caption,
614
- constrained_decoding_debug=constrained_decoding_debug,
615
- allow_lm_batch=allow_lm_batch,
616
- lm_batch_chunk_size=lm_batch_chunk_size,
617
  )
618
 
 
 
 
 
 
 
 
 
619
  # Call new API
620
- result = generate_music(dit_handler, llm_handler, config)
621
 
622
  # Determine which codes to update in UI
623
- if config.allow_lm_batch and result.lm_metadata:
624
  # Batch mode: extract codes from metadata if available
625
- lm_codes_list = result.lm_metadata.get('audio_codes_list', [])
626
  updated_audio_codes = lm_codes_list[0] if lm_codes_list else text2music_audio_code_string
627
  codes_outputs = (lm_codes_list + [""] * 8)[:8]
628
  else:
629
  # Single mode
630
- lm_codes = result.lm_metadata.get('audio_codes', '') if result.lm_metadata else ''
631
  updated_audio_codes = lm_codes if lm_codes else text2music_audio_code_string
632
  codes_outputs = [""] * 8
633
 
634
  # Prepare audio outputs (up to 8)
635
- audio_outputs = (result.audio_paths + [None] * 8)[:8]
636
 
637
  # Return tuple for Gradio UI (28 elements)
638
  return (
@@ -644,16 +714,16 @@ def generate(
644
  audio_outputs[5], # generated_audio_6
645
  audio_outputs[6], # generated_audio_7
646
  audio_outputs[7], # generated_audio_8
647
- result.audio_paths, # generated_audio_batch
648
  result.generation_info,
649
  result.status_message,
650
- result.seed_value,
651
- result.align_score_1,
652
- result.align_text_1,
653
- result.align_plot_1,
654
- result.align_score_2,
655
- result.align_text_2,
656
- result.align_plot_2,
657
  updated_audio_codes, # Update main audio codes in UI
658
  codes_outputs[0], # text2music_audio_code_string_1
659
  codes_outputs[1], # text2music_audio_code_string_2
@@ -663,266 +733,8 @@ def generate(
663
  codes_outputs[5], # text2music_audio_code_string_6
664
  codes_outputs[6], # text2music_audio_code_string_7
665
  codes_outputs[7], # text2music_audio_code_string_8
666
- result.lm_metadata, # Store metadata for "Send to src audio" buttons
667
  is_format_caption, # Keep is_format_caption unchanged
668
  )
669
 
670
 
671
- # ============================================================================
672
- # TESTING & EXAMPLES
673
- # ============================================================================
674
-
675
- if __name__ == "__main__":
676
- """
677
- Test suite for the inference API.
678
- Demonstrates various usage scenarios and validates functionality.
679
-
680
- Usage:
681
- python -m acestep.inference
682
- """
683
-
684
- import os
685
- import json
686
- from acestep.handler import AceStepHandler
687
- from acestep.llm_inference import LLMHandler
688
-
689
- # Initialize paths
690
- project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
691
- checkpoint_dir = os.path.join(project_root, "checkpoints")
692
-
693
- print("=" * 80)
694
- print("ACE-Step Inference API Test Suite")
695
- print("=" * 80)
696
-
697
- # ========================================================================
698
- # Initialize Handlers
699
- # ========================================================================
700
- print("\n[1/3] Initializing handlers...")
701
- dit_handler = AceStepHandler(save_root="./")
702
- llm_handler = LLMHandler()
703
-
704
- try:
705
- # Initialize DiT handler
706
- print(" - Initializing DiT model...")
707
- status_dit, success_dit = dit_handler.initialize_service(
708
- project_root=project_root,
709
- config_path="acestep-v15-turbo-rl",
710
- device="cuda",
711
- )
712
- if not success_dit:
713
- print(f" ❌ DiT initialization failed: {status_dit}")
714
- exit(1)
715
- print(f" ✓ DiT model initialized successfully")
716
-
717
- # Initialize LLM handler
718
- print(" - Initializing 5Hz LM model...")
719
- status_llm, success_llm = llm_handler.initialize(
720
- checkpoint_dir=checkpoint_dir,
721
- lm_model_path="acestep-5Hz-lm-0.6B-v3",
722
- backend="vllm",
723
- device="cuda",
724
- )
725
- if success_llm:
726
- print(f" ✓ LM model initialized successfully")
727
- else:
728
- print(f" ⚠ LM initialization failed (will skip LM tests): {status_llm}")
729
-
730
- except Exception as e:
731
- print(f" ❌ Initialization error: {e}")
732
- exit(1)
733
-
734
- # ========================================================================
735
- # Helper Functions
736
- # ========================================================================
737
- def load_example_config(example_file: str) -> GenerationConfig:
738
- """Load configuration from an example JSON file."""
739
- try:
740
- with open(example_file, 'r', encoding='utf-8') as f:
741
- data = json.load(f)
742
-
743
- # Convert example format to GenerationConfig
744
- # Handle time signature format (example uses "4" instead of "4/4")
745
- time_sig = data.get('timesignature', '')
746
- if time_sig and '/' not in time_sig:
747
- time_sig = f"{time_sig}/4" # Default to /4 if only numerator given
748
-
749
- config = GenerationConfig(
750
- caption=data.get('caption', ''),
751
- lyrics=data.get('lyrics', ''),
752
- bpm=data.get('bpm'),
753
- key_scale=data.get('keyscale', ''),
754
- time_signature=time_sig,
755
- vocal_language=data.get('language', 'unknown'),
756
- audio_duration=data.get('duration'),
757
- use_llm_thinking=data.get('think', False),
758
- batch_size=data.get('batch_size', 1),
759
- inference_steps=data.get('inference_steps', 8),
760
- )
761
- return config
762
-
763
- except Exception as e:
764
- print(f" ⚠ Failed to load example file: {e}")
765
- return None
766
-
767
- # ========================================================================
768
- # Test Cases
769
- # ========================================================================
770
- test_results = []
771
-
772
- def run_test(test_name: str, config: GenerationConfig, expected_outputs: int = 1):
773
- """Run a single test case and collect results."""
774
- print(f"\n{'=' * 80}")
775
- print(f"Test: {test_name}")
776
- print(f"{'=' * 80}")
777
-
778
- # Display configuration
779
- print("\nConfiguration:")
780
- print(f" Task Type: {config.task_type}")
781
- print(f" Caption: {config.caption[:60]}..." if len(config.caption) > 60 else f" Caption: {config.caption}")
782
- if config.lyrics:
783
- print(f" Lyrics: {config.lyrics[:60]}..." if len(config.lyrics) > 60 else f" Lyrics: {config.lyrics}")
784
- if config.bpm:
785
- print(f" BPM: {config.bpm}")
786
- if config.key_scale:
787
- print(f" Key Scale: {config.key_scale}")
788
- if config.time_signature:
789
- print(f" Time Signature: {config.time_signature}")
790
- if config.audio_duration:
791
- print(f" Duration: {config.audio_duration}s")
792
- print(f" Batch Size: {config.batch_size}")
793
- print(f" Inference Steps: {config.inference_steps}")
794
- print(f" Use LLM Thinking: {config.use_llm_thinking}")
795
-
796
- # Run generation
797
- print("\nGenerating...")
798
- import time
799
- start_time = time.time()
800
-
801
- result = generate_music(dit_handler, llm_handler, config)
802
-
803
- elapsed_time = time.time() - start_time
804
-
805
- # Display results
806
- print("\nResults:")
807
- print(f" Success: {'✓' if result.success else '✗'}")
808
-
809
- if result.success:
810
- print(f" Generated Files: {len(result.audio_paths)}")
811
- for i, path in enumerate(result.audio_paths, 1):
812
- if os.path.exists(path):
813
- file_size = os.path.getsize(path) / (1024 * 1024) # MB
814
- print(f" [{i}] {os.path.basename(path)} ({file_size:.2f} MB)")
815
- else:
816
- print(f" [{i}] {os.path.basename(path)} (file not found)")
817
-
818
- print(f" Seed: {result.seed_value}")
819
- print(f" Generation Time: {elapsed_time:.2f}s")
820
-
821
- # Display LM metadata if available
822
- if result.lm_metadata:
823
- print(f"\n LM-Generated Metadata:")
824
- for key, value in result.lm_metadata.items():
825
- if key not in ['audio_codes', 'audio_codes_list']: # Skip large code strings
826
- print(f" {key}: {value}")
827
-
828
- # Validate outputs
829
- if len(result.audio_paths) != expected_outputs:
830
- print(f" ⚠ Warning: Expected {expected_outputs} outputs, got {len(result.audio_paths)}")
831
- success = False
832
- else:
833
- success = True
834
-
835
- else:
836
- print(f" Error: {result.error}")
837
- success = False
838
-
839
- # Store test result
840
- test_results.append({
841
- "test_name": test_name,
842
- "success": success,
843
- "generation_success": result.success,
844
- "num_outputs": len(result.audio_paths) if result.success else 0,
845
- "expected_outputs": expected_outputs,
846
- "elapsed_time": elapsed_time,
847
- "error": result.error if not result.success else None,
848
- })
849
-
850
- return result
851
-
852
- # ========================================================================
853
- # Test: Production Example (from examples directory)
854
- # ========================================================================
855
- print("\n[2/3] Running Test...")
856
-
857
- # Load production example (J-Rock song from examples/text2music/example_05.json)
858
- example_file = os.path.join(project_root, "examples", "text2music", "example_05.json")
859
-
860
- if not os.path.exists(example_file):
861
- print(f"\n ❌ Example file not found: {example_file}")
862
- print(" Please ensure the examples directory exists.")
863
- exit(1)
864
-
865
- print(f" Loading example: {os.path.basename(example_file)}")
866
- config = load_example_config(example_file)
867
-
868
- if not config:
869
- print(" ❌ Failed to load example configuration")
870
- exit(1)
871
-
872
- # Reduce duration for faster testing (original is 200s)
873
- print(f" Original duration: {config.audio_duration}s")
874
- config.audio_duration = 30
875
- config.use_random_seed = False
876
- config.seed = 42
877
- print(f" Test duration: {config.audio_duration}s (reduced for testing)")
878
-
879
- run_test("Production Example (J-Rock Song)", config, expected_outputs=1)
880
-
881
- # ========================================================================
882
- # Test Summary
883
- # ========================================================================
884
- print("\n[3/3] Test Summary")
885
- print("=" * 80)
886
-
887
- if len(test_results) == 0:
888
- print("No tests were run.")
889
- exit(1)
890
-
891
- result = test_results[0]
892
-
893
- print(f"\nTest: {result['test_name']}")
894
- print(f"Status: {'✓ PASS' if result['success'] else '✗ FAIL'}")
895
- print(f"Generation: {'Success' if result['generation_success'] else 'Failed'}")
896
- print(f"Outputs: {result['num_outputs']}/{result['expected_outputs']}")
897
- print(f"Time: {result['elapsed_time']:.2f}s")
898
-
899
- if result["error"]:
900
- print(f"Error: {result['error']}")
901
-
902
- # Save test results to JSON
903
- results_file = os.path.join(project_root, "test_results.json")
904
- try:
905
- with open(results_file, "w") as f:
906
- json.dump({
907
- "test_name": result['test_name'],
908
- "success": result['success'],
909
- "generation_success": result['generation_success'],
910
- "num_outputs": result['num_outputs'],
911
- "expected_outputs": result['expected_outputs'],
912
- "elapsed_time": result['elapsed_time'],
913
- "error": result['error'],
914
- }, f, indent=2)
915
- print(f"\n✓ Test results saved to: {results_file}")
916
- except Exception as e:
917
- print(f"\n⚠ Failed to save test results: {e}")
918
-
919
- # Exit with appropriate code
920
- print("\n" + "=" * 80)
921
- if result['success']:
922
- print("Test passed! ✓")
923
- print("=" * 80)
924
- exit(0)
925
- else:
926
- print("Test failed! ✗")
927
- print("=" * 80)
928
- exit(1)
 
7
  """
8
 
9
  import math
10
+ import os
11
+ import tempfile
12
  from typing import Optional, Union, List, Dict, Any, Tuple
13
  from dataclasses import dataclass, field, asdict
14
  from loguru import logger
15
+
16
+ from acestep.audio_utils import AudioSaver, generate_uuid_from_params
17
 
18
 
19
  @dataclass
20
+ class GenerationParams:
21
+ """Configuration for music generation parameters.
22
 
23
  Attributes:
24
  # Text Inputs
25
+ caption: A short text prompt describing the desired music (main prompt). < 512 characters
26
+ lyrics: Lyrics for the music. Use "[Instrumental]" for instrumental songs. < 4096 characters
27
+ instrumental: If True, generate instrumental music regardless of lyrics.
28
 
29
  # Music Metadata
30
+ bpm: BPM (beats per minute), e.g., 120. Set to None for automatic estimation. 30 ~ 300
31
+ keyscale: Musical key (e.g., "C Major", "Am"). Leave empty for auto-detection. A-G, #/♭, major/minor
32
+ timesignature: Time signature (2 for '2/4', 3 for '3/4', 4 for '4/4', 6 for '6/8'). Leave empty for auto-detection.
33
+ vocal_language: Language code for vocals, e.g., "en", "zh", "ja", or "unknown". see acestep/constants.py:VALID_LANGUAGES
34
+ duration: Target audio length in seconds. If <0 or None, model chooses automatically. 10 ~ 600
35
 
36
  # Generation Parameters
37
+ inference_steps: Number of diffusion steps (e.g., 8 for turbo, 32~100 for base model).
38
+ guidance_scale: CFG (classifier-free guidance) strength. Higher means following the prompt more strictly. Only supported for non-turbo models.
39
+ seed: Integer seed for reproducibility. -1 means use random seed each time.
 
 
40
 
41
  # Advanced DiT Parameters
42
+ use_adg: Whether to use Adaptive Dual Guidance (only works for base model).
43
+ cfg_interval_start: Start ratio (0.0~1.0) to apply CFG.
44
+ cfg_interval_end: End ratio (0.0~1.0) to apply CFG.
 
45
 
46
  # Task-Specific Parameters
47
+ task_type: Type of generation task. One of: "text2music", "cover", "repaint", "lego", "extract", "complete".
48
+ reference_audio: Path to a reference audio file for style transfer or cover tasks.
49
+ src_audio: Path to a source audio file for audio-to-audio tasks.
50
+ audio_codes: Audio semantic codes as a string (advanced use, for code-control generation).
51
+ repainting_start: For repaint/lego tasks: start time in seconds for region to repaint.
52
+ repainting_end: For repaint/lego tasks: end time in seconds for region to repaint (-1 for until end).
53
+ audio_cover_strength: Strength of reference audio/codes influence (range 0.0~1.0). Set a smaller value (e.g., 0.2) for style transfer tasks.
54
+ instruction: Optional task instruction prompt. If empty, auto-generated by system.
55
 
56
+ # 5Hz Language Model Parameters for CoT reasoning
57
+ thinking: If True, enable 5Hz Language Model "Chain-of-Thought" reasoning for semantic/music metadata and codes.
58
+ lm_temperature: Sampling temperature for the LLM (0.0~2.0). Higher = more creative/varied results.
59
+ lm_cfg_scale: Classifier-free guidance scale for the LLM.
60
+ lm_top_k: LLM top-k sampling (0 = disabled).
61
+ lm_top_p: LLM top-p nucleus sampling (1.0 = disabled).
62
+ lm_negative_prompt: Negative prompt to use for LLM (for control).
63
+ use_cot_metas: Whether to let LLM generate music metadata via CoT reasoning.
64
+ use_cot_caption: Whether to let LLM rewrite or format the input caption via CoT reasoning.
65
+ use_cot_language: Whether to let LLM detect vocal language via CoT.
 
 
 
 
 
 
66
  """
67
+ # Required Inputs
68
+ task_type: str = "text2music"
69
+ instruction: str = "Fill the audio semantic mask based on the given conditions:"
70
+
71
+ # Audio Uploads
72
+ reference_audio: Optional[str] = None
73
+ src_audio: Optional[str] = None
74
+
75
+ # LM Codes Hints
76
+ audio_codes: str = ""
77
 
78
  # Text Inputs
79
  caption: str = ""
80
  lyrics: str = ""
81
+ instrumental: bool = False
82
 
83
+ # Metadata
 
 
 
84
  vocal_language: str = "unknown"
85
+ bpm: Optional[int] = None
86
+ keyscale: str = ""
87
+ timesignature: str = ""
88
+ duration: float = -1.0
89
+
90
+ # Advanced Settings
91
  inference_steps: int = 8
 
 
92
  seed: int = -1
93
+ guidance_scale: float = 7.0
 
 
94
  use_adg: bool = False
95
  cfg_interval_start: float = 0.0
96
  cfg_interval_end: float = 1.0
97
+
 
 
 
 
 
 
98
  repainting_start: float = 0.0
99
  repainting_end: float = -1
100
  audio_cover_strength: float = 1.0
 
101
 
102
  # 5Hz Language Model Parameters
103
+ thinking: bool = True
104
  lm_temperature: float = 0.85
105
  lm_cfg_scale: float = 2.0
106
  lm_top_k: int = 0
 
109
  use_cot_metas: bool = True
110
  use_cot_caption: bool = True
111
  use_cot_language: bool = True
 
 
112
 
113
+ def to_dict(self) -> Dict[str, Any]:
114
+ """Convert config to dictionary for JSON serialization."""
115
+ return asdict(self)
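When the metadata fields are left at their defaults (bpm=None, keyscale="", timesignature="", duration=-1.0) and thinking/use_cot_metas are enabled, the 5Hz LM estimates them via CoT reasoning. A minimal illustrative construction (prompt text is made up):

params = GenerationParams(
    caption="dreamy shoegaze with washed-out guitars",
    lyrics="[Instrumental]",
    # bpm, keyscale, timesignature, duration left at defaults -> estimated by the LM CoT
)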
116
 
117
 
118
+ @dataclass
119
+ class GenerationConfig:
120
+ """Configuration for music generation.
121
+
122
+ Attributes:
123
+ batch_size: Number of audio samples to generate
124
+ allow_lm_batch: Whether to allow batch processing in LM
125
+ use_random_seed: Whether to use random seed
126
+ seed: Seed(s) for batch generation. Can be:
127
+ - None: Use random seeds (when use_random_seed=True) or params.seed (when use_random_seed=False)
128
+ - List[int]: List of seeds, will be padded with random seeds if fewer than batch_size
129
+ - int: Single seed value (will be converted to list and padded)
130
+ lm_batch_chunk_size: Batch chunk size for LM processing
131
+ is_format_caption: Whether to format caption
132
+ constrained_decoding_debug: Whether to enable constrained decoding debug
133
+ audio_format: Output audio format, one of "mp3", "wav", "flac". Default: "flac"
134
+ """
135
+ batch_size: int = 2
136
+ allow_lm_batch: bool = False
137
+ use_random_seed: bool = True
138
+ seed: Optional[Union[int, List[int]]] = None
139
+ lm_batch_chunk_size: int = 8
140
+ is_format_caption: bool = False
141
+ use_constrained_decoding: bool = True
142
+ constrained_decoding_debug: bool = False
143
+ audio_format: str = "flac" # Default to FLAC for fast saving
144
+
145
  @dataclass
146
  class GenerationResult:
147
  """Result of music generation.
148
 
149
  Attributes:
150
  # Audio Outputs
151
+ audios: List of audio dicts, each with "path", "tensor", "key", "sample_rate", and "params"
 
 
 
 
152
  generation_info: Markdown-formatted generation information
153
  status_message: Status message from generation
154
+ extra_outputs: Extra outputs from generation (DiT extras such as latents and masks, plus "lm_metadata")
155
  success: Whether generation completed successfully
156
  error: Error message if generation failed
157
  """
158
 
159
  # Audio Outputs
160
+ audios: List[Dict[str, Any]] = field(default_factory=list)
 
 
 
161
  # Generation Information
162
  generation_info: str = ""
163
  status_message: str = ""
164
+ extra_outputs: Dict[str, Any] = field(default_factory=dict)
165
  # Success Status
166
  success: bool = True
167
  error: Optional[str] = None
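Each entry of audios is a plain dict; a sketch of the shape callers can rely on (field names follow the construction code later in generate_music):

audio = result.audios[0]
audio["path"]         # saved file path; "" when save_dir is None or saving failed
audio["tensor"]       # torch.Tensor [channels, samples], CPU, float32
audio["key"]          # deterministic UUID derived from the generation parameters
audio["sample_rate"]  # e.g. 48000
audio["params"]       # per-sample copy of GenerationParams.to_dict(), with seed and audio_codes filled in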
 
174
  def generate_music(
175
  dit_handler,
176
  llm_handler,
177
+ params: GenerationParams,
178
  config: GenerationConfig,
179
+ save_dir: Optional[str] = None,
180
  ) -> GenerationResult:
181
  """Generate music using ACE-Step model with optional LM reasoning.
182
 
 
 
 
 
183
  Args:
184
  dit_handler: Initialized DiT model handler (AceStepHandler instance)
185
  llm_handler: Initialized LLM handler (LLMHandler instance)
186
+ params: Generation parameters (GenerationParams instance)
187
  config: Generation configuration (GenerationConfig instance)
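+ save_dir: Optional directory where generated audio files are saved (created if needed); if None, no files are written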
188
 
189
  Returns:
190
+ GenerationResult with generated audio files and metadata
191
  """
 
192
  try:
193
  # Phase 1: LM-based metadata and code generation (if enabled)
194
+ audio_code_string_to_use = params.audio_codes
195
  lm_generated_metadata = None
 
196
  lm_generated_audio_codes_list = []
197
 
198
  # Extract mutable copies of metadata (will be updated by LM if needed)
199
+ bpm = params.bpm
200
+ key_scale = params.keyscale
201
+ time_signature = params.timesignature
202
+ audio_duration = params.duration
203
 
204
+ # Determine if we need to generate audio codes
205
+ # If user has provided audio_codes, we don't need to generate them
206
+ # Otherwise, check if we need audio codes (lm_dit mode) or just metas (dit mode)
207
+ user_provided_audio_codes = bool(params.audio_codes and str(params.audio_codes).strip())
208
+
209
+ # Determine infer_type: use "llm_dit" if we need audio codes, "dit" if only metas needed
210
+ # For now, we use "llm_dit" if batch mode or if user hasn't provided codes
211
+ # Use "dit" if user has provided codes (only need metas) or if explicitly only need metas
212
+ # Note: This logic can be refined based on specific requirements
213
+ need_audio_codes = not user_provided_audio_codes
214
 
215
+ # Determine if we should use chunk-based LM generation (always use chunks for consistency)
216
+ # Determine actual batch size for chunk processing
217
+ actual_batch_size = config.batch_size if config.batch_size is not None else 1
218
+
219
+ # Prepare seeds for batch generation
220
+ # Use config.seed if provided, otherwise fallback to params.seed
221
+ # Convert config.seed (None, int, or List[int]) to format that prepare_seeds accepts
222
+ seed_for_generation = params.seed # Default fallback
223
+ if config.seed is not None:
224
+ if isinstance(config.seed, list):
225
+ # Convert List[int] to comma-separated string
226
+ seed_for_generation = ",".join(str(s) for s in config.seed)
227
+ elif isinstance(config.seed, int):
228
+ # Single int seed
229
+ seed_for_generation = config.seed
230
+
231
+ # Use dit_handler.prepare_seeds to handle seed list generation and padding
232
+ # This will handle all the logic: padding with random seeds if needed, etc.
233
+ actual_seed_list, _ = dit_handler.prepare_seeds(
234
+ actual_batch_size, seed_for_generation, config.use_random_seed
235
+ )
236
+
237
  # LM-based Chain-of-Thought reasoning
238
+ if params.thinking and llm_handler.llm_initialized and params.use_cot_metas:
239
  # Convert sampling parameters
240
+ top_k_value = None if params.lm_top_k == 0 else int(params.lm_top_k)
241
+ top_p_value = None if params.lm_top_p >= 1.0 else params.lm_top_p
242
 
243
  # Build user_metadata from user-provided values
244
  user_metadata = {}
 
270
 
271
  user_metadata_to_pass = user_metadata if user_metadata else None
272
 
273
+ # Determine infer_type based on whether we need audio codes
274
+ # - "llm_dit": generates both metas and audio codes (two-phase internally)
275
+ # - "dit": generates only metas (single phase)
276
+ infer_type = "llm_dit" if need_audio_codes else "dit"
277
+
278
+ # Use chunk size from config, or default to batch_size if not set
279
+ max_inference_batch_size = int(config.lm_batch_chunk_size) if config.lm_batch_chunk_size > 0 else actual_batch_size
280
+ num_chunks = math.ceil(actual_batch_size / max_inference_batch_size)
281
+
282
+ all_metadata_list = []
283
+ all_audio_codes_list = []
284
+
285
+ for chunk_idx in range(num_chunks):
286
+ chunk_start = chunk_idx * max_inference_batch_size
287
+ chunk_end = min(chunk_start + max_inference_batch_size, actual_batch_size)
288
+ chunk_size = chunk_end - chunk_start
289
+ chunk_seeds = actual_seed_list[chunk_start:chunk_end] if chunk_start < len(actual_seed_list) else None
290
 
291
+ logger.info(
292
+ f"LM chunk {chunk_idx+1}/{num_chunks} (infer_type={infer_type}) "
293
+ f"(size: {chunk_size}, seeds: {chunk_seeds})"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  )
 
 
295
 
296
+ # Use the determined infer_type
297
+ # - "llm_dit" will internally run two phases (metas + codes)
298
+ # - "dit" will only run phase 1 (metas only)
299
+ result = llm_handler.generate_with_stop_condition(
300
+ caption=params.caption or "",
301
+ lyrics=params.lyrics or "",
302
+ infer_type=infer_type,
303
+ temperature=params.lm_temperature,
304
+ cfg_scale=params.lm_cfg_scale,
305
+ negative_prompt=params.lm_negative_prompt,
306
  top_k=top_k_value,
307
  top_p=top_p_value,
308
  user_metadata=user_metadata_to_pass,
309
+ use_cot_caption=params.use_cot_caption,
310
+ use_cot_language=params.use_cot_language,
311
  is_format_caption=config.is_format_caption,
312
+ use_constrained_decoding=config.use_constrained_decoding,
313
  constrained_decoding_debug=config.constrained_decoding_debug,
314
+ batch_size=chunk_size,
315
+ seeds=chunk_seeds,
316
  )
 
 
317
 
318
+ if chunk_size > 1:
319
+ metadata_list, audio_codes_list, status = result
320
+ all_metadata_list.extend(metadata_list)
321
+ all_audio_codes_list.extend(audio_codes_list)
322
+ else:
323
+ metadata, audio_codes, status = result
324
+ all_metadata_list.append(metadata)
325
+ all_audio_codes_list.append(audio_codes)
326
+
327
+ lm_generated_metadata = all_metadata_list[0] if all_metadata_list else None
328
+ lm_generated_audio_codes_list = all_audio_codes_list
329
+
330
+ # Set audio_code_string_to_use based on infer_type
331
+ if infer_type == "llm_dit":
332
+ # If batch mode, use list; otherwise use single string
333
+ if actual_batch_size > 1:
334
+ audio_code_string_to_use = all_audio_codes_list
335
+ else:
336
+ audio_code_string_to_use = all_audio_codes_list[0] if all_audio_codes_list else ""
337
+ else:
338
+ # For "dit" mode, keep user-provided codes or empty
339
+ audio_code_string_to_use = params.audio_codes
340
+
341
+ # Update metadata from LM if not provided by user
342
+ if lm_generated_metadata:
343
+ bpm, key_scale, time_signature, audio_duration = _update_metadata_from_lm(
344
+ lm_generated_metadata, bpm, key_scale, time_signature, audio_duration
345
+ )
346
+
347
 
348
  # Phase 2: DiT music generation
349
+ # Use seed_for_generation (from config.seed or params.seed) instead of params.seed for actual generation
350
  result = dit_handler.generate_music(
351
+ captions=params.caption,
352
+ lyrics=params.lyrics,
353
  bpm=bpm,
354
  key_scale=key_scale,
355
  time_signature=time_signature,
356
+ vocal_language=params.vocal_language,
357
+ inference_steps=params.inference_steps,
358
+ guidance_scale=params.guidance_scale,
359
  use_random_seed=config.use_random_seed,
360
+ seed=seed_for_generation, # Use config.seed (or params.seed fallback) instead of params.seed directly
361
+ reference_audio=params.reference_audio,
362
  audio_duration=audio_duration,
363
+ batch_size=config.batch_size if config.batch_size is not None else 1,
364
+ src_audio=params.src_audio,
365
  audio_code_string=audio_code_string_to_use,
366
+ repainting_start=params.repainting_start,
367
+ repainting_end=params.repainting_end,
368
+ instruction=params.instruction,
369
+ audio_cover_strength=params.audio_cover_strength,
370
+ task_type=params.task_type,
371
+ use_adg=params.use_adg,
372
+ cfg_interval_start=params.cfg_interval_start,
373
+ cfg_interval_end=params.cfg_interval_end,
 
 
374
  )
375
 
376
+ # Check if generation failed
377
+ if not result.get("success", False):
378
+ return GenerationResult(
379
+ audios=[],
380
+ generation_info=result.get("generation_info", ""),
381
+ status_message=result.get("status_message", ""),
382
+ extra_outputs={},
383
+ success=False,
384
+ error=result.get("error"),
385
+ )
386
+
387
+ # Extract results from dit_handler.generate_music dict
388
+ dit_audios = result.get("audios", [])
389
+ generation_info = result.get("generation_info", "")
390
+ status_message = result.get("status_message", "")
391
+ dit_extra_outputs = result.get("extra_outputs", {})
392
 
393
  # Append LM metadata to generation info
394
  if lm_generated_metadata:
395
  generation_info = _append_lm_metadata_to_info(generation_info, lm_generated_metadata)
396
 
397
+ # Use the seed list already prepared above (from config.seed or params.seed fallback)
398
+ # actual_seed_list was computed earlier using dit_handler.prepare_seeds
399
+ seed_list = actual_seed_list
400
+
401
+ # Get base params dictionary
402
+ base_params_dict = params.to_dict()
403
+
404
+ # Save audio files using AudioSaver (format from config)
405
+ audio_format = config.audio_format if config.audio_format else "flac"
406
+ audio_saver = AudioSaver(default_format=audio_format)
407
+
408
+ # Use handler's temp_dir for saving files
409
+ if save_dir is not None:
410
+ os.makedirs(save_dir, exist_ok=True)
411
+
412
+ # Build audios list for GenerationResult with params and save files
413
+ # Audio saving and UUID generation handled here, outside of handler
414
+ audios = []
415
+ for idx, dit_audio in enumerate(dit_audios):
416
+ # Create a copy of params dict for this audio
417
+ audio_params = base_params_dict.copy()
418
+
419
+ # Update audio-specific values
420
+ audio_params["seed"] = seed_list[idx] if idx < len(seed_list) else None
421
+
422
+ # Add audio codes if batch mode
423
+ if lm_generated_audio_codes_list and idx < len(lm_generated_audio_codes_list):
424
+ audio_params["audio_codes"] = lm_generated_audio_codes_list[idx]
425
+
426
+ # Get audio tensor and metadata
427
+ audio_tensor = dit_audio.get("tensor")
428
+ sample_rate = dit_audio.get("sample_rate", 48000)
429
+
430
+ # Generate UUID for this audio (moved from handler)
431
+ batch_seed = seed_list[idx] if idx < len(seed_list) else seed_list[0] if seed_list else -1
432
+ audio_code_str = lm_generated_audio_codes_list[idx] if (lm_generated_audio_codes_list and idx < len(lm_generated_audio_codes_list)) else audio_code_string_to_use
433
+ if isinstance(audio_code_str, list):
434
+ audio_code_str = audio_code_str[idx] if idx < len(audio_code_str) else ""
435
+
436
+ audio_key = generate_uuid_from_params(
437
+ captions=params.caption,
438
+ lyrics=params.lyrics,
439
+ bpm=bpm,
440
+ key_scale=key_scale,
441
+ time_signature=time_signature,
442
+ vocal_language=params.vocal_language,
443
+ inference_steps=params.inference_steps,
444
+ guidance_scale=params.guidance_scale,
445
+ seed=batch_seed,
446
+ audio_duration=audio_duration,
447
+ audio_code_string=audio_code_str,
448
+ repainting_start=params.repainting_start,
449
+ repainting_end=params.repainting_end,
450
+ instruction=params.instruction,
451
+ audio_cover_strength=params.audio_cover_strength,
452
+ task_type=params.task_type,
453
+ use_adg=params.use_adg,
454
+ cfg_interval_start=params.cfg_interval_start,
455
+ cfg_interval_end=params.cfg_interval_end,
456
+ audio_format=audio_format,
457
+ reference_audio=params.reference_audio,
458
+ src_audio=params.src_audio,
459
+ batch_index=idx,
460
+ )
461
+
462
+ # Save audio file (handled outside handler)
463
+ audio_path = None
464
+ if audio_tensor is not None and save_dir is not None:
465
+ try:
466
+ audio_file = os.path.join(save_dir, f"{audio_key}.{audio_format}")
467
+ audio_path = audio_saver.save_audio(
468
+ audio_tensor,
469
+ audio_file,
470
+ sample_rate=sample_rate,
471
+ format=audio_format,
472
+ channels_first=True
473
+ )
474
+ except Exception as e:
475
+ logger.error(f"[generate_music] Failed to save audio file: {e}")
476
+ audio_path = "" # Fallback to empty path
477
+
478
+ audio_dict = {
479
+ "path": audio_path or "", # File path (saved here, not in handler)
480
+ "tensor": audio_tensor, # Audio tensor [channels, samples], CPU, float32
481
+ "key": audio_key,
482
+ "sample_rate": sample_rate,
483
+ "params": audio_params,
484
+ }
485
+
486
+ audios.append(audio_dict)
487
+
488
+ # Merge extra_outputs: include dit_extra_outputs (latents, masks) and add LM metadata
489
+ extra_outputs = dit_extra_outputs.copy()
490
+ extra_outputs["lm_metadata"] = lm_generated_metadata
491
+
492
+ # Create and return GenerationResult
493
  return GenerationResult(
494
+ audios=audios,
 
 
495
  generation_info=generation_info,
496
  status_message=status_message,
497
+ extra_outputs=extra_outputs,
 
 
 
 
 
 
 
498
  success=True,
499
  error=None,
500
  )
 
502
  except Exception as e:
503
  logger.exception("Music generation failed")
504
  return GenerationResult(
505
+ audios=[],
 
506
  generation_info=f"❌ Generation failed: {str(e)}",
507
  status_message=f"Error: {str(e)}",
508
+ extra_outputs={},
509
+ success=False,
510
+ error=str(e),
511
  )
512
 
513
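For reference, a minimal end-to-end sketch of the new API. It assumes dit_handler and llm_handler are already initialized and that this module is importable as acestep.inference (as in the removed test suite above); prompt, seed, and path values are illustrative:

from acestep.inference import GenerationParams, GenerationConfig, generate_music

params = GenerationParams(
    caption="uplifting synthpop with airy female vocals",
    lyrics="[Instrumental]",
    duration=30,
)
config = GenerationConfig(batch_size=1, use_random_seed=False, seed=42, audio_format="flac")
result = generate_music(dit_handler, llm_handler, params, config, save_dir="./outputs")
if result.success:
    for audio in result.audios:
        print(audio["key"], audio["path"], audio["sample_rate"])
else:
    print(result.error)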
 
 
577
  # LEGACY GRADIO UI COMPATIBILITY LAYER
578
  # ============================================================================
579
 
580
+ def generate_for_gradio(
581
  dit_handler,
582
  llm_handler,
583
  captions,
 
627
  Tuple with 28 elements for Gradio UI component updates
628
  """
629
 
630
+ # Convert legacy parameters to GenerationParams and GenerationConfig
631
+ params = GenerationParams(
632
  caption=captions,
633
  lyrics=lyrics,
634
  bpm=bpm,
635
+ keyscale=key_scale,
636
+ timesignature=time_signature,
637
  vocal_language=vocal_language,
638
+ audio_codes=text2music_audio_code_string,
639
+ duration=audio_duration,
640
  inference_steps=inference_steps,
641
  guidance_scale=guidance_scale,
 
642
  seed=seed,
 
643
  use_adg=use_adg,
644
  cfg_interval_start=cfg_interval_start,
645
  cfg_interval_end=cfg_interval_end,
 
647
  task_type=task_type,
648
  reference_audio=reference_audio,
649
  src_audio=src_audio,
 
650
  repainting_start=repainting_start,
651
  repainting_end=repainting_end,
652
  audio_cover_strength=audio_cover_strength,
653
  instruction=instruction_display_gen,
654
+ thinking=think_checkbox,
655
  lm_temperature=lm_temperature,
656
  lm_cfg_scale=lm_cfg_scale,
657
  lm_top_k=lm_top_k,
 
660
  use_cot_metas=use_cot_metas,
661
  use_cot_caption=use_cot_caption,
662
  use_cot_language=use_cot_language,
 
 
 
 
663
  )
664
 
665
+ config = GenerationConfig(batch_size=1)
666
+ config.batch_size = batch_size_input
667
+ config.use_random_seed = random_seed_checkbox
668
+ config.allow_lm_batch = allow_lm_batch
669
+ config.lm_batch_chunk_size = lm_batch_chunk_size
670
+ config.is_format_caption = is_format_caption
671
+ config.constrained_decoding_debug = constrained_decoding_debug
672
+
673
  # Call new API
674
+ result = generate_music(dit_handler, llm_handler, params, config)
675
+
676
+ # Extract audio paths from result.audios
677
+ audio_paths = [audio["path"] for audio in result.audios]
678
+
679
+ # Extract extra outputs
680
+ extra_outputs = result.extra_outputs
681
+ seed_value = extra_outputs.get("seed_value", "")
682
+ lm_metadata = extra_outputs.get("lm_metadata", None)
683
+
684
+ # Legacy alignment fields (no longer used, set to empty/None)
685
+ align_score_1 = ""
686
+ align_text_1 = ""
687
+ align_plot_1 = None
688
+ align_score_2 = ""
689
+ align_text_2 = ""
690
+ align_plot_2 = None
691
 
692
  # Determine which codes to update in UI
693
+ if config.allow_lm_batch and lm_metadata:
694
  # Batch mode: extract codes from metadata if available
695
+ lm_codes_list = lm_metadata.get('audio_codes_list', [])
696
  updated_audio_codes = lm_codes_list[0] if lm_codes_list else text2music_audio_code_string
697
  codes_outputs = (lm_codes_list + [""] * 8)[:8]
698
  else:
699
  # Single mode
700
+ lm_codes = lm_metadata.get('audio_codes', '') if lm_metadata else ''
701
  updated_audio_codes = lm_codes if lm_codes else text2music_audio_code_string
702
  codes_outputs = [""] * 8
703
 
704
  # Prepare audio outputs (up to 8)
705
+ audio_outputs = (audio_paths + [None] * 8)[:8]
706
 
707
  # Return tuple for Gradio UI (28 elements)
708
  return (
 
714
  audio_outputs[5], # generated_audio_6
715
  audio_outputs[6], # generated_audio_7
716
  audio_outputs[7], # generated_audio_8
717
+ audio_paths, # generated_audio_batch
718
  result.generation_info,
719
  result.status_message,
720
+ seed_value,
721
+ align_score_1,
722
+ align_text_1,
723
+ align_plot_1,
724
+ align_score_2,
725
+ align_text_2,
726
+ align_plot_2,
727
  updated_audio_codes, # Update main audio codes in UI
728
  codes_outputs[0], # text2music_audio_code_string_1
729
  codes_outputs[1], # text2music_audio_code_string_2
 
733
  codes_outputs[5], # text2music_audio_code_string_6
734
  codes_outputs[6], # text2music_audio_code_string_7
735
  codes_outputs[7], # text2music_audio_code_string_8
736
+ lm_metadata, # Store metadata for "Send to src audio" buttons
737
  is_format_caption, # Keep is_format_caption unchanged
738
  )
739
 
740
acestep/llm_inference.py CHANGED
@@ -5,7 +5,7 @@ Handles all LM-related operations including initialization and generation
5
  import os
6
  import traceback
7
  import time
8
- from typing import Optional, Dict, Any, Tuple, List
9
  from contextlib import contextmanager
10
 
11
  import yaml
@@ -85,6 +85,189 @@ class LLMHandler:
85
  except Exception as e:
86
  return 0.9, False
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  def initialize(
89
  self,
90
  checkpoint_dir: str,
@@ -150,41 +333,21 @@ class LLMHandler:
150
  # vllm initialization failed, fallback to PyTorch
151
  if not self.llm_initialized:
152
  logger.warning("vllm initialization failed, falling back to PyTorch backend")
153
- try:
154
- self.llm = AutoModelForCausalLM.from_pretrained(full_lm_model_path, trust_remote_code=True)
155
- if not self.offload_to_cpu:
156
- self.llm = self.llm.to(device).to(self.dtype)
157
- else:
158
- self.llm = self.llm.to("cpu").to(self.dtype)
159
- self.llm.eval()
160
- self.llm_backend = "pt"
161
- self.llm_initialized = True
162
- logger.info("5Hz LM initialized successfully using PyTorch backend (fallback)")
163
- status_msg = f"✅ 5Hz LM initialized successfully (PyTorch fallback)\nModel: {full_lm_model_path}\nBackend: PyTorch"
164
- except Exception as e:
165
- return f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}", False
166
  # If vllm initialization succeeded, self.llm_initialized should already be True
167
  else:
168
  # Use PyTorch backend (pt)
169
- try:
170
- self.llm = AutoModelForCausalLM.from_pretrained(full_lm_model_path, trust_remote_code=True)
171
- if not self.offload_to_cpu:
172
- self.llm = self.llm.to(device).to(self.dtype)
173
- else:
174
- self.llm = self.llm.to("cpu").to(self.dtype)
175
- self.llm.eval()
176
- self.llm_backend = "pt"
177
- self.llm_initialized = True
178
- logger.info(f"5Hz LM initialized successfully using PyTorch backend on {device}")
179
- status_msg = f"✅ 5Hz LM initialized successfully\nModel: {full_lm_model_path}\nBackend: PyTorch\nDevice: {device}"
180
- except Exception as e:
181
- return f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}", False
182
 
183
  return status_msg, True
184
 
185
  except Exception as e:
186
- error_msg = f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
187
- return error_msg, False
188
 
189
  def _initialize_5hz_lm_vllm(self, model_path: str) -> str:
190
  """Initialize 5Hz LM model using vllm backend"""
@@ -230,12 +393,11 @@ class LLMHandler:
230
  return f"✅ 5Hz LM initialized successfully\nModel: {model_path}\nDevice: {device_name}\nGPU Memory Utilization: {gpu_memory_utilization:.2f}"
231
  except Exception as e:
232
  self.llm_initialized = False
233
- error_msg = f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
234
- return error_msg
235
 
236
- def _run_vllm_from_formatted(
237
  self,
238
- formatted_prompt: str,
239
  temperature: float,
240
  cfg_scale: float,
241
  negative_prompt: str,
@@ -244,7 +406,7 @@ class LLMHandler:
244
  repetition_penalty: float,
245
  use_constrained_decoding: bool = True,
246
  constrained_decoding_debug: bool = False,
247
- metadata_temperature: Optional[float] = 0.85,
248
  codes_temperature: Optional[float] = None,
249
  target_duration: Optional[float] = None,
250
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
@@ -256,37 +418,40 @@ class LLMHandler:
256
  caption: str = "",
257
  lyrics: str = "",
258
  cot_text: str = "",
259
- ) -> str:
260
- """Shared vllm path: accept prebuilt formatted prompt and return text."""
 
 
 
 
 
261
  from nanovllm import SamplingParams
262
 
 
 
 
 
263
  # Determine effective temperature for sampler
264
- use_phase_temperatures = metadata_temperature is not None or codes_temperature is not None
 
 
265
  effective_sampler_temp = 1.0 if use_phase_temperatures else temperature
266
 
267
- # Use shared constrained processor if enabled
268
- constrained_processor = None
269
- if use_constrained_decoding or use_phase_temperatures:
270
- # Reset processor state for new generation
271
- self.constrained_processor.reset()
272
-
273
- # Use shared processor, just update caption and settings
274
- self.constrained_processor.enabled = use_constrained_decoding
275
- self.constrained_processor.debug = constrained_decoding_debug
276
- self.constrained_processor.metadata_temperature = metadata_temperature if use_phase_temperatures else None
277
- self.constrained_processor.codes_temperature = codes_temperature if use_phase_temperatures else None
278
- self.constrained_processor.set_target_duration(target_duration)
279
- # Always call set_user_metadata to ensure previous settings are cleared if None
280
- self.constrained_processor.set_user_metadata(user_metadata)
281
- self.constrained_processor.set_stop_at_reasoning(stop_at_reasoning)
282
- # Set skip_caption and skip_language based on flags
283
- self.constrained_processor.set_skip_genres(skip_genres)
284
- self.constrained_processor.set_skip_caption(skip_caption)
285
- self.constrained_processor.set_skip_language(skip_language)
286
- # Set generation phase for phase-aware processing
287
- self.constrained_processor.set_generation_phase(generation_phase)
288
-
289
- constrained_processor = self.constrained_processor
290
 
291
  sampling_params = SamplingParams(
292
  max_tokens=self.max_model_len - 64,
@@ -301,119 +466,25 @@ class LLMHandler:
301
 
302
  if cfg_scale > 1.0:
303
  # Build unconditional prompt based on generation phase
304
- if generation_phase == "codes":
305
- # Codes phase: use empty CoT in unconditional prompt
306
- # formatted_prompt was built with build_formatted_prompt_with_cot(caption, lyrics, cot_text)
307
- # For unconditional, we use empty CoT: build_formatted_prompt_with_cot(caption, lyrics, cot_text, is_negative_prompt=True, negative_prompt=...)
308
- formatted_unconditional_prompt = self.build_formatted_prompt_with_cot(
309
- caption, lyrics, cot_text, is_negative_prompt=True, negative_prompt=negative_prompt
310
- )
311
- else:
312
- # CoT phase: unconditional prompt
313
- # If negative_prompt is provided, use it as caption; otherwise remove caption and keep only lyrics
314
- formatted_unconditional_prompt = self.build_formatted_prompt(
315
- caption, lyrics, is_negative_prompt=True, generation_phase="cot", negative_prompt=negative_prompt
316
- )
317
-
318
- outputs = self.llm.generate(
319
- [formatted_prompt],
320
- sampling_params,
321
- unconditional_prompts=[formatted_unconditional_prompt],
322
- )
323
- else:
324
- outputs = self.llm.generate([formatted_prompt], sampling_params)
325
-
326
- # Extract text (retain original selection order/logic)
327
- if isinstance(outputs, list) and len(outputs) > 0:
328
- if hasattr(outputs[0], "outputs") and len(outputs[0].outputs) > 0:
329
- output_text = outputs[0].outputs[0].text
330
- elif hasattr(outputs[0], "text"):
331
- output_text = outputs[0].text
332
- elif isinstance(outputs[0], dict) and "text" in outputs[0]:
333
- output_text = outputs[0]["text"]
334
- else:
335
- output_text = str(outputs[0])
336
- else:
337
- output_text = str(outputs)
338
-
339
- return output_text
340
-
341
- def _run_vllm_batch(
342
- self,
343
- formatted_prompts: List[str],
344
- temperature: float,
345
- cfg_scale: float,
346
- negative_prompt: str,
347
- top_k: Optional[int],
348
- top_p: Optional[float],
349
- repetition_penalty: float,
350
- use_constrained_decoding: bool = True,
351
- constrained_decoding_debug: bool = False,
352
- target_duration: Optional[float] = None,
353
- generation_phase: str = "codes",
354
- caption: str = "",
355
- lyrics: str = "",
356
- cot_text: str = "",
357
- seeds: Optional[List[int]] = None,
358
- ) -> List[str]:
359
- """Batch generation using vllm backend"""
360
- from nanovllm import SamplingParams
361
-
362
- batch_size = len(formatted_prompts)
363
-
364
- # Determine effective temperature for sampler
365
- effective_sampler_temp = temperature
366
-
367
- # Use shared constrained processor if enabled
368
- # Note: vllm batch mode uses same processor for all items
369
- constrained_processor = None
370
- if use_constrained_decoding:
371
- # Reset processor state for new generation
372
- self.constrained_processor.reset()
373
-
374
- self.constrained_processor.enabled = use_constrained_decoding
375
- self.constrained_processor.debug = constrained_decoding_debug
376
- self.constrained_processor.metadata_temperature = None
377
- self.constrained_processor.codes_temperature = None
378
- self.constrained_processor.set_target_duration(target_duration)
379
- self.constrained_processor.set_user_metadata(None)
380
- self.constrained_processor.set_stop_at_reasoning(False)
381
- self.constrained_processor.set_skip_genres(True)
382
- self.constrained_processor.set_skip_caption(True)
383
- self.constrained_processor.set_skip_language(True)
384
- self.constrained_processor.set_generation_phase(generation_phase)
385
-
386
- constrained_processor = self.constrained_processor
387
-
388
- # Build sampling params
389
- sampling_params = SamplingParams(
390
- max_tokens=self.max_model_len - 64,
391
- temperature=effective_sampler_temp,
392
- cfg_scale=cfg_scale,
393
- top_k=top_k,
394
- top_p=top_p,
395
- repetition_penalty=repetition_penalty,
396
- logits_processor=constrained_processor,
397
- logits_processor_update_state=constrained_processor.update_state if constrained_processor else None,
398
- )
399
-
400
- # Generate with or without CFG
401
- if cfg_scale > 1.0:
402
- # Build unconditional prompts
403
- formatted_unconditional_prompt = self.build_formatted_prompt_with_cot(
404
- caption, lyrics, cot_text, is_negative_prompt=True, negative_prompt=negative_prompt
405
  )
406
  unconditional_prompts = [formatted_unconditional_prompt] * batch_size
407
 
408
  outputs = self.llm.generate(
409
- formatted_prompts,
410
  sampling_params,
411
  unconditional_prompts=unconditional_prompts,
412
  )
413
  else:
414
- outputs = self.llm.generate(formatted_prompts, sampling_params)
415
-
416
- # Extract text from each output
417
  output_texts = []
418
  for output in outputs:
419
  if hasattr(output, "outputs") and len(output.outputs) > 0:
@@ -424,70 +495,11 @@ class LLMHandler:
424
  output_texts.append(output["text"])
425
  else:
426
  output_texts.append(str(output))
427
-
428
- return output_texts
429
 
430
- def _run_pt_batch(
431
- self,
432
- formatted_prompts: List[str],
433
- temperature: float,
434
- cfg_scale: float,
435
- negative_prompt: str,
436
- top_k: Optional[int],
437
- top_p: Optional[float],
438
- repetition_penalty: float,
439
- use_constrained_decoding: bool = True,
440
- constrained_decoding_debug: bool = False,
441
- target_duration: Optional[float] = None,
442
- generation_phase: str = "codes",
443
- caption: str = "",
444
- lyrics: str = "",
445
- cot_text: str = "",
446
- seeds: Optional[List[int]] = None,
447
- ) -> List[str]:
448
- """Batch generation using PyTorch backend"""
449
- import random
450
-
451
- batch_size = len(formatted_prompts)
452
- output_texts = []
453
-
454
- # Generate each item sequentially with different seeds
455
- # (PyTorch backend doesn't support true batching efficiently)
456
- for i, formatted_prompt in enumerate(formatted_prompts):
457
- # Set seed for this item if provided
458
- if seeds and i < len(seeds):
459
- torch.manual_seed(seeds[i])
460
- if torch.cuda.is_available():
461
- torch.cuda.manual_seed_all(seeds[i])
462
-
463
- # Generate using single-item method
464
- output_text = self._run_pt_from_formatted(
465
- formatted_prompt=formatted_prompt,
466
- temperature=temperature,
467
- cfg_scale=cfg_scale,
468
- negative_prompt=negative_prompt,
469
- top_k=top_k,
470
- top_p=top_p,
471
- repetition_penalty=repetition_penalty,
472
- use_constrained_decoding=use_constrained_decoding,
473
- constrained_decoding_debug=constrained_decoding_debug,
474
- target_duration=target_duration,
475
- user_metadata=None,
476
- stop_at_reasoning=False,
477
- skip_genres=True,
478
- skip_caption=True,
479
- skip_language=True,
480
- generation_phase=generation_phase,
481
- caption=caption,
482
- lyrics=lyrics,
483
- cot_text=cot_text,
484
- )
485
-
486
- output_texts.append(output_text)
487
-
488
- return output_texts
489
 
490
- def _run_pt_from_formatted(
491
  self,
492
  formatted_prompt: str,
493
  temperature: float,
@@ -496,20 +508,20 @@ class LLMHandler:
496
  top_k: Optional[int],
497
  top_p: Optional[float],
498
  repetition_penalty: float,
499
- use_constrained_decoding: bool = True,
500
- constrained_decoding_debug: bool = False,
501
- target_duration: Optional[float] = None,
502
- user_metadata: Optional[Dict[str, Optional[str]]] = None,
503
- stop_at_reasoning: bool = False,
504
- skip_genres: bool = True,
505
- skip_caption: bool = False,
506
- skip_language: bool = False,
507
- generation_phase: str = "cot",
508
- caption: str = "",
509
- lyrics: str = "",
510
- cot_text: str = "",
511
  ) -> str:
512
- """Shared PyTorch path: accept prebuilt formatted prompt and return text."""
513
  inputs = self.llm_tokenizer(
514
  formatted_prompt,
515
  return_tensors="pt",
@@ -517,27 +529,19 @@ class LLMHandler:
517
  truncation=True,
518
  )
519
 
520
- # Use shared constrained processor if enabled
521
- constrained_processor = None
522
- if use_constrained_decoding:
523
- # Reset processor state for new generation
524
- self.constrained_processor.reset()
525
-
526
- # Use shared processor, just update caption and settings
527
- self.constrained_processor.enabled = use_constrained_decoding
528
- self.constrained_processor.debug = constrained_decoding_debug
529
- self.constrained_processor.set_target_duration(target_duration)
530
- # Always call set_user_metadata to ensure previous settings are cleared if None
531
- self.constrained_processor.set_user_metadata(user_metadata)
532
- self.constrained_processor.set_stop_at_reasoning(stop_at_reasoning)
533
- # Set skip_caption and skip_language based on flags
534
- self.constrained_processor.set_skip_genres(skip_genres)
535
- self.constrained_processor.set_skip_caption(skip_caption)
536
- self.constrained_processor.set_skip_language(skip_language)
537
- # Set generation phase for phase-aware processing
538
- self.constrained_processor.set_generation_phase(generation_phase)
539
-
540
- constrained_processor = self.constrained_processor
541
 
542
  with self._load_model_context():
543
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
@@ -546,25 +550,18 @@ class LLMHandler:
546
  max_new_tokens = min(max_new_tokens, self.max_model_len - 64)
547
 
548
  # Build logits processor list (only for CFG and repetition penalty)
549
- logits_processor = LogitsProcessorList()
550
-
551
- # Add repetition penalty if needed
552
- if repetition_penalty != 1.0:
553
- logits_processor.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))
554
 
555
  if cfg_scale > 1.0:
556
  # Build unconditional prompt based on generation phase
557
- if generation_phase == "codes":
558
- # Codes phase: use empty CoT in unconditional prompt
559
- formatted_unconditional_prompt = self.build_formatted_prompt_with_cot(
560
- caption, lyrics, cot_text, is_negative_prompt=True, negative_prompt=negative_prompt
561
- )
562
- else:
563
- # CoT phase: unconditional prompt
564
- # If negative_prompt is provided, use it as caption; otherwise remove caption and keep only lyrics
565
- formatted_unconditional_prompt = self.build_formatted_prompt(
566
- caption, lyrics, is_negative_prompt=True, generation_phase="cot", negative_prompt=negative_prompt
567
- )
568
 
569
  # Tokenize both prompts together to ensure same length (with left padding)
570
  # Left padding is important for generation tasks
@@ -657,7 +654,101 @@ class LLMHandler:
657
 
658
  output_text = self.llm_tokenizer.decode(generated_ids, skip_special_tokens=False)
659
  return output_text
660
-
661
  def has_all_metas(self, user_metadata: Optional[Dict[str, Optional[str]]]) -> bool:
662
  """Check if all required metadata are present."""
663
  if user_metadata is None:
@@ -708,7 +799,9 @@ class LLMHandler:
708
  use_cot_caption: bool = True,
709
  use_cot_language: bool = True,
710
  is_format_caption: bool = False,
711
- ) -> Tuple[Dict[str, Any], str, str]:
 
 
712
  """Two-phase LM generation: CoT generation followed by audio codes generation.
713
 
714
  - infer_type='dit': Phase 1 only - generate CoT and return metas (no audio codes)
@@ -721,30 +814,56 @@ class LLMHandler:
721
  If specified, constrained decoding will inject these values directly.
722
  use_cot_caption: Whether to generate caption in CoT (default True).
723
  use_cot_language: Whether to generate language in CoT (default True).
724
  """
725
  import time
 
726
 
727
  infer_type = (infer_type or "").strip().lower()
728
  if infer_type not in {"dit", "llm_dit"}:
 
 
729
  return {}, "", f"❌ invalid infer_type: {infer_type!r} (expected 'dit' or 'llm_dit')"
730
-
731
  metadata = {}
732
  audio_codes = ""
733
  has_all_metas = self.has_all_metas(user_metadata)
734
-
735
- # Timing variables
736
  phase1_time = 0.0
737
  phase2_time = 0.0
738
 
739
  # ========== PHASE 1: CoT Generation ==========
740
- # Always generate CoT unless all metadata are user-provided
741
- if not has_all_metas or not is_format_caption:
742
- logger.info("Phase 1: Generating CoT metadata...")
743
  phase1_start = time.time()
744
 
745
  # Build formatted prompt for CoT phase
746
  formatted_prompt = self.build_formatted_prompt(caption, lyrics, generation_phase="cot")
747
-
748
  logger.info(f"generate_with_stop_condition: formatted_prompt={formatted_prompt}")
749
  # Generate CoT (stop at </think>)
750
  cot_output_text, status = self.generate_from_formatted_prompt(
@@ -774,23 +893,39 @@ class LLMHandler:
774
  phase1_time = time.time() - phase1_start
775
 
776
  if not cot_output_text:
 
 
777
  return {}, "", status
778
 
779
  # Parse metadata from CoT output
780
  metadata, _ = self.parse_lm_output(cot_output_text)
781
- logger.info(f"Phase 1 completed in {phase1_time:.2f}s. Generated metadata: {list(metadata.keys())}")
782
  else:
783
  # Use user-provided metadata
784
- logger.info("Phase 1: Using user-provided metadata (skipping generation)")
785
  metadata = {k: v for k, v in user_metadata.items() if v is not None}
786
 
787
  # If infer_type is 'dit', stop here and return only metadata
788
  if infer_type == "dit":
789
- status_msg = f"✅ Generated CoT metadata successfully\nFields: {', '.join(metadata.keys())}\nPhase1: {phase1_time:.2f}s"
790
- return metadata, "", status_msg
791
 
792
  # ========== PHASE 2: Audio Codes Generation ==========
793
- logger.info("Phase 2: Generating audio codes...")
794
  phase2_start = time.time()
795
 
796
  # Format metadata as CoT using YAML (matching training format)
@@ -799,221 +934,110 @@ class LLMHandler:
799
  # Build formatted prompt with CoT for codes generation phase
800
  formatted_prompt_with_cot = self.build_formatted_prompt_with_cot(caption, lyrics, cot_text)
801
  logger.info(f"generate_with_stop_condition: formatted_prompt_with_cot={formatted_prompt_with_cot}")
802
- # Generate audio codes
803
- codes_output_text, status = self.generate_from_formatted_prompt(
804
- formatted_prompt=formatted_prompt_with_cot,
805
- cfg={
806
- "temperature": temperature,
807
- "cfg_scale": cfg_scale,
808
- "negative_prompt": negative_prompt,
809
- "top_k": top_k,
810
- "top_p": top_p,
811
- "repetition_penalty": repetition_penalty,
812
- "target_duration": target_duration,
813
- "user_metadata": None, # No user metadata injection in Phase 2
814
- "skip_caption": True, # Skip caption since CoT is already included
815
- "skip_language": True, # Skip language since CoT is already included
816
- "generation_phase": "codes",
817
- # Pass context for building unconditional prompt in codes phase
818
- "caption": caption,
819
- "lyrics": lyrics,
820
- "cot_text": cot_text,
821
- },
822
- use_constrained_decoding=use_constrained_decoding,
823
- constrained_decoding_debug=constrained_decoding_debug,
824
- stop_at_reasoning=False, # Generate codes until EOS
825
- )
826
-
827
- if not codes_output_text:
828
- return metadata, "", status
829
-
830
- phase2_time = time.time() - phase2_start
831
-
832
- # Parse audio codes from output (metadata should be same as Phase 1)
833
- _, audio_codes = self.parse_lm_output(codes_output_text)
834
-
835
- codes_count = len(audio_codes.split('<|audio_code_')) - 1 if audio_codes else 0
836
- logger.info(f"Phase 2 completed in {phase2_time:.2f}s. Generated {codes_count} audio codes")
837
 
838
- status_msg = f"✅ Generated successfully (2-phase)\nPhase 1: CoT metadata\nPhase 2: {codes_count} audio codes\nPhase1: {phase1_time:.2f}s, Phase2: {phase2_time:.2f}s"
839
- return metadata, audio_codes, status_msg
840
-
841
- def generate_with_stop_condition_batch(
842
- self,
843
- caption: str,
844
- lyrics: str,
845
- batch_size: int,
846
- infer_type: str = "llm_dit",
847
- temperature: float = 0.85,
848
- cfg_scale: float = 1.0,
849
- negative_prompt: str = "NO USER INPUT",
850
- top_k: Optional[int] = None,
851
- top_p: Optional[float] = None,
852
- repetition_penalty: float = 1.0,
853
- use_constrained_decoding: bool = True,
854
- constrained_decoding_debug: bool = False,
855
- target_duration: Optional[float] = None,
856
- user_metadata: Optional[Dict[str, Optional[str]]] = None,
857
- use_cot_caption: bool = True,
858
- use_cot_language: bool = True,
859
- is_format_caption: bool = False,
860
- seeds: Optional[List[int]] = None,
861
- ) -> Tuple[List[Dict[str, Any]], List[str], str]:
862
- """
863
- Batch version of generate_with_stop_condition.
864
-
865
- Generates multiple audio codes with same conditions but different seeds (for diversity).
866
-
867
- Args:
868
- caption: Same caption for all items
869
- lyrics: Same lyrics for all items
870
- batch_size: Number of items to generate
871
- seeds: Optional list of seeds for each batch item (for reproducibility)
872
- ... (other args same as generate_with_stop_condition)
873
-
874
- Returns:
875
- Tuple of (metadata_list, audio_codes_list, status_message)
876
- - metadata_list: List of metadata dicts (same metadata for all items)
877
- - audio_codes_list: List of audio code strings (one per item, different due to sampling)
878
- - status_message: Generation status
879
- """
880
- import random
881
- import time
882
-
883
- infer_type = (infer_type or "").strip().lower()
884
- if infer_type not in {"dit", "llm_dit"}:
885
- return [], [], f"❌ invalid infer_type: {infer_type!r} (expected 'dit' or 'llm_dit')"
886
-
887
- # Generate seeds if not provided
888
- if seeds is None:
889
- seeds = [random.randint(0, 2**32 - 1) for _ in range(batch_size)]
890
- elif len(seeds) < batch_size:
891
- # Pad with random seeds if not enough provided
892
- seeds = list(seeds) + [random.randint(0, 2**32 - 1) for _ in range(batch_size - len(seeds))]
893
- else:
894
- seeds = seeds[:batch_size] # Truncate if too many
895
-
896
- # Timing variables
897
- phase1_time = 0.0
898
- phase2_time = 0.0
899
-
900
- # ========== PHASE 1: CoT Generation (ONCE for all items) ==========
901
- has_all_metas = self.has_all_metas(user_metadata)
902
-
903
- if not has_all_metas or not is_format_caption:
904
- logger.info("Batch Phase 1: Generating CoT metadata (once for all items)...")
905
- phase1_start = time.time()
906
 
907
- # Generate CoT metadata once (same for all batch items)
908
- metadata, _, status = self.generate_with_stop_condition(
909
- caption=caption,
910
- lyrics=lyrics,
911
- infer_type="dit", # Only generate metadata
912
- temperature=temperature,
913
- cfg_scale=cfg_scale,
914
- negative_prompt=negative_prompt,
915
- top_k=top_k,
916
- top_p=top_p,
917
- repetition_penalty=repetition_penalty,
918
  use_constrained_decoding=use_constrained_decoding,
919
  constrained_decoding_debug=constrained_decoding_debug,
920
- target_duration=target_duration,
921
- user_metadata=user_metadata,
922
- use_cot_caption=use_cot_caption,
923
- use_cot_language=use_cot_language,
924
- is_format_caption=is_format_caption,
925
  )
926
 
927
- phase1_time = time.time() - phase1_start
 
928
 
929
- if not metadata:
930
- return [], [], status
931
 
932
- logger.info(f"Batch Phase 1 completed in {phase1_time:.2f}s. Generated metadata: {list(metadata.keys())}")
933
- else:
934
- # Use user-provided metadata
935
- logger.info("Batch Phase 1: Using user-provided metadata (skipping generation)")
936
- metadata = {k: v for k, v in user_metadata.items() if v is not None}
937
-
938
- # If infer_type is 'dit', stop here and return only metadata
939
- if infer_type == "dit":
940
- metadata_list = [metadata.copy() for _ in range(batch_size)]
941
- status_msg = f"✅ Generated CoT metadata successfully (batch mode)\nFields: {', '.join(metadata.keys())}\nPhase1: {phase1_time:.2f}s"
942
- return metadata_list, [""] * batch_size, status_msg
943
-
944
- # ========== PHASE 2: Audio Codes Generation (BATCH) ==========
945
- logger.info(f"Batch Phase 2: Generating audio codes for {batch_size} items...")
946
- phase2_start = time.time()
947
-
948
- # Format metadata as CoT
949
- cot_text = self._format_metadata_as_cot(metadata)
950
-
951
- # Build formatted prompt with CoT
952
- formatted_prompt = self.build_formatted_prompt_with_cot(caption, lyrics, cot_text)
953
-
954
- # Replicate prompt for batch (all items have same prompt, differ by seeds)
955
- formatted_prompts = [formatted_prompt] * batch_size
956
-
957
- # Call backend-specific batch generation
958
- try:
959
- if self.llm_backend == "vllm":
960
- codes_outputs = self._run_vllm_batch(
961
- formatted_prompts=formatted_prompts,
962
- temperature=temperature,
963
- cfg_scale=cfg_scale,
964
- negative_prompt=negative_prompt,
965
- top_k=top_k,
966
- top_p=top_p,
967
- repetition_penalty=repetition_penalty,
968
- use_constrained_decoding=use_constrained_decoding,
969
- constrained_decoding_debug=constrained_decoding_debug,
970
- target_duration=target_duration,
971
- generation_phase="codes",
972
- caption=caption,
973
- lyrics=lyrics,
974
- cot_text=cot_text,
975
- seeds=seeds,
976
- )
977
- else: # pt backend
978
- codes_outputs = self._run_pt_batch(
979
- formatted_prompts=formatted_prompts,
980
- temperature=temperature,
981
- cfg_scale=cfg_scale,
982
- negative_prompt=negative_prompt,
983
- top_k=top_k,
984
- top_p=top_p,
985
- repetition_penalty=repetition_penalty,
986
- use_constrained_decoding=use_constrained_decoding,
987
- constrained_decoding_debug=constrained_decoding_debug,
988
- target_duration=target_duration,
989
- generation_phase="codes",
990
- caption=caption,
991
- lyrics=lyrics,
992
- cot_text=cot_text,
993
- seeds=seeds,
994
- )
995
- except Exception as e:
996
- error_msg = f"❌ Error in batch codes generation: {str(e)}"
997
- logger.error(error_msg)
998
- return [], [], error_msg
999
-
1000
- # Parse audio codes from each output
1001
- audio_codes_list = []
1002
- metadata_list = []
1003
- for output_text in codes_outputs:
1004
- _, audio_codes = self.parse_lm_output(output_text)
1005
- audio_codes_list.append(audio_codes)
1006
- metadata_list.append(metadata.copy()) # Same metadata for all
1007
-
1008
- phase2_time = time.time() - phase2_start
1009
-
1010
- # Log results
1011
- codes_counts = [len(codes.split('<|audio_code_')) - 1 if codes else 0 for codes in audio_codes_list]
1012
- logger.info(f"Batch Phase 2 completed in {phase2_time:.2f}s. Generated codes: {codes_counts}")
1013
-
1014
- status_msg = f"✅ Batch generation completed ({batch_size} items)\nPhase 1: CoT metadata\nPhase 2: {sum(codes_counts)} total codes ({codes_counts})\nPhase1: {phase1_time:.2f}s, Phase2: {phase2_time:.2f}s"
1015
- return metadata_list, audio_codes_list, status_msg
1016
-
1017
  def build_formatted_prompt(self, caption: str, lyrics: str = "", is_negative_prompt: bool = False, generation_phase: str = "cot", negative_prompt: str = "NO USER INPUT") -> str:
1018
  """
1019
  Build the chat-formatted prompt for 5Hz LM from caption/lyrics.
@@ -1035,7 +1059,7 @@ class LLMHandler:
1035
  if is_negative_prompt:
1036
  # Unconditional prompt for CFG
1037
  # Check if user provided a meaningful negative prompt (not the default)
1038
- has_negative_prompt = negative_prompt and negative_prompt.strip() and negative_prompt.strip() != "NO USER INPUT"
1039
 
1040
  if generation_phase == "cot":
1041
  # CoT phase unconditional prompt
@@ -1086,7 +1110,7 @@ class LLMHandler:
1086
  if is_negative_prompt:
1087
  # Unconditional prompt for codes phase
1088
  # Check if user provided a meaningful negative prompt
1089
- has_negative_prompt = negative_prompt and negative_prompt.strip() and negative_prompt.strip() != "NO USER INPUT"
1090
 
1091
  # Use empty CoT for unconditional
1092
  cot_for_prompt = "<think>\n</think>"
@@ -1369,8 +1393,8 @@ class LLMHandler:
1369
 
1370
  try:
1371
  if self.llm_backend == "vllm":
1372
- output_text = self._run_vllm_from_formatted(
1373
- formatted_prompt=formatted_prompt,
1374
  temperature=temperature,
1375
  cfg_scale=cfg_scale,
1376
  negative_prompt=negative_prompt,
@@ -1393,8 +1417,8 @@ class LLMHandler:
1393
  return output_text, f"✅ Generated successfully (vllm) | length={len(output_text)}"
1394
 
1395
  # PyTorch backend
1396
- output_text = self._run_pt_from_formatted(
1397
- formatted_prompt=formatted_prompt,
1398
  temperature=temperature,
1399
  cfg_scale=cfg_scale,
1400
  negative_prompt=negative_prompt,
@@ -1459,26 +1483,12 @@ class LLMHandler:
1459
  eos_token_id = pad_token_id
1460
 
1461
  # Build logits processor for repetition penalty
1462
- logits_processor = LogitsProcessorList()
1463
- if repetition_penalty != 1.0:
1464
- logits_processor.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))
1465
 
1466
  with torch.no_grad():
1467
  for step in range(max_new_tokens):
1468
  # Forward pass
1469
- if past_key_values is None:
1470
- outputs = model(
1471
- input_ids=generated_ids,
1472
- **model_kwargs,
1473
- use_cache=use_cache,
1474
- )
1475
- else:
1476
- outputs = model(
1477
- input_ids=generated_ids[:, -1:],
1478
- past_key_values=past_key_values,
1479
- **model_kwargs,
1480
- use_cache=use_cache,
1481
- )
1482
 
1483
  # Get logits for the last position
1484
  next_token_logits = outputs.logits[:, -1, :] # [batch_size, vocab_size]
@@ -1491,41 +1501,18 @@ class LLMHandler:
1491
  for processor in logits_processor:
1492
  next_token_logits = processor(generated_ids, next_token_logits)
1493
 
1494
- # Apply top-k filtering
1495
- if top_k is not None and top_k > 0:
1496
- indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
1497
- next_token_logits[indices_to_remove] = float('-inf')
1498
-
1499
- # Apply top-p filtering
1500
- if top_p is not None and 0.0 < top_p < 1.0:
1501
- sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
1502
- cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
1503
- sorted_indices_to_remove = cumulative_probs > top_p
1504
- sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
1505
- sorted_indices_to_remove[..., 0] = 0
1506
- indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
1507
- next_token_logits[indices_to_remove] = float('-inf')
1508
 
1509
  # Apply temperature and sample
1510
- if temperature > 0:
1511
- next_token_logits = next_token_logits / temperature
1512
- probs = torch.softmax(next_token_logits, dim=-1)
1513
- next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
1514
- else:
1515
- next_tokens = torch.argmax(next_token_logits, dim=-1)
1516
 
1517
  # Update constrained processor state
1518
- if constrained_processor is not None:
1519
- for b in range(next_tokens.shape[0]):
1520
- constrained_processor.update_state(next_tokens[b].item())
1521
 
1522
  # Check for EOS token
1523
- should_stop = False
1524
- if torch.any(next_tokens == eos_token_id):
1525
- should_stop = True
1526
- elif pad_token_id is not None and pad_token_id != eos_token_id:
1527
- if torch.any(next_tokens == pad_token_id):
1528
- should_stop = True
1529
 
1530
  # Append token to sequence
1531
  next_tokens_unsqueezed = next_tokens.unsqueeze(1)
@@ -1601,28 +1588,12 @@ class LLMHandler:
1601
  eos_token_id = pad_token_id
1602
 
1603
  # Build logits processor for non-CFG operations (repetition penalty, top_k, top_p)
1604
- logits_processor = LogitsProcessorList()
1605
- if repetition_penalty != 1.0:
1606
- logits_processor.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))
1607
 
1608
  with torch.no_grad():
1609
  for step in range(max_new_tokens):
1610
  # Forward pass for the entire batch (conditional + unconditional)
1611
- if past_key_values is None:
1612
- # First step: full forward pass
1613
- outputs = model(
1614
- input_ids=generated_ids,
1615
- **model_kwargs,
1616
- use_cache=use_cache,
1617
- )
1618
- else:
1619
- # Subsequent steps: only forward the last token (utilizing KV cache)
1620
- outputs = model(
1621
- input_ids=generated_ids[:, -1:],
1622
- past_key_values=past_key_values,
1623
- **model_kwargs,
1624
- use_cache=use_cache,
1625
- )
1626
 
1627
  # Get logits for the last position
1628
  next_token_logits = outputs.logits[:, -1, :] # [batch_size*2, vocab_size]
@@ -1645,45 +1616,20 @@ class LLMHandler:
1645
  for processor in logits_processor:
1646
  cfg_logits = processor(current_input_ids, cfg_logits)
1647
 
1648
- # Apply top-k filtering
1649
- if top_k is not None and top_k > 0:
1650
- indices_to_remove = cfg_logits < torch.topk(cfg_logits, top_k)[0][..., -1, None]
1651
- cfg_logits[indices_to_remove] = float('-inf')
1652
-
1653
- # Apply top-p (nucleus) filtering
1654
- if top_p is not None and 0.0 < top_p < 1.0:
1655
- sorted_logits, sorted_indices = torch.sort(cfg_logits, descending=True)
1656
- cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
1657
- # Remove tokens with cumulative probability above the threshold
1658
- sorted_indices_to_remove = cumulative_probs > top_p
1659
- # Shift the indices to the right to keep also the first token above the threshold
1660
- sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
1661
- sorted_indices_to_remove[..., 0] = 0
1662
- indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
1663
- cfg_logits[indices_to_remove] = float('-inf')
1664
 
1665
  # Apply temperature and sample
1666
- if temperature > 0:
1667
- cfg_logits = cfg_logits / temperature
1668
- probs = torch.softmax(cfg_logits, dim=-1)
1669
- next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
1670
- else:
1671
- next_tokens = torch.argmax(cfg_logits, dim=-1)
1672
 
1673
  # Update constrained processor state AFTER sampling
1674
- if constrained_processor is not None:
1675
- for b in range(next_tokens.shape[0]):
1676
- constrained_processor.update_state(next_tokens[b].item())
1677
 
1678
  # Check for EOS token in conditional sequences BEFORE unsqueezing
1679
  # Stop if any conditional sequence generates EOS token
1680
  # next_tokens shape: [batch_size] (only conditional tokens)
1681
- should_stop = False
1682
- if torch.any(next_tokens == eos_token_id):
1683
- should_stop = True
1684
- elif pad_token_id is not None and pad_token_id != eos_token_id:
1685
- if torch.any(next_tokens == pad_token_id):
1686
- should_stop = True
1687
 
1688
  # Apply the same sampled tokens to both conditional and unconditional sequences
1689
  next_tokens_unsqueezed = next_tokens.unsqueeze(1)
 
5
  import os
6
  import traceback
7
  import time
8
+ from typing import Optional, Dict, Any, Tuple, List, Union
9
  from contextlib import contextmanager
10
 
11
  import yaml
 
85
  except Exception as e:
86
  return 0.9, False
87
 
88
+ def _has_meaningful_negative_prompt(self, negative_prompt: str) -> bool:
89
+ """Check if negative prompt is meaningful (not default/empty)"""
90
+ return bool(negative_prompt and negative_prompt.strip() and negative_prompt.strip() != "NO USER INPUT")
91
+
92
+ def _build_logits_processor(self, repetition_penalty: float) -> LogitsProcessorList:
93
+ """Build logits processor list with repetition penalty if needed"""
94
+ logits_processor = LogitsProcessorList()
95
+ if repetition_penalty != 1.0:
96
+ logits_processor.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))
97
+ return logits_processor
98
+
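As an illustrative aside, the processor list built by this helper is applied as a single callable over (input_ids, scores); with penalty == 1.0 it stays empty and acts as a pass-through. A minimal self-contained sketch using the same transformers classes (tensor values below are arbitrary placeholders):

import torch
from transformers import LogitsProcessorList, RepetitionPenaltyLogitsProcessor

# One processor in the list; calling the list applies every processor in order.
processors = LogitsProcessorList([RepetitionPenaltyLogitsProcessor(penalty=1.2)])
input_ids = torch.tensor([[5, 5, 9]])      # previously generated token ids
scores = torch.randn(1, 16)                # raw logits for the next token
adjusted = processors(input_ids, scores)
print(adjusted.shape)                      # torch.Size([1, 16])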
99
+ def _setup_constrained_processor(
100
+ self,
101
+ use_constrained_decoding: bool,
102
+ constrained_decoding_debug: bool,
103
+ target_duration: Optional[float],
104
+ user_metadata: Optional[Dict[str, Optional[str]]],
105
+ stop_at_reasoning: bool,
106
+ skip_genres: bool,
107
+ skip_caption: bool,
108
+ skip_language: bool,
109
+ generation_phase: str,
110
+ is_batch: bool = False,
111
+ metadata_temperature: Optional[float] = None,
112
+ codes_temperature: Optional[float] = None,
113
+ ) -> Optional[MetadataConstrainedLogitsProcessor]:
114
+ """Setup and configure constrained processor for generation"""
115
+ use_phase_temperatures = not is_batch and (metadata_temperature is not None or codes_temperature is not None)
116
+
117
+ if not use_constrained_decoding and not use_phase_temperatures:
118
+ return None
119
+
120
+ # Reset processor state for new generation
121
+ self.constrained_processor.reset()
122
+
123
+ # Use shared processor, just update settings
124
+ self.constrained_processor.enabled = use_constrained_decoding
125
+ self.constrained_processor.debug = constrained_decoding_debug
126
+
127
+ # Phase temperatures only supported in single mode
128
+ if use_phase_temperatures:
129
+ self.constrained_processor.metadata_temperature = metadata_temperature
130
+ self.constrained_processor.codes_temperature = codes_temperature
131
+ else:
132
+ self.constrained_processor.metadata_temperature = None
133
+ self.constrained_processor.codes_temperature = None
134
+
135
+ self.constrained_processor.set_target_duration(target_duration)
136
+
137
+ # Batch mode uses default/disabled settings for these options
138
+ if is_batch:
139
+ self.constrained_processor.set_user_metadata(None)
140
+ self.constrained_processor.set_stop_at_reasoning(False)
141
+ self.constrained_processor.set_skip_genres(True)
142
+ self.constrained_processor.set_skip_caption(True)
143
+ self.constrained_processor.set_skip_language(True)
144
+ else:
145
+ # Single mode uses provided settings
146
+ self.constrained_processor.set_user_metadata(user_metadata)
147
+ self.constrained_processor.set_stop_at_reasoning(stop_at_reasoning)
148
+ self.constrained_processor.set_skip_genres(skip_genres)
149
+ self.constrained_processor.set_skip_caption(skip_caption)
150
+ self.constrained_processor.set_skip_language(skip_language)
151
+
152
+ # Set generation phase for phase-aware processing
153
+ self.constrained_processor.set_generation_phase(generation_phase)
154
+
155
+ return self.constrained_processor
156
+
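The reset-and-reconfigure pattern above can be illustrated with a minimal stateful logits processor. This is only a sketch: the TinyStatefulProcessor class below is hypothetical and far simpler than MetadataConstrainedLogitsProcessor, but it shows why reset() must run before every new generation and how update_state() is fed each sampled token.

import torch

class TinyStatefulProcessor:
    """Hypothetical minimal stateful logits processor reused across generations."""
    def __init__(self, banned_token_ids=None):
        self.banned_token_ids = set(banned_token_ids or [])
        self.enabled = True
        self.step = 0

    def reset(self):
        # Called once per generation so stale state never leaks into the next request.
        self.step = 0

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
        if self.enabled:
            for tok in self.banned_token_ids:
                scores[:, tok] = float("-inf")
        return scores

    def update_state(self, token_id: int):
        # Called after each sampled token so the processor can track position/phase.
        self.step += 1

# Usage: one shared instance, reset and reconfigured before every prompt.
proc = TinyStatefulProcessor(banned_token_ids=[0])
proc.reset()
scores = proc(torch.zeros(1, 5, dtype=torch.long), torch.randn(1, 10))
proc.update_state(3)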
157
+ def _build_unconditional_prompt(
158
+ self,
159
+ caption: str,
160
+ lyrics: str,
161
+ cot_text: str,
162
+ negative_prompt: str,
163
+ generation_phase: str,
164
+ is_batch: bool = False,
165
+ ) -> str:
166
+ """Build unconditional prompt for CFG based on generation phase and batch mode"""
167
+ if is_batch or generation_phase == "codes":
168
+ # Codes phase or batch mode: use empty CoT in unconditional prompt
169
+ return self.build_formatted_prompt_with_cot(
170
+ caption, lyrics, cot_text, is_negative_prompt=True, negative_prompt=negative_prompt
171
+ )
172
+ else:
173
+ # CoT phase (single mode only): unconditional prompt
174
+ # If negative_prompt is provided, use it as caption; otherwise remove caption and keep only lyrics
175
+ return self.build_formatted_prompt(
176
+ caption, lyrics, is_negative_prompt=True, generation_phase="cot", negative_prompt=negative_prompt
177
+ )
178
+
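The unconditional prompt built here exists so classifier-free guidance can contrast conditional and unconditional predictions. A hedged sketch of the usual CFG mixing rule follows; the exact combination applied by the sampler in this repo may differ.

import torch

def cfg_mix(cond_logits: torch.Tensor, uncond_logits: torch.Tensor, cfg_scale: float) -> torch.Tensor:
    # Classifier-free guidance: push the distribution away from the unconditional
    # prediction and toward the conditional one. cfg_scale == 1.0 is a no-op.
    return uncond_logits + cfg_scale * (cond_logits - uncond_logits)

cond = torch.randn(1, 32)
uncond = torch.randn(1, 32)
guided = cfg_mix(cond, uncond, cfg_scale=2.0)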
179
+ def _load_pytorch_model(self, model_path: str, device: str) -> Tuple[bool, str]:
180
+ """Load PyTorch model from path and return (success, status_message)"""
181
+ try:
182
+ self.llm = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
183
+ if not self.offload_to_cpu:
184
+ self.llm = self.llm.to(device).to(self.dtype)
185
+ else:
186
+ self.llm = self.llm.to("cpu").to(self.dtype)
187
+ self.llm.eval()
188
+ self.llm_backend = "pt"
189
+ self.llm_initialized = True
190
+ logger.info(f"5Hz LM initialized successfully using PyTorch backend on {device}")
191
+ status_msg = f"✅ 5Hz LM initialized successfully\nModel: {model_path}\nBackend: PyTorch\nDevice: {device}"
192
+ return True, status_msg
193
+ except Exception as e:
194
+ return False, f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
195
+
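A minimal sketch of the load-then-place pattern used by this helper, with a toy nn.Module standing in for the real checkpoint (names and values are placeholders):

import torch
import torch.nn as nn

def place_model(model: nn.Module, device: str, dtype: torch.dtype, offload_to_cpu: bool) -> nn.Module:
    """Keep weights on CPU when offloading, otherwise move to the target device,
    and always switch to eval mode before inference."""
    target = "cpu" if offload_to_cpu else device
    model = model.to(target).to(dtype)
    model.eval()
    return model

m = place_model(nn.Linear(4, 4), device="cpu", dtype=torch.float32, offload_to_cpu=False)
print(next(m.parameters()).device)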
196
+ def _apply_top_k_filter(self, logits: torch.Tensor, top_k: Optional[int]) -> torch.Tensor:
197
+ """Apply top-k filtering to logits"""
198
+ if top_k is not None and top_k > 0:
199
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
200
+ logits[indices_to_remove] = float('-inf')
201
+ return logits
202
+
203
+ def _apply_top_p_filter(self, logits: torch.Tensor, top_p: Optional[float]) -> torch.Tensor:
204
+ """Apply top-p (nucleus) filtering to logits"""
205
+ if top_p is not None and 0.0 < top_p < 1.0:
206
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
207
+ cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
208
+ sorted_indices_to_remove = cumulative_probs > top_p
209
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
210
+ sorted_indices_to_remove[..., 0] = 0
211
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
212
+ logits[indices_to_remove] = float('-inf')
213
+ return logits
214
+
215
+ def _sample_tokens(self, logits: torch.Tensor, temperature: float) -> torch.Tensor:
216
+ """Sample tokens from logits with temperature"""
217
+ if temperature > 0:
218
+ logits = logits / temperature
219
+ probs = torch.softmax(logits, dim=-1)
220
+ return torch.multinomial(probs, num_samples=1).squeeze(1)
221
+ else:
222
+ return torch.argmax(logits, dim=-1)
223
+
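Taken together, the three helpers above implement the standard top-k → top-p → temperature sampling pipeline. A self-contained version over plain tensors, with arbitrary example shapes:

import torch

def sample_next_token(logits: torch.Tensor, top_k: int = 50, top_p: float = 0.9,
                      temperature: float = 0.85) -> torch.Tensor:
    # Top-k: drop everything below the k-th largest logit.
    if top_k > 0:
        kth = torch.topk(logits, top_k)[0][..., -1, None]
        logits = logits.masked_fill(logits < kth, float("-inf"))
    # Top-p: drop the tail once cumulative probability exceeds the threshold.
    if 0.0 < top_p < 1.0:
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        cum_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
        remove = cum_probs > top_p
        remove[..., 1:] = remove[..., :-1].clone()
        remove[..., 0] = False
        logits = logits.masked_fill(remove.scatter(1, sorted_idx, remove), float("-inf"))
    # Temperature sampling, or greedy when temperature == 0.
    if temperature > 0:
        probs = torch.softmax(logits / temperature, dim=-1)
        return torch.multinomial(probs, num_samples=1).squeeze(1)
    return torch.argmax(logits, dim=-1)

print(sample_next_token(torch.randn(2, 100)))  # tensor of shape [2]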
224
+ def _check_eos_token(self, tokens: torch.Tensor, eos_token_id: int, pad_token_id: Optional[int]) -> bool:
225
+ """Check if any token in the batch is EOS or pad token"""
226
+ if torch.any(tokens == eos_token_id):
227
+ return True
228
+ if pad_token_id is not None and pad_token_id != eos_token_id:
229
+ if torch.any(tokens == pad_token_id):
230
+ return True
231
+ return False
232
+
233
+ def _update_constrained_processor_state(self, constrained_processor: Optional[MetadataConstrainedLogitsProcessor], tokens: torch.Tensor):
234
+ """Update constrained processor state with generated tokens"""
235
+ if constrained_processor is not None:
236
+ for b in range(tokens.shape[0]):
237
+ constrained_processor.update_state(tokens[b].item())
238
+
239
+ def _forward_pass(
240
+ self,
241
+ model: Any,
242
+ generated_ids: torch.Tensor,
243
+ model_kwargs: Dict[str, Any],
244
+ past_key_values: Optional[Any],
245
+ use_cache: bool,
246
+ ) -> Any:
247
+ """Perform forward pass with KV cache support"""
248
+ if past_key_values is None:
249
+ outputs = model(
250
+ input_ids=generated_ids,
251
+ **model_kwargs,
252
+ use_cache=use_cache,
253
+ )
254
+ else:
255
+ outputs = model(
256
+ input_ids=generated_ids[:, -1:],
257
+ past_key_values=past_key_values,
258
+ **model_kwargs,
259
+ use_cache=use_cache,
260
+ )
261
+ return outputs
262
+
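The cache-aware branching in _forward_pass follows the usual incremental-decoding pattern: one full forward over the prompt, then single-token forwards that reuse past_key_values. A schematic greedy loop, assuming a Hugging Face-style causal LM that returns logits and past_key_values:

import torch

def greedy_decode_with_cache(model, input_ids: torch.Tensor, max_new_tokens: int = 16) -> torch.Tensor:
    """Schematic KV-cache loop; `model` is assumed to be a HF-style causal LM."""
    past_key_values = None
    generated = input_ids
    with torch.no_grad():
        for _ in range(max_new_tokens):
            if past_key_values is None:
                out = model(input_ids=generated, use_cache=True)           # full prompt
            else:
                out = model(input_ids=generated[:, -1:],                   # last token only
                            past_key_values=past_key_values, use_cache=True)
            past_key_values = out.past_key_values
            next_token = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
            generated = torch.cat([generated, next_token], dim=1)
    return generated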
263
+ def _normalize_batch_input(self, formatted_prompts: Union[str, List[str]]) -> Tuple[List[str], bool]:
264
+ """Normalize batch input: convert single string to list and return (list, is_batch)"""
265
+ is_batch = isinstance(formatted_prompts, list)
266
+ if is_batch:
267
+ return formatted_prompts, is_batch
268
+ else:
269
+ return [formatted_prompts], is_batch
270
+
271
  def initialize(
272
  self,
273
  checkpoint_dir: str,
 
333
  # vllm initialization failed, fallback to PyTorch
334
  if not self.llm_initialized:
335
  logger.warning("vllm initialization failed, falling back to PyTorch backend")
336
+ success, status_msg = self._load_pytorch_model(full_lm_model_path, device)
337
+ if not success:
338
+ return status_msg, False
339
+ status_msg = f"✅ 5Hz LM initialized successfully (PyTorch fallback)\nModel: {full_lm_model_path}\nBackend: PyTorch"
340
  # If vllm initialization succeeded, self.llm_initialized should already be True
341
  else:
342
  # Use PyTorch backend (pt)
343
+ success, status_msg = self._load_pytorch_model(full_lm_model_path, device)
344
+ if not success:
345
+ return status_msg, False
346
 
347
  return status_msg, True
348
 
349
  except Exception as e:
350
+ return f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}", False
 
351
 
352
  def _initialize_5hz_lm_vllm(self, model_path: str) -> str:
353
  """Initialize 5Hz LM model using vllm backend"""
 
393
  return f"✅ 5Hz LM initialized successfully\nModel: {model_path}\nDevice: {device_name}\nGPU Memory Utilization: {gpu_memory_utilization:.2f}"
394
  except Exception as e:
395
  self.llm_initialized = False
396
+ return f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
 
397
 
398
+ def _run_vllm(
399
  self,
400
+ formatted_prompts: Union[str, List[str]],
401
  temperature: float,
402
  cfg_scale: float,
403
  negative_prompt: str,
 
406
  repetition_penalty: float,
407
  use_constrained_decoding: bool = True,
408
  constrained_decoding_debug: bool = False,
409
+ metadata_temperature: Optional[float] = None,
410
  codes_temperature: Optional[float] = None,
411
  target_duration: Optional[float] = None,
412
  user_metadata: Optional[Dict[str, Optional[str]]] = None,
 
418
  caption: str = "",
419
  lyrics: str = "",
420
  cot_text: str = "",
421
+ seeds: Optional[List[int]] = None,
422
+ ) -> Union[str, List[str]]:
423
+ """
424
+ Unified vllm generation function supporting both single and batch modes.
425
+ Accepts either a single formatted prompt (str) or a list of formatted prompts (List[str]).
426
+ Returns a single string for single mode, or a list of strings for batch mode.
427
+ """
428
  from nanovllm import SamplingParams
429
 
430
+ # Determine if batch mode
431
+ formatted_prompt_list, is_batch = self._normalize_batch_input(formatted_prompts)
432
+ batch_size = len(formatted_prompt_list)
433
+
434
  # Determine effective temperature for sampler
435
+ # Batch mode doesn't support phase temperatures, so use simple temperature
436
+ # Single mode supports phase temperatures
437
+ use_phase_temperatures = not is_batch and (metadata_temperature is not None or codes_temperature is not None)
438
  effective_sampler_temp = 1.0 if use_phase_temperatures else temperature
439
 
440
+ # Setup constrained processor
441
+ constrained_processor = self._setup_constrained_processor(
442
+ use_constrained_decoding=use_constrained_decoding or use_phase_temperatures,
443
+ constrained_decoding_debug=constrained_decoding_debug,
444
+ target_duration=target_duration,
445
+ user_metadata=user_metadata,
446
+ stop_at_reasoning=stop_at_reasoning,
447
+ skip_genres=skip_genres,
448
+ skip_caption=skip_caption,
449
+ skip_language=skip_language,
450
+ generation_phase=generation_phase,
451
+ is_batch=is_batch,
452
+ metadata_temperature=metadata_temperature,
453
+ codes_temperature=codes_temperature,
454
+ )
455
 
456
  sampling_params = SamplingParams(
457
  max_tokens=self.max_model_len - 64,
 
466
 
467
  if cfg_scale > 1.0:
468
  # Build unconditional prompt based on generation phase
469
+ formatted_unconditional_prompt = self._build_unconditional_prompt(
470
+ caption=caption,
471
+ lyrics=lyrics,
472
+ cot_text=cot_text,
473
+ negative_prompt=negative_prompt,
474
+ generation_phase=generation_phase,
475
+ is_batch=is_batch,
476
  )
477
  unconditional_prompts = [formatted_unconditional_prompt] * batch_size
478
 
479
  outputs = self.llm.generate(
480
+ formatted_prompt_list,
481
  sampling_params,
482
  unconditional_prompts=unconditional_prompts,
483
  )
484
  else:
485
+ outputs = self.llm.generate(formatted_prompt_list, sampling_params)
486
+
487
+ # Extract text from outputs
488
  output_texts = []
489
  for output in outputs:
490
  if hasattr(output, "outputs") and len(output.outputs) > 0:
 
495
  output_texts.append(output["text"])
496
  else:
497
  output_texts.append(str(output))
 
 
498
 
499
+ # Return single string for single mode, list for batch mode
500
+ return output_texts[0] if not is_batch else output_texts
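The single/batch contract of this function (str in → str out, list in → list out) reduces to a small normalize/unwrap pattern; the helper names below are illustrative and not part of the handler:

from typing import List, Tuple, Union

def normalize_batch_input(prompts: Union[str, List[str]]) -> Tuple[List[str], bool]:
    # Same contract as _normalize_batch_input: always work on a list internally,
    # remembering whether the caller passed a single prompt.
    if isinstance(prompts, list):
        return prompts, True
    return [prompts], False

def denormalize_output(texts: List[str], is_batch: bool) -> Union[str, List[str]]:
    # Mirror of the return statement above: unwrap single-item results.
    return texts if is_batch else texts[0]

texts, is_batch = normalize_batch_input("one prompt")
assert denormalize_output(["generated"], is_batch) == "generated"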
501
 
502
+ def _run_pt_single(
503
  self,
504
  formatted_prompt: str,
505
  temperature: float,
 
508
  top_k: Optional[int],
509
  top_p: Optional[float],
510
  repetition_penalty: float,
511
+ use_constrained_decoding: bool,
512
+ constrained_decoding_debug: bool,
513
+ target_duration: Optional[float],
514
+ user_metadata: Optional[Dict[str, Optional[str]]],
515
+ stop_at_reasoning: bool,
516
+ skip_genres: bool,
517
+ skip_caption: bool,
518
+ skip_language: bool,
519
+ generation_phase: str,
520
+ caption: str,
521
+ lyrics: str,
522
+ cot_text: str,
523
  ) -> str:
524
+ """Internal helper function for single-item PyTorch generation."""
525
  inputs = self.llm_tokenizer(
526
  formatted_prompt,
527
  return_tensors="pt",
 
529
  truncation=True,
530
  )
531
 
532
+ # Setup constrained processor
533
+ constrained_processor = self._setup_constrained_processor(
534
+ use_constrained_decoding=use_constrained_decoding,
535
+ constrained_decoding_debug=constrained_decoding_debug,
536
+ target_duration=target_duration,
537
+ user_metadata=user_metadata,
538
+ stop_at_reasoning=stop_at_reasoning,
539
+ skip_genres=skip_genres,
540
+ skip_caption=skip_caption,
541
+ skip_language=skip_language,
542
+ generation_phase=generation_phase,
543
+ is_batch=False,
544
+ )
545
 
546
  with self._load_model_context():
547
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
 
550
  max_new_tokens = min(max_new_tokens, self.max_model_len - 64)
551
 
552
  # Build logits processor list (only for CFG and repetition penalty)
553
+ logits_processor = self._build_logits_processor(repetition_penalty)
554
 
555
  if cfg_scale > 1.0:
556
  # Build unconditional prompt based on generation phase
557
+ formatted_unconditional_prompt = self._build_unconditional_prompt(
558
+ caption=caption,
559
+ lyrics=lyrics,
560
+ cot_text=cot_text,
561
+ negative_prompt=negative_prompt,
562
+ generation_phase=generation_phase,
563
+ is_batch=False,
564
+ )
565
 
566
  # Tokenize both prompts together to ensure same length (with left padding)
567
  # Left padding is important for generation tasks
 
654
 
655
  output_text = self.llm_tokenizer.decode(generated_ids, skip_special_tokens=False)
656
  return output_text
657
+
658
+ def _run_pt(
659
+ self,
660
+ formatted_prompts: Union[str, List[str]],
661
+ temperature: float,
662
+ cfg_scale: float,
663
+ negative_prompt: str,
664
+ top_k: Optional[int],
665
+ top_p: Optional[float],
666
+ repetition_penalty: float,
667
+ use_constrained_decoding: bool = True,
668
+ constrained_decoding_debug: bool = False,
669
+ target_duration: Optional[float] = None,
670
+ user_metadata: Optional[Dict[str, Optional[str]]] = None,
671
+ stop_at_reasoning: bool = False,
672
+ skip_genres: bool = True,
673
+ skip_caption: bool = False,
674
+ skip_language: bool = False,
675
+ generation_phase: str = "cot",
676
+ caption: str = "",
677
+ lyrics: str = "",
678
+ cot_text: str = "",
679
+ seeds: Optional[List[int]] = None,
680
+ ) -> Union[str, List[str]]:
681
+ """
682
+ Unified PyTorch generation function supporting both single and batch modes.
683
+ Accepts either a single formatted prompt (str) or a list of formatted prompts (List[str]).
684
+ Returns a single string for single mode, or a list of strings for batch mode.
685
+ Note: the PyTorch backend processes batch items sequentially, since it does not support efficient true batched generation.
686
+ """
687
+ # Determine if batch mode
688
+ formatted_prompt_list, is_batch = self._normalize_batch_input(formatted_prompts)
689
+
690
+ # For batch mode, process each item sequentially with different seeds
691
+ if is_batch:
692
+ output_texts = []
693
+ for i, formatted_prompt in enumerate(formatted_prompt_list):
694
+ # Set seed for this item if provided
695
+ if seeds and i < len(seeds):
696
+ torch.manual_seed(seeds[i])
697
+ if torch.cuda.is_available():
698
+ torch.cuda.manual_seed_all(seeds[i])
699
+
700
+ # Generate using single-item method with batch-mode defaults
701
+ output_text = self._run_pt_single(
702
+ formatted_prompt=formatted_prompt,
703
+ temperature=temperature,
704
+ cfg_scale=cfg_scale,
705
+ negative_prompt=negative_prompt,
706
+ top_k=top_k,
707
+ top_p=top_p,
708
+ repetition_penalty=repetition_penalty,
709
+ use_constrained_decoding=use_constrained_decoding,
710
+ constrained_decoding_debug=constrained_decoding_debug,
711
+ target_duration=target_duration,
712
+ user_metadata=None,
713
+ stop_at_reasoning=False,
714
+ skip_genres=True,
715
+ skip_caption=True,
716
+ skip_language=True,
717
+ generation_phase=generation_phase,
718
+ caption=caption,
719
+ lyrics=lyrics,
720
+ cot_text=cot_text,
721
+ )
722
+
723
+ output_texts.append(output_text)
724
+
725
+ return output_texts
726
+
727
+ # Single mode: process the formatted prompt
728
+ formatted_prompt = formatted_prompt_list[0]
729
+
730
+ return self._run_pt_single(
731
+ formatted_prompt=formatted_prompt,
732
+ temperature=temperature,
733
+ cfg_scale=cfg_scale,
734
+ negative_prompt=negative_prompt,
735
+ top_k=top_k,
736
+ top_p=top_p,
737
+ repetition_penalty=repetition_penalty,
738
+ use_constrained_decoding=use_constrained_decoding,
739
+ constrained_decoding_debug=constrained_decoding_debug,
740
+ target_duration=target_duration,
741
+ user_metadata=user_metadata,
742
+ stop_at_reasoning=stop_at_reasoning,
743
+ skip_genres=skip_genres,
744
+ skip_caption=skip_caption,
745
+ skip_language=skip_language,
746
+ generation_phase=generation_phase,
747
+ caption=caption,
748
+ lyrics=lyrics,
749
+ cot_text=cot_text,
750
+ )
751
+
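In batch mode the PyTorch path reseeds the RNG before each item so runs are reproducible yet diverse. A stripped-down sketch of that per-item seeding (the generation call itself is replaced by a placeholder):

import random
import torch

def seeded_sequential_generation(prompts, seeds=None):
    """Each item gets its own seed; identical seeds reproduce identical samples."""
    if seeds is None:
        seeds = [random.randint(0, 2**32 - 1) for _ in prompts]
    outputs = []
    for prompt, seed in zip(prompts, seeds):
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
        # Placeholder for the actual single-item generation call.
        outputs.append((prompt, torch.rand(1).item()))
    return outputs

print(seeded_sequential_generation(["a", "b"], seeds=[1, 2]))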
752
  def has_all_metas(self, user_metadata: Optional[Dict[str, Optional[str]]]) -> bool:
753
  """Check if all required metadata are present."""
754
  if user_metadata is None:
 
799
  use_cot_caption: bool = True,
800
  use_cot_language: bool = True,
801
  is_format_caption: bool = False,
802
+ batch_size: Optional[int] = None,
803
+ seeds: Optional[List[int]] = None,
804
+ ) -> Union[Tuple[Dict[str, Any], str, str], Tuple[List[Dict[str, Any]], List[str], str]]:
805
  """Two-phase LM generation: CoT generation followed by audio codes generation.
806
 
807
  - infer_type='dit': Phase 1 only - generate CoT and return metas (no audio codes)
 
814
  If specified, constrained decoding will inject these values directly.
815
  use_cot_caption: Whether to generate caption in CoT (default True).
816
  use_cot_language: Whether to generate language in CoT (default True).
817
+ batch_size: Optional batch size for batch generation. If None or 1, returns single result.
818
+ If > 1, returns batch results (lists).
819
+ seeds: Optional list of seeds for batch generation (for reproducibility).
820
+ Only used when batch_size > 1.
821
+
822
+ Returns:
823
+ If batch_size is None or 1: (metadata, audio_codes, status_msg)
824
+ If batch_size > 1: (metadata_list, audio_codes_list, status_msg)
825
  """
826
  import time
827
+ import random
828
 
829
  infer_type = (infer_type or "").strip().lower()
830
  if infer_type not in {"dit", "llm_dit"}:
831
+ if batch_size and batch_size > 1:
832
+ return [], [], f"❌ invalid infer_type: {infer_type!r} (expected 'dit' or 'llm_dit')"
833
  return {}, "", f"❌ invalid infer_type: {infer_type!r} (expected 'dit' or 'llm_dit')"
834
+
835
+ # Determine if batch mode
836
+ is_batch = batch_size and batch_size > 1
837
+ actual_batch_size = batch_size if is_batch else 1
838
+
839
+ # Initialize variables
840
  metadata = {}
841
  audio_codes = ""
842
  has_all_metas = self.has_all_metas(user_metadata)
 
 
843
  phase1_time = 0.0
844
  phase2_time = 0.0
845
 
846
+ # Handle seeds for batch mode
847
+ if is_batch:
848
+ if seeds is None:
849
+ seeds = [random.randint(0, 2**32 - 1) for _ in range(actual_batch_size)]
850
+ elif len(seeds) < actual_batch_size:
851
+ seeds = list(seeds) + [random.randint(0, 2**32 - 1) for _ in range(actual_batch_size - len(seeds))]
852
+ else:
853
+ seeds = seeds[:actual_batch_size]
854
+
855
  # ========== PHASE 1: CoT Generation ==========
856
+ # Skip CoT if all metadata are user-provided OR caption is already formatted
857
+ if not has_all_metas and not is_format_caption:
858
+ if is_batch:
859
+ logger.info("Batch Phase 1: Generating CoT metadata (once for all items)...")
860
+ else:
861
+ logger.info("Phase 1: Generating CoT metadata...")
862
  phase1_start = time.time()
863
 
864
  # Build formatted prompt for CoT phase
865
  formatted_prompt = self.build_formatted_prompt(caption, lyrics, generation_phase="cot")
866
+
867
  logger.info(f"generate_with_stop_condition: formatted_prompt={formatted_prompt}")
868
  # Generate CoT (stop at </think>)
869
  cot_output_text, status = self.generate_from_formatted_prompt(
 
893
  phase1_time = time.time() - phase1_start
894
 
895
  if not cot_output_text:
896
+ if is_batch:
897
+ return [], [], status
898
  return {}, "", status
899
 
900
  # Parse metadata from CoT output
901
  metadata, _ = self.parse_lm_output(cot_output_text)
902
+ if is_batch:
903
+ logger.info(f"Batch Phase 1 completed in {phase1_time:.2f}s. Generated metadata: {list(metadata.keys())}")
904
+ else:
905
+ logger.info(f"Phase 1 completed in {phase1_time:.2f}s. Generated metadata: {list(metadata.keys())}")
906
  else:
907
  # Use user-provided metadata
908
+ if is_batch:
909
+ logger.info("Batch Phase 1: Using user-provided metadata (skipping generation)")
910
+ else:
911
+ logger.info("Phase 1: Using user-provided metadata (skipping generation)")
912
  metadata = {k: v for k, v in user_metadata.items() if v is not None}
913
 
914
  # If infer_type is 'dit', stop here and return only metadata
915
  if infer_type == "dit":
916
+ if is_batch:
917
+ metadata_list = [metadata.copy() for _ in range(actual_batch_size)]
918
+ status_msg = f"✅ Generated CoT metadata successfully (batch mode)\nFields: {', '.join(metadata.keys())}\nPhase1: {phase1_time:.2f}s"
919
+ return metadata_list, [""] * actual_batch_size, status_msg
920
+ else:
921
+ status_msg = f"✅ Generated CoT metadata successfully\nFields: {', '.join(metadata.keys())}\nPhase1: {phase1_time:.2f}s"
922
+ return metadata, "", status_msg
923
 
924
  # ========== PHASE 2: Audio Codes Generation ==========
925
+ if is_batch:
926
+ logger.info(f"Batch Phase 2: Generating audio codes for {actual_batch_size} items...")
927
+ else:
928
+ logger.info("Phase 2: Generating audio codes...")
929
  phase2_start = time.time()
930
 
931
  # Format metadata as CoT using YAML (matching training format)
 
934
  # Build formatted prompt with CoT for codes generation phase
935
  formatted_prompt_with_cot = self.build_formatted_prompt_with_cot(caption, lyrics, cot_text)
936
  logger.info(f"generate_with_stop_condition: formatted_prompt_with_cot={formatted_prompt_with_cot}")
937
 
938
+ if is_batch:
939
+ # Batch mode: generate codes for all items
940
+ formatted_prompts = [formatted_prompt_with_cot] * actual_batch_size
941
 
942
+ # Call backend-specific batch generation
943
+ try:
944
+ if self.llm_backend == "vllm":
945
+ codes_outputs = self._run_vllm(
946
+ formatted_prompts=formatted_prompts,
947
+ temperature=temperature,
948
+ cfg_scale=cfg_scale,
949
+ negative_prompt=negative_prompt,
950
+ top_k=top_k,
951
+ top_p=top_p,
952
+ repetition_penalty=repetition_penalty,
953
+ use_constrained_decoding=use_constrained_decoding,
954
+ constrained_decoding_debug=constrained_decoding_debug,
955
+ target_duration=target_duration,
956
+ generation_phase="codes",
957
+ caption=caption,
958
+ lyrics=lyrics,
959
+ cot_text=cot_text,
960
+ seeds=seeds,
961
+ )
962
+ else: # pt backend
963
+ codes_outputs = self._run_pt(
964
+ formatted_prompts=formatted_prompts,
965
+ temperature=temperature,
966
+ cfg_scale=cfg_scale,
967
+ negative_prompt=negative_prompt,
968
+ top_k=top_k,
969
+ top_p=top_p,
970
+ repetition_penalty=repetition_penalty,
971
+ use_constrained_decoding=use_constrained_decoding,
972
+ constrained_decoding_debug=constrained_decoding_debug,
973
+ target_duration=target_duration,
974
+ generation_phase="codes",
975
+ caption=caption,
976
+ lyrics=lyrics,
977
+ cot_text=cot_text,
978
+ seeds=seeds,
979
+ )
980
+ except Exception as e:
981
+ error_msg = f"❌ Error in batch codes generation: {str(e)}"
982
+ logger.error(error_msg)
983
+ return [], [], error_msg
984
+
985
+ # Parse audio codes from each output
986
+ audio_codes_list = []
987
+ metadata_list = []
988
+ for output_text in codes_outputs:
989
+ _, audio_codes_item = self.parse_lm_output(output_text)
990
+ audio_codes_list.append(audio_codes_item)
991
+ metadata_list.append(metadata.copy()) # Same metadata for all
992
+
993
+ phase2_time = time.time() - phase2_start
994
+
995
+ # Log results
996
+ codes_counts = [len(codes.split('<|audio_code_')) - 1 if codes else 0 for codes in audio_codes_list]
997
+ logger.info(f"Batch Phase 2 completed in {phase2_time:.2f}s. Generated codes: {codes_counts}")
998
+
999
+ status_msg = f"✅ Batch generation completed ({actual_batch_size} items)\nPhase 1: CoT metadata\nPhase 2: {sum(codes_counts)} total codes ({codes_counts})\nPhase1: {phase1_time:.2f}s, Phase2: {phase2_time:.2f}s"
1000
+ return metadata_list, audio_codes_list, status_msg
1001
+ else:
1002
+ # Single mode: generate codes for one item
1003
+ codes_output_text, status = self.generate_from_formatted_prompt(
1004
+ formatted_prompt=formatted_prompt_with_cot,
1005
+ cfg={
1006
+ "temperature": temperature,
1007
+ "cfg_scale": cfg_scale,
1008
+ "negative_prompt": negative_prompt,
1009
+ "top_k": top_k,
1010
+ "top_p": top_p,
1011
+ "repetition_penalty": repetition_penalty,
1012
+ "target_duration": target_duration,
1013
+ "user_metadata": None, # No user metadata injection in Phase 2
1014
+ "skip_caption": True, # Skip caption since CoT is already included
1015
+ "skip_language": True, # Skip language since CoT is already included
1016
+ "generation_phase": "codes",
1017
+ # Pass context for building unconditional prompt in codes phase
1018
+ "caption": caption,
1019
+ "lyrics": lyrics,
1020
+ "cot_text": cot_text,
1021
+ },
1022
  use_constrained_decoding=use_constrained_decoding,
1023
  constrained_decoding_debug=constrained_decoding_debug,
1024
+ stop_at_reasoning=False, # Generate codes until EOS
1025
  )
1026
 
1027
+ if not codes_output_text:
1028
+ return metadata, "", status
1029
 
1030
+ phase2_time = time.time() - phase2_start
 
1031
 
1032
+ # Parse audio codes from output (metadata should be same as Phase 1)
1033
+ _, audio_codes = self.parse_lm_output(codes_output_text)
1034
+
1035
+ codes_count = len(audio_codes.split('<|audio_code_')) - 1 if audio_codes else 0
1036
+ logger.info(f"Phase 2 completed in {phase2_time:.2f}s. Generated {codes_count} audio codes")
1037
+
1038
+ status_msg = f"✅ Generated successfully (2-phase)\nPhase 1: CoT metadata\nPhase 2: {codes_count} audio codes\nPhase1: {phase1_time:.2f}s, Phase2: {phase2_time:.2f}s"
1039
+ return metadata, audio_codes, status_msg
1040
+
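Because the return shape now depends on batch_size, callers that want uniform handling can normalize both shapes to the batch form. An illustrative helper, not part of LLMHandler:

from typing import Any, Dict, List, Tuple, Union

SingleResult = Tuple[Dict[str, Any], str, str]
BatchResult = Tuple[List[Dict[str, Any]], List[str], str]

def as_batch(result: Union[SingleResult, BatchResult]) -> BatchResult:
    """Normalize the unified generate_with_stop_condition return value to batch shape."""
    metadata, codes, status = result
    if isinstance(metadata, dict):
        return [metadata], [codes], status
    return metadata, codes, status

# A single-mode result becomes a one-item batch; batch results pass through untouched.
print(as_batch(({"bpm": "120"}, "<|audio_code_1|>", "ok")))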
1041
  def build_formatted_prompt(self, caption: str, lyrics: str = "", is_negative_prompt: bool = False, generation_phase: str = "cot", negative_prompt: str = "NO USER INPUT") -> str:
1042
  """
1043
  Build the chat-formatted prompt for 5Hz LM from caption/lyrics.
 
1059
  if is_negative_prompt:
1060
  # Unconditional prompt for CFG
1061
  # Check if user provided a meaningful negative prompt (not the default)
1062
+ has_negative_prompt = self._has_meaningful_negative_prompt(negative_prompt)
1063
 
1064
  if generation_phase == "cot":
1065
  # CoT phase unconditional prompt
 
1110
  if is_negative_prompt:
1111
  # Unconditional prompt for codes phase
1112
  # Check if user provided a meaningful negative prompt
1113
+ has_negative_prompt = self._has_meaningful_negative_prompt(negative_prompt)
1114
 
1115
  # Use empty CoT for unconditional
1116
  cot_for_prompt = "<think>\n</think>"
 
1393
 
1394
  try:
1395
  if self.llm_backend == "vllm":
1396
+ output_text = self._run_vllm(
1397
+ formatted_prompts=formatted_prompt,
1398
  temperature=temperature,
1399
  cfg_scale=cfg_scale,
1400
  negative_prompt=negative_prompt,
 
1417
  return output_text, f"✅ Generated successfully (vllm) | length={len(output_text)}"
1418
 
1419
  # PyTorch backend
1420
+ output_text = self._run_pt(
1421
+ formatted_prompts=formatted_prompt,
1422
  temperature=temperature,
1423
  cfg_scale=cfg_scale,
1424
  negative_prompt=negative_prompt,
 
1483
  eos_token_id = pad_token_id
1484
 
1485
  # Build logits processor for repetition penalty
1486
+ logits_processor = self._build_logits_processor(repetition_penalty)
 
 
1487
 
1488
  with torch.no_grad():
1489
  for step in range(max_new_tokens):
1490
  # Forward pass
1491
+ outputs = self._forward_pass(model, generated_ids, model_kwargs, past_key_values, use_cache)
1492
 
1493
  # Get logits for the last position
1494
  next_token_logits = outputs.logits[:, -1, :] # [batch_size, vocab_size]
 
1501
  for processor in logits_processor:
1502
  next_token_logits = processor(generated_ids, next_token_logits)
1503
 
1504
+ # Apply top-k and top-p filtering
1505
+ next_token_logits = self._apply_top_k_filter(next_token_logits, top_k)
1506
+ next_token_logits = self._apply_top_p_filter(next_token_logits, top_p)
1507
 
1508
  # Apply temperature and sample
1509
+ next_tokens = self._sample_tokens(next_token_logits, temperature)
1510
 
1511
  # Update constrained processor state
1512
+ self._update_constrained_processor_state(constrained_processor, next_tokens)
 
 
1513
 
1514
  # Check for EOS token
1515
+ should_stop = self._check_eos_token(next_tokens, eos_token_id, pad_token_id)
1516
 
1517
  # Append token to sequence
1518
  next_tokens_unsqueezed = next_tokens.unsqueeze(1)
 
1588
  eos_token_id = pad_token_id
1589
 
1590
  # Build logits processor for non-CFG operations (repetition penalty, top_k, top_p)
1591
+ logits_processor = self._build_logits_processor(repetition_penalty)
 
 
1592
 
1593
  with torch.no_grad():
1594
  for step in range(max_new_tokens):
1595
  # Forward pass for the entire batch (conditional + unconditional)
1596
+ outputs = self._forward_pass(model, generated_ids, model_kwargs, past_key_values, use_cache)
1597
 
1598
  # Get logits for the last position
1599
  next_token_logits = outputs.logits[:, -1, :] # [batch_size*2, vocab_size]
 
1616
  for processor in logits_processor:
1617
  cfg_logits = processor(current_input_ids, cfg_logits)
1618
 
1619
+ # Apply top-k and top-p filtering
1620
+ cfg_logits = self._apply_top_k_filter(cfg_logits, top_k)
1621
+ cfg_logits = self._apply_top_p_filter(cfg_logits, top_p)
1622
 
1623
  # Apply temperature and sample
1624
+ next_tokens = self._sample_tokens(cfg_logits, temperature)
1625
 
1626
  # Update constrained processor state AFTER sampling
1627
+ self._update_constrained_processor_state(constrained_processor, next_tokens)
 
 
1628
 
1629
  # Check for EOS token in conditional sequences BEFORE unsqueezing
1630
  # Stop if any conditional sequence generates EOS token
1631
  # next_tokens shape: [batch_size] (only conditional tokens)
1632
+ should_stop = self._check_eos_token(next_tokens, eos_token_id, pad_token_id)
1633
 
1634
  # Apply the same sampled tokens to both conditional and unconditional sequences
1635
  next_tokens_unsqueezed = next_tokens.unsqueeze(1)
acestep/third_parts/nano-vllm/nanovllm/engine/model_runner.py CHANGED
@@ -68,10 +68,16 @@ class ModelRunner:
68
  self.model = Qwen3ForCausalLM(hf_config)
69
  load_model(self.model, config.model)
70
  self.sampler = Sampler()
71
  self.warmup_model()
72
  self.allocate_kv_cache()
73
  if not self.enforce_eager:
74
  self.capture_cudagraph()
 
75
  torch.set_default_device("cpu")
76
  torch.set_default_dtype(default_dtype)
77
 
@@ -84,6 +90,24 @@ class ModelRunner:
84
  self.shm = SharedMemory(name="nanovllm")
85
  self.loop()
86
 
87
  def exit(self):
88
  if self.world_size > 1:
89
  self.shm.close()
@@ -216,57 +240,49 @@ class ModelRunner:
216
  return input_ids, positions
217
 
218
  def prepare_decode(self, seqs: list[Sequence]):
219
- input_ids = []
220
- positions = []
221
- slot_mapping = []
222
- context_lens = []
223
- for seq in seqs:
224
- input_ids.append(seq.last_token)
225
- positions.append(len(seq) - 1)
226
- context_lens.append(len(seq))
227
- slot_mapping.append(seq.block_table[-1] * self.block_size + seq.last_block_num_tokens - 1)
228
- input_ids = torch.tensor(input_ids, dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
229
- positions = torch.tensor(positions, dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
230
- slot_mapping = torch.tensor(slot_mapping, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
231
- context_lens = torch.tensor(context_lens, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
 
 
232
  block_tables = self.prepare_block_tables(seqs)
233
  set_context(False, slot_mapping=slot_mapping, context_lens=context_lens, block_tables=block_tables)
234
  return input_ids, positions
235
 
236
  def prepare_sample(self, seqs: list[Sequence], is_cfg_batch: bool = False):
237
- """Prepare sampling parameters. For CFG batch, only return parameters for conditional sequences."""
238
  if is_cfg_batch:
239
- # For CFG batch, seqs contains [cond_seq1, cond_seq2, ..., uncond_seq1, uncond_seq2, ...]
240
- # We only need parameters for conditional sequences (first half)
241
- num_cond = len(seqs) // 2
242
- temperatures = []
243
- cfg_scales = []
244
- top_ks = []
245
- top_ps = []
246
- repetition_penalties = []
247
- for seq in seqs[:num_cond]:
248
- temperatures.append(seq.temperature)
249
- cfg_scales.append(seq.cfg_scale)
250
- top_ks.append(seq.top_k if seq.top_k is not None else 0)
251
- top_ps.append(seq.top_p if seq.top_p is not None else 1.0)
252
- repetition_penalties.append(seq.repetition_penalty)
253
  else:
254
- temperatures = []
255
- cfg_scales = []
256
- top_ks = []
257
- top_ps = []
258
- repetition_penalties = []
259
- for seq in seqs:
260
- temperatures.append(seq.temperature)
261
- cfg_scales.append(seq.cfg_scale)
262
- top_ks.append(seq.top_k if seq.top_k is not None else 0)
263
- top_ps.append(seq.top_p if seq.top_p is not None else 1.0)
264
- repetition_penalties.append(seq.repetition_penalty)
265
- temperatures = torch.tensor(temperatures, dtype=torch.float32, pin_memory=True).cuda(non_blocking=True)
266
- cfg_scales = torch.tensor(cfg_scales, dtype=torch.float32, pin_memory=True).cuda(non_blocking=True)
267
- top_ks = torch.tensor(top_ks, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
268
- top_ps = torch.tensor(top_ps, dtype=torch.float32, pin_memory=True).cuda(non_blocking=True)
269
- repetition_penalties = torch.tensor(repetition_penalties, dtype=torch.float32, pin_memory=True).cuda(non_blocking=True)
 
 
270
  return temperatures, cfg_scales, top_ks, top_ps, repetition_penalties
271
 
272
  @torch.inference_mode()
 
68
  self.model = Qwen3ForCausalLM(hf_config)
69
  load_model(self.model, config.model)
70
  self.sampler = Sampler()
71
+
72
+ # Pre-allocate buffers for sampling (optimization: avoid repeated tensor creation)
73
+ # Must be called before warmup_model() since it uses these buffers
74
+ self._allocate_sample_buffers()
75
+
76
  self.warmup_model()
77
  self.allocate_kv_cache()
78
  if not self.enforce_eager:
79
  self.capture_cudagraph()
80
+
81
  torch.set_default_device("cpu")
82
  torch.set_default_dtype(default_dtype)
83
 
 
90
  self.shm = SharedMemory(name="nanovllm")
91
  self.loop()
92
 
93
+ def _allocate_sample_buffers(self):
94
+ """Pre-allocate reusable buffers for sampling to avoid repeated tensor creation."""
95
+ max_bs = self.config.max_num_seqs
96
+
97
+ # Pre-allocate pinned memory buffers on CPU for fast transfer
98
+ # Must explicitly specify device="cpu" since default device may be "cuda"
99
+ self._cpu_temperatures = torch.zeros(max_bs, dtype=torch.float32, device="cpu", pin_memory=True)
100
+ self._cpu_cfg_scales = torch.zeros(max_bs, dtype=torch.float32, device="cpu", pin_memory=True)
101
+ self._cpu_top_ks = torch.zeros(max_bs, dtype=torch.int32, device="cpu", pin_memory=True)
102
+ self._cpu_top_ps = torch.zeros(max_bs, dtype=torch.float32, device="cpu", pin_memory=True)
103
+ self._cpu_repetition_penalties = torch.zeros(max_bs, dtype=torch.float32, device="cpu", pin_memory=True)
104
+
105
+ # Pre-allocate decode buffers on CPU with pinned memory
106
+ self._cpu_input_ids = torch.zeros(max_bs, dtype=torch.int64, device="cpu", pin_memory=True)
107
+ self._cpu_positions = torch.zeros(max_bs, dtype=torch.int64, device="cpu", pin_memory=True)
108
+ self._cpu_slot_mapping = torch.zeros(max_bs, dtype=torch.int32, device="cpu", pin_memory=True)
109
+ self._cpu_context_lens = torch.zeros(max_bs, dtype=torch.int32, device="cpu", pin_memory=True)
110
+
111
  def exit(self):
112
  if self.world_size > 1:
113
  self.shm.close()
 
240
  return input_ids, positions
241
 
242
  def prepare_decode(self, seqs: list[Sequence]):
243
+ """Optimized decode preparation using pre-allocated buffers."""
244
+ bs = len(seqs)
245
+
246
+ # Use pre-allocated CPU buffers
247
+ for i, seq in enumerate(seqs):
248
+ self._cpu_input_ids[i] = seq.last_token
249
+ self._cpu_positions[i] = len(seq) - 1
250
+ self._cpu_context_lens[i] = len(seq)
251
+ self._cpu_slot_mapping[i] = seq.block_table[-1] * self.block_size + seq.last_block_num_tokens - 1
252
+
253
+ # Transfer to GPU using sliced views
254
+ input_ids = self._cpu_input_ids[:bs].cuda(non_blocking=True)
255
+ positions = self._cpu_positions[:bs].cuda(non_blocking=True)
256
+ slot_mapping = self._cpu_slot_mapping[:bs].cuda(non_blocking=True)
257
+ context_lens = self._cpu_context_lens[:bs].cuda(non_blocking=True)
258
  block_tables = self.prepare_block_tables(seqs)
259
  set_context(False, slot_mapping=slot_mapping, context_lens=context_lens, block_tables=block_tables)
260
  return input_ids, positions
261
 
262
  def prepare_sample(self, seqs: list[Sequence], is_cfg_batch: bool = False):
263
+ """Optimized sample preparation using pre-allocated buffers."""
264
  if is_cfg_batch:
265
+ num_seqs = len(seqs) // 2
266
+ target_seqs = seqs[:num_seqs]
267
  else:
268
+ num_seqs = len(seqs)
269
+ target_seqs = seqs
270
+
271
+ # Fill pre-allocated CPU buffers
272
+ for i, seq in enumerate(target_seqs):
273
+ self._cpu_temperatures[i] = seq.temperature
274
+ self._cpu_cfg_scales[i] = seq.cfg_scale
275
+ self._cpu_top_ks[i] = seq.top_k if seq.top_k is not None else 0
276
+ self._cpu_top_ps[i] = seq.top_p if seq.top_p is not None else 1.0
277
+ self._cpu_repetition_penalties[i] = seq.repetition_penalty if seq.repetition_penalty is not None else 1.0
278
+
279
+ # Transfer to GPU using sliced views (single batched transfer)
280
+ temperatures = self._cpu_temperatures[:num_seqs].cuda(non_blocking=True)
281
+ cfg_scales = self._cpu_cfg_scales[:num_seqs].cuda(non_blocking=True)
282
+ top_ks = self._cpu_top_ks[:num_seqs].cuda(non_blocking=True)
283
+ top_ps = self._cpu_top_ps[:num_seqs].cuda(non_blocking=True)
284
+ repetition_penalties = self._cpu_repetition_penalties[:num_seqs].cuda(non_blocking=True)
285
+
286
  return temperatures, cfg_scales, top_ks, top_ps, repetition_penalties
287
 
288
  @torch.inference_mode()
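
For readers skimming the diff above: the optimized prepare_decode/prepare_sample path allocates page-locked (pinned) CPU tensors once, fills only the leading slice in place each step, and copies that slice to the GPU with non_blocking=True. A minimal standalone sketch of the same idea follows; the buffer size and helper name are illustrative assumptions, not part of this commit.

import torch

MAX_BS = 256  # assumed upper bound on batch size, analogous to config.max_num_seqs

# Allocate once: pinned (page-locked) host memory enables asynchronous H2D copies.
cpu_buf = torch.zeros(MAX_BS, dtype=torch.int64, device="cpu", pin_memory=True)

def to_gpu(values):
    """Fill the reusable pinned buffer in place, then copy only the used slice."""
    bs = len(values)
    for i, v in enumerate(values):
        cpu_buf[i] = v
    # non_blocking=True lets the copy overlap with other CUDA work
    return cpu_buf[:bs].cuda(non_blocking=True)

if torch.cuda.is_available():
    print(to_gpu([7, 42, 3]))  # tensor([ 7, 42,  3], device='cuda:0')
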
acestep/third_parts/nano-vllm/nanovllm/layers/sampler.py CHANGED
@@ -3,12 +3,88 @@ from torch import nn
3
  from typing import Optional
4
 
5
 
6
  class Sampler(nn.Module):
7
 
8
  def __init__(self):
9
  super().__init__()
10
 
11
- @torch.compile
12
  def forward(
13
  self,
14
  logits: torch.Tensor,
@@ -19,56 +95,34 @@ class Sampler(nn.Module):
19
  input_ids: Optional[torch.Tensor] = None,
20
  ):
21
  """
22
- Sample tokens from logits with optional top-k, top-p, and repetition penalty.
23
 
24
- Args:
25
- logits: [batch_size, vocab_size] logits tensor
26
- temperatures: [batch_size] temperature values
27
- top_ks: Optional [batch_size] top-k values (None or 0 means no top-k filtering)
28
- top_ps: Optional [batch_size] top-p values (None or 1.0 means no top-p filtering)
29
- repetition_penalties: Optional [batch_size] repetition penalty values (1.0 means no penalty)
30
- input_ids: Optional [batch_size, seq_len] input token ids for repetition penalty
31
  """
32
- batch_size, vocab_size = logits.shape
33
-
34
- # Note: Repetition penalty is applied in ModelRunner before calling sampler
35
- # This allows us to use the full sequence context
36
-
37
  # Apply temperature
38
  logits = logits.float().div_(temperatures.unsqueeze(dim=1))
39
 
40
- # Apply top-k filtering if specified
41
- if top_ks is not None:
42
- for i in range(batch_size):
43
- top_k = top_ks[i].item()
44
- if top_k > 0 and top_k < vocab_size:
45
- # Get top-k logits, set others to -inf
46
- top_k_logits, top_k_indices = torch.topk(logits[i], int(top_k), dim=-1)
47
- filtered_logits = torch.full_like(logits[i], float('-inf'))
48
- filtered_logits[top_k_indices] = top_k_logits
49
- logits[i] = filtered_logits
50
 
51
- # Apply top-p (nucleus) filtering if specified
52
- if top_ps is not None:
53
- probs = torch.softmax(logits, dim=-1)
54
- for i in range(batch_size):
55
- top_p = top_ps[i].item()
56
- if 0.0 < top_p < 1.0:
57
- # Sort probabilities in descending order
58
- sorted_probs, sorted_indices = torch.sort(probs[i], descending=True)
59
- # Calculate cumulative probabilities
60
- cumsum_probs = torch.cumsum(sorted_probs, dim=-1)
61
- # Find the cutoff point
62
- cutoff_idx = (cumsum_probs <= top_p).sum().item()
63
- if cutoff_idx < len(sorted_indices):
64
- cutoff_idx += 1 # Include one more token to ensure we have at least one
65
- # Create mask for tokens to keep
66
- mask = torch.zeros_like(probs[i])
67
- mask[sorted_indices[:cutoff_idx]] = 1.0
68
- # Apply mask: set filtered tokens to -inf
69
- logits[i] = torch.where(mask > 0, logits[i], torch.tensor(float('-inf'), device=logits.device))
70
 
71
- # Sample using Gumbel-max trick (equivalent to sampling from softmax)
72
- probs = torch.softmax(logits, dim=-1)
73
- sample_tokens = probs.div_(torch.empty_like(probs).exponential_(1).clamp_min_(1e-10)).argmax(dim=-1)
74
- return sample_tokens
 
3
  from typing import Optional
4
 
5
 
6
+ def apply_top_k_top_p(
7
+ logits: torch.Tensor,
8
+ k: Optional[torch.Tensor],
9
+ p: Optional[torch.Tensor],
10
+ ) -> torch.Tensor:
11
+ """Apply top-k and top-p masks to the logits (vLLM style).
12
+
13
+ The logits tensor is updated in-place.
14
+ """
15
+ if p is None:
16
+ if k is None:
17
+ return logits
18
+ # Avoid sorting vocab for top-k only case
19
+ return apply_top_k_only(logits, k)
20
+
21
+ # Need to sort for top-p
22
+ logits_sort, logits_idx = logits.sort(dim=-1, descending=False)
23
+
24
+ if k is not None:
25
+ # Apply top-k first
26
+ vocab_size = logits_sort.size(1)
27
+ # Clamp k to valid range
28
+ k_clamped = k.clamp(1, vocab_size).long()
29
+ top_k_mask_idx = vocab_size - k_clamped # shape: [B]
30
+ # Get the threshold value for each batch
31
+ top_k_thresh = logits_sort.gather(1, top_k_mask_idx.unsqueeze(1))
32
+ top_k_mask = logits_sort < top_k_thresh
33
+ logits_sort.masked_fill_(top_k_mask, float('-inf'))
34
+
35
+ # Apply top-p
36
+ probs_sort = logits_sort.softmax(dim=-1)
37
+ probs_sum = torch.cumsum(probs_sort, dim=-1, out=probs_sort) # reuse buffer
38
+ top_p_mask = probs_sum <= (1.0 - p.unsqueeze(1))
39
+ # Ensure at least one token is kept
40
+ top_p_mask[:, -1] = False
41
+ logits_sort.masked_fill_(top_p_mask, float('-inf'))
42
+
43
+ # Re-sort back to original positions
44
+ logits.scatter_(dim=-1, index=logits_idx, src=logits_sort)
45
+ return logits
46
+
47
+
48
+ def apply_top_k_only(
49
+ logits: torch.Tensor,
50
+ k: torch.Tensor,
51
+ ) -> torch.Tensor:
52
+ """Apply top-k mask without sorting the entire vocab (vLLM style).
53
+
54
+ This is much faster than sorting for top-k only cases.
55
+ The logits tensor is updated in-place.
56
+ """
57
+ vocab_size = logits.shape[1]
58
+ # Handle cases where k >= vocab_size (no filtering needed)
59
+ no_top_k_mask = (k <= 0) | (k >= vocab_size)
60
+ # Set invalid k to 1 so we can still gather
61
+ k_safe = k.masked_fill(no_top_k_mask, 1).long()
62
+ # NOTE: This int() causes CPU-GPU sync, but torch.topk requires Python int
63
+ max_top_k = int(k_safe.max().clamp(max=vocab_size))
64
+
65
+ # Get top-k values for all batches
66
+ # topk.values has shape [batch_size, max_top_k]
67
+ topk_values = logits.topk(max_top_k, dim=1).values
68
+
69
+ # Convert k to 0-based index: we want the k-th largest value (index k-1)
70
+ # Clamp to valid range for gather
71
+ k_index = (k_safe - 1).clamp(0, max_top_k - 1).unsqueeze(1) # shape: [B, 1]
72
+ # Gather the threshold value (the k-th largest)
73
+ top_k_thresh = topk_values.gather(1, k_index)
74
+
75
+ # For rows with no top-k filtering, set threshold to -inf so nothing gets masked
76
+ top_k_thresh.masked_fill_(no_top_k_mask.unsqueeze(1), float('-inf'))
77
+
78
+ # Mask all values below the threshold
79
+ logits.masked_fill_(logits < top_k_thresh, float('-inf'))
80
+ return logits
81
+
82
+
83
  class Sampler(nn.Module):
84
 
85
  def __init__(self):
86
  super().__init__()
87
 
 
88
  def forward(
89
  self,
90
  logits: torch.Tensor,
 
95
  input_ids: Optional[torch.Tensor] = None,
96
  ):
97
  """
98
+ Sample tokens from logits with optional top-k and top-p filtering.
99
 
100
+ Condition checking is done OUTSIDE the compiled function to avoid
101
+ graph breaks from .any() calls.
102
  """
103
  # Apply temperature
104
  logits = logits.float().div_(temperatures.unsqueeze(dim=1))
105
 
106
+ # Check conditions OUTSIDE compiled code to avoid graph breaks
107
+ # These .any() calls cause CPU-GPU sync, but we do it once here
108
+ # instead of inside the compiled function
109
+ need_topk = top_ks is not None and bool((top_ks > 0).any()) and bool((top_ks < logits.shape[1]).any())
110
+ need_topp = top_ps is not None and bool((top_ps < 1.0).any()) and bool((top_ps > 0.0).any())
111
 
112
+ if need_topk or need_topp:
113
+ # Apply filtering (this part is not compiled due to dynamic control flow)
114
+ logits = apply_top_k_top_p(
115
+ logits,
116
+ top_ks if need_topk else None,
117
+ top_ps if need_topp else None,
118
+ )
119
 
120
+ # Sample using compiled function
121
+ return self._sample(logits)
122
+
123
+ @torch.compile(dynamic=True)
124
+ def _sample(self, logits: torch.Tensor) -> torch.Tensor:
125
+ """Compiled sampling kernel - no graph breaks here."""
126
+ probs = logits.softmax(dim=-1, dtype=torch.float32)
127
+ q = torch.empty_like(probs).exponential_()
128
+ return probs.div(q).argmax(dim=-1)
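
The _sample kernel above relies on the exponential-noise (Gumbel-max style) trick: dividing the softmax probabilities by i.i.d. Exponential(1) noise and taking the argmax selects token i with probability probs[i]. A small self-contained sanity check of that equivalence (a sketch only; the sample count is arbitrary):

import torch

torch.manual_seed(0)
probs = torch.tensor([0.1, 0.2, 0.7])

counts = torch.zeros(3)
for _ in range(20_000):
    # Same trick as Sampler._sample: argmax of probs / Exp(1) noise
    q = torch.empty_like(probs).exponential_()
    counts[probs.div(q).argmax()] += 1

print(counts / counts.sum())  # should be close to [0.1, 0.2, 0.7]
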
acestep/third_parts/nano-vllm/pyproject.toml CHANGED
@@ -15,8 +15,6 @@ dependencies = [
15
  "triton-windows>=3.0.0; sys_platform == 'win32'",
16
  "triton>=3.0.0; sys_platform != 'win32'",
17
  "transformers>=4.51.0",
18
- "flash-attn @ https://github.com/sdbds/flash-attention-for-windows/releases/download/2.8.3/flash_attn-2.8.3+cu128torch2.8.0cxx11abiFALSEfullbackward-cp311-cp311-win_amd64.whl; sys_platform == 'win32'",
19
- "flash-attn; sys_platform != 'win32'",
20
  "xxhash",
21
  ]
22
 
 
15
  "triton-windows>=3.0.0; sys_platform == 'win32'",
16
  "triton>=3.0.0; sys_platform != 'win32'",
17
  "transformers>=4.51.0",
 
  "xxhash",
19
  ]
20
 
profile_inference.py ADDED
@@ -0,0 +1,223 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Profiling script for acestep/inference.py using cProfile
4
+
5
+ Usage:
6
+ python profile_inference.py
7
+ python profile_inference.py --warmup
8
+ """
9
+
10
+ import cProfile
11
+ import pstats
12
+ import io
13
+ import time
14
+ import argparse
15
+ import sys
16
+ import os
17
+
18
+ # Add project root to path
19
+ project_root = os.path.abspath(os.path.dirname(__file__))
20
+ if project_root not in sys.path:
21
+ sys.path.insert(0, project_root)
22
+
23
+ from acestep.inference import generate_music, GenerationParams, GenerationConfig
24
+ from acestep.handler import AceStepHandler
25
+ from acestep.llm_inference import LLMHandler
26
+ import json
27
+ from typing import Tuple
28
+
29
+
30
+ def profile_with_cprofile(dit_handler, llm_handler, params, config, warmup=False):
31
+ """Profile using Python's built-in cProfile.
32
+
33
+ Args:
34
+ warmup: If True, run once for warmup before profiling (default: False)
35
+ """
36
+ print("=" * 80)
37
+ print("Profiling with cProfile")
38
+ print("=" * 80)
39
+
40
+ # Warmup run (to exclude PyTorch compilation overhead)
41
+ if warmup:
42
+ print("\n[Warmup] Running first generation to warm up (PyTorch compilation, etc.)...")
43
+ warmup_start = time.time()
44
+ params.use_cot_metas = False
45
+ config.is_format_caption = True
46
+ config.use_constrained_decoding = False
47
+ warmup_result = generate_music(dit_handler, llm_handler, params, config, save_dir="./")
48
+ warmup_time = time.time() - warmup_start
49
+ print(f"[Warmup] Completed in {warmup_time:.2f}s")
50
+ if not warmup_result.success:
51
+ print(f"[Warmup] ⚠ Warmup generation failed: {warmup_result.error}")
52
+ return warmup_result
53
+
54
+ # Actual profiling run (first inference)
55
+ print("\n[Profiling] Running first generation for profiling...")
56
+ profiler = cProfile.Profile()
57
+ profiler.enable()
58
+
59
+ profiling_start = time.time()
60
+ try:
61
+ result = generate_music(dit_handler, llm_handler, params, config, save_dir="./")
62
+ finally:
63
+ profiler.disable()
64
+ profiling_time = time.time() - profiling_start
65
+
66
+ # Create stats
67
+ s = io.StringIO()
68
+ ps = pstats.Stats(profiler, stream=s)
69
+ ps.sort_stats('cumulative')
70
+
71
+ print(f"\n[Profiling] Completed in {profiling_time:.2f}s")
72
+ print("\nTop 30 functions by cumulative time:")
73
+ print("-" * 80)
74
+ ps.print_stats(30)
75
+
76
+ print("\nTop 30 functions by total time:")
77
+ print("-" * 80)
78
+ ps.sort_stats('tottime')
79
+ ps.print_stats(30)
80
+
81
+ # Save detailed report to file
82
+ output_file = "profile_cprofile.txt"
83
+ with open(output_file, 'w') as f:
84
+ # Create a new Stats object with file as stream
85
+ ps_file = pstats.Stats(profiler, stream=f)
86
+ ps_file.sort_stats('cumulative')
87
+ ps_file.print_stats()
88
+ print(f"\nDetailed profile saved to: {output_file}")
89
+
90
+ return result
91
+
92
+
93
+ def main():
94
+ parser = argparse.ArgumentParser(description="Profile acestep/inference.py")
95
+ parser.add_argument(
96
+ "--checkpoint-dir",
97
+ type=str,
98
+ default="./checkpoints",
99
+ help="Path to checkpoints directory"
100
+ )
101
+ parser.add_argument(
102
+ "--config-path",
103
+ type=str,
104
+ default="acestep-v15-turbo-rl",
105
+ help="Model config path"
106
+ )
107
+ parser.add_argument(
108
+ "--device",
109
+ type=str,
110
+ default="cuda",
111
+ help="Device to use (cuda/cpu)"
112
+ )
113
+ parser.add_argument(
114
+ "--lm-model",
115
+ type=str,
116
+ default="acestep-5Hz-lm-0.6B-v3",
117
+ help="LM model path"
118
+ )
119
+ parser.add_argument(
120
+ "--lm-backend",
121
+ type=str,
122
+ default="vllm",
123
+ help="LM backend"
124
+ )
125
+ parser.add_argument(
126
+ "--warmup",
127
+ action="store_true",
128
+ help="Enable warmup run before profiling (default: False, profile first run)"
129
+ )
130
+
131
+ args = parser.parse_args()
132
+
133
+ # Initialize handlers
134
+ print("Initializing handlers...")
135
+ dit_handler = AceStepHandler()
136
+ llm_handler = LLMHandler()
137
+
138
+ # Initialize DiT
139
+ print(" - Initializing DiT model...")
140
+ status_dit, success_dit = dit_handler.initialize_service(
141
+ project_root=project_root,
142
+ config_path=args.config_path,
143
+ device=args.device,
144
+ )
145
+ if not success_dit:
146
+ print(f" ❌ DiT initialization failed: {status_dit}")
147
+ sys.exit(1)
148
+ print(" ✓ DiT model initialized")
149
+
150
+ # Initialize LLM
151
+ print(" - Initializing LLM model...")
152
+ status_llm, success_llm = llm_handler.initialize(
153
+ checkpoint_dir=args.checkpoint_dir,
154
+ lm_model_path=args.lm_model,
155
+ backend=args.lm_backend,
156
+ device=args.device,
157
+ )
158
+ if success_llm:
159
+ print(" ✓ LM model initialized")
160
+ else:
161
+ print(f" ⚠ LM initialization failed: {status_llm}")
162
+
163
+ # Load test parameters from example file (same as acestep/inference.py)
164
+ def load_example_config(example_file: str) -> Tuple[GenerationParams, GenerationConfig]:
165
+ """Load configuration from an example JSON file."""
166
+ try:
167
+ with open(example_file, 'r', encoding='utf-8') as f:
168
+ data = json.load(f)
169
+
170
+ # Convert example format to GenerationParams and GenerationConfig
171
+ # Handle time signature format (example uses "4" instead of "4/4")
172
+ time_sig = data.get('timesignature', '')
173
+
174
+ params = GenerationParams(
175
+ caption=data.get('caption', ''),
176
+ lyrics=data.get('lyrics', ''),
177
+ bpm=data.get('bpm'),
178
+ keyscale=data.get('keyscale', ''),
179
+ timesignature=time_sig,
180
+ vocal_language=data.get('language', 'unknown'),
181
+ duration=data.get('duration'),
182
+ thinking=data.get('think', False),
183
+ inference_steps=data.get('inference_steps', 8),
184
+ seed=42,
185
+ )
186
+
187
+ config = GenerationConfig()
188
+ config.batch_size = data.get('batch_size', 1)
189
+
190
+ return params, config
191
+
192
+ except Exception as e:
193
+ print(f" ⚠ Failed to load example file: {e}")
194
+ return None, None
195
+
196
+ # Load production example (same as acestep/inference.py)
197
+ example_file = os.path.join(project_root, "examples", "text2music", "example_05.json")
198
+
199
+ if not os.path.exists(example_file):
200
+ print(f"\n ❌ Example file not found: {example_file}")
201
+ print(" Please ensure the examples directory exists.")
202
+ sys.exit(1)
203
+
204
+ print(f"\n Loading example: {os.path.basename(example_file)}")
205
+ params, config = load_example_config(example_file)
206
+
207
+ if not params or not config:
208
+ print(" ❌ Failed to load example configuration")
209
+ sys.exit(1)
210
+
211
+ print("\n" + "=" * 80)
212
+ print("Starting profiling...")
213
+ print("=" * 80)
214
+
215
+ result = profile_with_cprofile(dit_handler, llm_handler, params, config, warmup=args.warmup)
216
+
217
+ if result and not result.success:
218
+ print(f"\n⚠ Generation failed: {result.error}")
219
+
220
+
221
+ if __name__ == "__main__":
222
+ main()
223
+
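
If the text report written by profile_inference.py is not enough, the same cProfile data can also be dumped in binary form and re-queried with pstats or explored in external viewers such as snakeviz. A short standalone sketch, under the assumption that a Profile instance wraps the call being measured (the squaring loop below is only a stand-in for the real generate_music call):

import cProfile
import pstats

profiler = cProfile.Profile()
profiler.enable()
sum(i * i for i in range(10_000))  # stand-in for the generate_music(...) call
profiler.disable()

# Binary dump; loadable by pstats and external viewers
profiler.dump_stats("profile_cprofile.prof")

# Reload later and query programmatically
stats = pstats.Stats("profile_cprofile.prof")
stats.sort_stats("cumulative").print_stats(10)
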