Spaces:
Running
on
A100
Running
on
A100
add lyric (LRC) support
Browse files- acestep/dit_alignment_score.py +547 -0
- acestep/gradio_ui/events/__init__.py +35 -11
- acestep/gradio_ui/events/results_handlers.py +130 -22
- acestep/gradio_ui/i18n/en.json +11 -8
- acestep/gradio_ui/i18n/ja.json +11 -8
- acestep/gradio_ui/i18n/zh.json +11 -8
- acestep/gradio_ui/interfaces/result.py +148 -57
- acestep/handler.py +259 -13
- pyproject.toml +3 -3
acestep/dit_alignment_score.py
ADDED
|
@@ -0,0 +1,547 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
DiT Alignment Score Module
|
| 3 |
+
|
| 4 |
+
This module provides lyrics-to-audio alignment using cross-attention matrices
|
| 5 |
+
from DiT model for generating LRC timestamps.
|
| 6 |
+
|
| 7 |
+
Refactored from lyrics_alignment_infos.py for integration with ACE-Step.
|
| 8 |
+
"""
|
| 9 |
+
import numba
|
| 10 |
+
import torch
|
| 11 |
+
import numpy as np
|
| 12 |
+
import torch.nn.functional as F
|
| 13 |
+
from dataclasses import dataclass, asdict
|
| 14 |
+
from typing import List, Dict, Any, Optional
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# ================= Data Classes =================
|
| 18 |
+
@dataclass
class TokenTimestamp:
    """Stores per-token timing information."""
    token_id: int       # vocabulary id of the lyrics token
    text: str           # decoded text this token contributes (may be "" for a partial UTF-8 byte token)
    start: float        # start time in seconds
    end: float          # end time in seconds
    probability: float  # alignment confidence; 0.0 when not computed
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@dataclass
class SentenceTimestamp:
    """Stores per-sentence timing information with token list."""
    text: str                     # stripped sentence text (decoded from all token ids at once)
    start: float                  # start of the first token, seconds, rounded to 3 decimals
    end: float                    # end of the last token, seconds, rounded to 3 decimals
    tokens: List[TokenTimestamp]  # the tokens that make up this sentence
    confidence: float             # mean of positive token probabilities; later min-max normalized
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ================= DTW Algorithm (Numba Optimized) =================
|
| 39 |
+
@numba.jit(nopython=True)
def dtw_cpu(x: np.ndarray):
    """
    Dynamic Time Warping algorithm optimized with Numba.

    Computes the minimum-cost monotonic path through the cost matrix and
    returns it via `_backtrace`.

    Args:
        x: Cost matrix of shape [N, M] (N text tokens, M time frames).

    Returns:
        Tuple of (text_indices, time_indices) arrays
    """
    N, M = x.shape
    # Use float32 for memory efficiency
    # cost[i, j] = cheapest accumulated cost to reach cell (i-1, j-1) of x;
    # the extra row/column 0 holds the +inf boundary, with only (0,0) free.
    cost = np.ones((N + 1, M + 1), dtype=np.float32) * np.inf
    # trace stores which predecessor won: 0 = diagonal, 1 = up, 2 = left.
    # (float dtype is fine here; values compared exactly in _backtrace.)
    trace = -np.ones((N + 1, M + 1), dtype=np.float32)
    cost[0, 0] = 0

    for j in range(1, M + 1):
        for i in range(1, N + 1):
            c0 = cost[i - 1, j - 1]
            c1 = cost[i - 1, j]
            c2 = cost[i, j - 1]

            # Ties fall through to c2 (move left), biasing the path toward
            # consuming time frames.
            if c0 < c1 and c0 < c2:
                c, t = c0, 0
            elif c1 < c0 and c1 < c2:
                c, t = c1, 1
            else:
                c, t = c2, 2

            cost[i, j] = x[i - 1, j - 1] + c
            trace[i, j] = t

    return _backtrace(trace, N, M)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
@numba.jit(nopython=True)
def _backtrace(trace: np.ndarray, N: int, M: int):
    """
    Optimized backtrace function for DTW.

    Walks the trace matrix from (N, M) back to (0, 0), emitting the optimal
    warping path in forward order.

    Args:
        trace: Trace matrix of shape (N+1, M+1); 0 = diagonal, 1 = up, 2 = left
        N, M: Original matrix dimensions

    Returns:
        Path array of shape (2, path_len) - first row is text indices, second is time indices
    """
    # Boundary handling: force moves along the borders so the walk always
    # terminates at (0, 0).
    trace[0, :] = 2
    trace[:, 0] = 1

    # Pre-allocate array, max path length is N+M; fill from the back so the
    # path comes out in forward order without a reverse pass.
    max_path_len = N + M
    path = np.zeros((2, max_path_len), dtype=np.int32)

    i, j = N, M
    path_idx = max_path_len - 1

    while i > 0 or j > 0:
        path[0, path_idx] = i - 1  # text index
        path[1, path_idx] = j - 1  # time index
        path_idx -= 1

        t = trace[i, j]
        if t == 0:
            i -= 1
            j -= 1
        elif t == 1:
            i -= 1
        elif t == 2:
            j -= 1
        else:
            # Uninitialized cell (-1); defensive stop.
            break

    # Slice off the unused prefix of the pre-allocated buffer.
    # (Removed the unused `actual_len` local from the original.)
    return path[:, path_idx + 1:max_path_len]
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
# ================= Utility Functions =================
|
| 119 |
+
def median_filter(x: torch.Tensor, filter_width: int) -> torch.Tensor:
    """
    Apply a median filter along the last dimension of a tensor.

    Accepts a 2-D [Tokens, Frames] or 3-D [Heads, Tokens, Frames] tensor.
    The input is reflect-padded so the output has the same length as the
    input along the filtered dimension.

    Args:
        x: Input tensor (2-D or 3-D)
        filter_width: Width of median filter (odd widths give a true median)

    Returns:
        Filtered tensor with the same shape as the input.

    Bug fix: the original squeezed dim 0 whenever the result had more than
    two dims, which silently dropped the head dimension of a 3-D input with
    a single head (e.g. one selected attention head). We now squeeze only
    the dim we added ourselves.
    """
    pad_width = filter_width // 2
    # Reflect padding requires the signal to be longer than the pad.
    if x.shape[-1] <= pad_width:
        return x

    added_dim = False
    if x.ndim == 2:
        # F.pad with a 4-element pad spec and mode="reflect" needs >= 3 dims.
        x = x[None, :]
        added_dim = True

    x = F.pad(x, (pad_width, pad_width, 0, 0), mode="reflect")
    # Sliding windows of `filter_width`, sorted; the middle element is the median.
    result = x.unfold(-1, filter_width, 1).sort()[0][..., pad_width]

    if added_dim:
        result = result.squeeze(0)
    return result
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
# ================= Main Aligner Class =================
|
| 143 |
+
class MusicStampsAligner:
    """
    Aligner class for generating lyrics timestamps from cross-attention matrices.

    Pipeline: select configured attention heads -> bidirectional-consensus
    denoising -> DTW over the token/frame matrix -> per-token timestamps ->
    per-sentence timestamps -> LRC text.
    """

    def __init__(self, tokenizer):
        """
        Initialize the aligner.

        Args:
            tokenizer: Text tokenizer for decoding tokens
        """
        self.tokenizer = tokenizer

    def _apply_bidirectional_consensus(
        self,
        weights_stack: torch.Tensor,
        violence_level: float,
        medfilt_width: int
    ) -> tuple:
        """
        Core denoising logic using bidirectional consensus.

        Args:
            weights_stack: Attention weights [Heads, Tokens, Frames]
            violence_level: Denoising strength coefficient
            medfilt_width: Median filter width

        Returns:
            Tuple of (calc_matrix, energy_matrix) as numpy arrays, both
            head-averaged to shape [Tokens, Frames]
        """
        # A. Bidirectional consensus: a cell survives only if it is strong in
        # both the token->frame and the frame->token distribution.
        row_prob = F.softmax(weights_stack, dim=-1)  # Token -> Frame
        col_prob = F.softmax(weights_stack, dim=-2)  # Frame -> Token
        processed = row_prob * col_prob

        # B1. Row suppression (kill horizontal crossing lines).
        row_medians = torch.quantile(processed, 0.5, dim=-1, keepdim=True)
        processed = torch.relu(processed - violence_level * row_medians)

        # B2. Column suppression (kill vertical crossing lines).
        col_medians = torch.quantile(processed, 0.5, dim=-2, keepdim=True)
        processed = torch.relu(processed - violence_level * col_medians)

        # C. Power sharpening.
        processed = processed ** 2

        # Pre-normalization energy, kept for confidence estimation.
        energy_matrix = processed.mean(dim=0).cpu().numpy()

        # D. Global z-score normalization.
        std, mean = torch.std_mean(processed, unbiased=False)
        weights_processed = (processed - mean) / (std + 1e-9)

        # E. Median filtering along the frame axis.
        weights_processed = median_filter(weights_processed, filter_width=medfilt_width)
        calc_matrix = weights_processed.mean(dim=0).numpy()

        return calc_matrix, energy_matrix

    def _preprocess_attention(
        self,
        attention_matrix: torch.Tensor,
        custom_config: Dict[int, List[int]],
        violence_level: float,
        medfilt_width: int = 7
    ) -> tuple:
        """
        Preprocess attention matrix for alignment.

        Args:
            attention_matrix: Attention tensor [Layers, Heads, Tokens, Frames]
            custom_config: Dict mapping layer indices to head indices
            violence_level: Denoising strength
            medfilt_width: Median filter width

        Returns:
            Tuple of (calc_matrix, energy_matrix, visual_matrix), or
            (None, None, None) when no configured head index is in range.
        """
        if isinstance(attention_matrix, torch.Tensor):
            weights = attention_matrix.clone()
        else:
            weights = torch.tensor(attention_matrix)

        weights = weights.cpu().float()

        # Collect configured (layer, head) maps; out-of-range indices are
        # silently skipped so one config can serve models of different sizes.
        selected_tensors = []
        for layer_idx, head_indices in custom_config.items():
            for head_idx in head_indices:
                if layer_idx < weights.shape[0] and head_idx < weights.shape[1]:
                    selected_tensors.append(weights[layer_idx, head_idx])

        if not selected_tensors:
            return None, None, None

        # Stack selected heads: [Heads, Tokens, Frames]
        weights_stack = torch.stack(selected_tensors, dim=0)
        visual_matrix = weights_stack.mean(dim=0).numpy()

        calc_matrix, energy_matrix = self._apply_bidirectional_consensus(
            weights_stack, violence_level, medfilt_width
        )

        return calc_matrix, energy_matrix, visual_matrix

    def stamps_align_info(
        self,
        attention_matrix: torch.Tensor,
        lyrics_tokens: List[int],
        total_duration_seconds: float,
        custom_config: Dict[int, List[int]],
        return_matrices: bool = False,
        violence_level: float = 2.0,
        medfilt_width: int = 1
    ) -> Dict[str, Any]:
        """
        Get alignment information from attention matrix.

        Args:
            attention_matrix: Cross-attention tensor [Layers, Heads, Tokens, Frames]
            lyrics_tokens: List of lyrics token IDs
            total_duration_seconds: Total audio duration in seconds
            custom_config: Dict mapping layer indices to head indices
            return_matrices: Whether to return intermediate matrices
            violence_level: Denoising strength
            medfilt_width: Median filter width

        Returns:
            Dict containing calc_matrix, lyrics_tokens, total_duration_seconds,
            and optionally energy_matrix and vis_matrix. On failure calc_matrix
            is None and an "error" key describes the problem.
        """
        calc_matrix, energy_matrix, visual_matrix = self._preprocess_attention(
            attention_matrix, custom_config, violence_level, medfilt_width
        )

        if calc_matrix is None:
            return {
                "calc_matrix": None,
                "lyrics_tokens": lyrics_tokens,
                "total_duration_seconds": total_duration_seconds,
                "error": "No valid attention heads found"
            }

        return_dict = {
            "calc_matrix": calc_matrix,
            "lyrics_tokens": lyrics_tokens,
            "total_duration_seconds": total_duration_seconds
        }

        if return_matrices:
            return_dict['energy_matrix'] = energy_matrix
            return_dict['vis_matrix'] = visual_matrix

        return return_dict

    def _decode_tokens_incrementally(self, token_ids: List[int]) -> List[str]:
        """
        Decode tokens incrementally to properly handle multi-byte UTF-8 characters.

        For Chinese and other multi-byte characters, the tokenizer may split them
        into multiple byte-level tokens. Decoding each token individually produces
        invalid UTF-8 sequences (showing as \ufffd). This method uses byte-level
        comparison to correctly track which characters each token contributes.

        Note: decodes the prefix token_ids[:i+1] at every step, so this is
        O(n^2) in the token count — acceptable for lyrics-length sequences.

        Args:
            token_ids: List of token IDs

        Returns:
            List of decoded text for each token position
        """
        decoded_tokens = []
        prev_bytes = b""

        for i in range(len(token_ids)):
            # Decode tokens from start to current position
            current_text = self.tokenizer.decode(token_ids[:i+1], skip_special_tokens=False)
            current_bytes = current_text.encode('utf-8', errors='surrogatepass')

            # The contribution of current token is the new bytes added
            if len(current_bytes) >= len(prev_bytes):
                new_bytes = current_bytes[len(prev_bytes):]
                # Try to decode the new bytes; if incomplete, use empty string
                try:
                    token_text = new_bytes.decode('utf-8')
                except UnicodeDecodeError:
                    # Incomplete UTF-8 sequence, this token doesn't complete a character
                    token_text = ""
            else:
                # Edge case: current decode is shorter (shouldn't happen normally)
                token_text = ""

            decoded_tokens.append(token_text)
            prev_bytes = current_bytes

        return decoded_tokens

    def token_timestamps(
        self,
        calc_matrix: np.ndarray,
        lyrics_tokens: List[int],
        total_duration_seconds: float
    ) -> List[TokenTimestamp]:
        """
        Generate per-token timestamps using DTW.

        Args:
            calc_matrix: Processed attention matrix [Tokens, Frames]
            lyrics_tokens: List of token IDs
            total_duration_seconds: Total audio duration

        Returns:
            List of TokenTimestamp objects (one per input token)
        """
        n_frames = calc_matrix.shape[-1]
        # DTW minimizes cost, so negate the (similarity-like) matrix.
        text_indices, time_indices = dtw_cpu(-calc_matrix.astype(np.float64))

        seconds_per_frame = total_duration_seconds / n_frames
        alignment_results = []

        # Use incremental decoding to properly handle multi-byte UTF-8 characters
        decoded_tokens = self._decode_tokens_incrementally(lyrics_tokens)

        for i in range(len(lyrics_tokens)):
            mask = (text_indices == i)

            if not np.any(mask):
                # Token never visited by the DTW path: give it a zero-length
                # span starting at the previous token's end.
                start = alignment_results[-1].end if alignment_results else 0.0
                end = start
                token_conf = 0.0
            else:
                times = time_indices[mask] * seconds_per_frame
                start = times[0]
                end = times[-1]
                # Placeholder: per-token probability not yet derived from the
                # energy matrix.
                token_conf = 0.0

            if end < start:
                end = start

            alignment_results.append(TokenTimestamp(
                token_id=lyrics_tokens[i],
                text=decoded_tokens[i],
                start=float(start),
                end=float(end),
                probability=token_conf
            ))

        return alignment_results

    def _decode_sentence_from_tokens(self, tokens: List[TokenTimestamp]) -> str:
        """
        Decode a sentence by decoding all token IDs together.
        This avoids UTF-8 encoding issues from joining individual token texts.

        Args:
            tokens: List of TokenTimestamp objects

        Returns:
            Properly decoded sentence text
        """
        token_ids = [t.token_id for t in tokens]
        return self.tokenizer.decode(token_ids, skip_special_tokens=False)

    def _build_sentence(self, tokens: List[TokenTimestamp]) -> Optional[SentenceTimestamp]:
        """
        Build one SentenceTimestamp from a run of tokens.

        Decodes all token IDs together (avoiding UTF-8 issues) and averages
        the positive token probabilities for the sentence confidence.

        Args:
            tokens: Non-empty list of TokenTimestamp objects

        Returns:
            SentenceTimestamp, or None when the decoded text is blank.
        """
        full_text = self._decode_sentence_from_tokens(tokens)
        if not full_text.strip():
            return None

        valid_scores = [t.probability for t in tokens if t.probability > 0]
        sent_conf = sum(valid_scores) / len(valid_scores) if valid_scores else 0.0

        return SentenceTimestamp(
            text=full_text.strip(),
            start=round(tokens[0].start, 3),
            end=round(tokens[-1].end, 3),
            tokens=list(tokens),
            confidence=sent_conf
        )

    def sentence_timestamps(
        self,
        token_alignment: List[TokenTimestamp]
    ) -> List[SentenceTimestamp]:
        """
        Group token timestamps into sentence timestamps.

        Sentences are split on tokens whose decoded text contains a newline.
        Confidence scores are min-max normalized across the returned list.

        Args:
            token_alignment: List of TokenTimestamp objects

        Returns:
            List of SentenceTimestamp objects
        """
        results = []
        current_tokens = []

        for token in token_alignment:
            current_tokens.append(token)

            if '\n' in token.text:
                sentence = self._build_sentence(current_tokens)
                if sentence is not None:
                    results.append(sentence)
                current_tokens = []

        # Handle trailing tokens with no final newline.
        if current_tokens:
            sentence = self._build_sentence(current_tokens)
            if sentence is not None:
                results.append(sentence)

        # Normalize confidence scores to [0, 1] across all sentences.
        if results:
            all_scores = [s.confidence for s in results]
            min_score = min(all_scores)
            max_score = max(all_scores)
            score_range = max_score - min_score

            if score_range > 1e-9:
                for s in results:
                    normalized_score = (s.confidence - min_score) / score_range
                    s.confidence = round(normalized_score, 2)
            else:
                # All scores (near-)equal: just round them.
                for s in results:
                    s.confidence = round(s.confidence, 2)

        return results

    def format_lrc(
        self,
        sentence_timestamps: List[SentenceTimestamp],
        include_end_time: bool = False
    ) -> str:
        """
        Format sentence timestamps as LRC lyrics format.

        Args:
            sentence_timestamps: List of SentenceTimestamp objects
            include_end_time: Whether to include end time (enhanced LRC format)

        Returns:
            LRC formatted string, one "[mm:ss.xx]text" line per sentence
        """
        lines = []

        for sentence in sentence_timestamps:
            # Convert seconds to mm:ss.xx format
            start_minutes = int(sentence.start // 60)
            start_seconds = sentence.start % 60

            if include_end_time:
                end_minutes = int(sentence.end // 60)
                end_seconds = sentence.end % 60
                timestamp = f"[{start_minutes:02d}:{start_seconds:05.2f}][{end_minutes:02d}:{end_seconds:05.2f}]"
            else:
                timestamp = f"[{start_minutes:02d}:{start_seconds:05.2f}]"

            # TODO(review): structural tags like [verse]/[chorus] are NOT
            # stripped here despite the original comment claiming so.
            text = sentence.text

            lines.append(f"{timestamp}{text}")

        return "\n".join(lines)

    def get_timestamps_and_lrc(
        self,
        calc_matrix: np.ndarray,
        lyrics_tokens: List[int],
        total_duration_seconds: float
    ) -> Dict[str, Any]:
        """
        Convenience method to get both timestamps and LRC in one call.

        Args:
            calc_matrix: Processed attention matrix
            lyrics_tokens: List of token IDs
            total_duration_seconds: Total audio duration

        Returns:
            Dict containing token_timestamps, sentence_timestamps, and lrc_text
        """
        token_stamps = self.token_timestamps(
            calc_matrix=calc_matrix,
            lyrics_tokens=lyrics_tokens,
            total_duration_seconds=total_duration_seconds
        )

        sentence_stamps = self.sentence_timestamps(token_stamps)
        lrc_text = self.format_lrc(sentence_stamps)

        return {
            "token_timestamps": token_stamps,
            "sentence_timestamps": sentence_stamps,
            "lrc_text": lrc_text
        }
|
| 547 |
+
|
acestep/gradio_ui/events/__init__.py
CHANGED
|
@@ -358,19 +358,49 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
|
|
| 358 |
)
|
| 359 |
|
| 360 |
# ========== Score Calculation Handlers ==========
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 361 |
for btn_idx in range(1, 9):
|
| 362 |
results_section[f"score_btn_{btn_idx}"].click(
|
| 363 |
-
fn=
|
| 364 |
-
llm_handler, sample_idx, scale, batch_idx, queue
|
| 365 |
-
),
|
| 366 |
inputs=[
|
| 367 |
-
gr.State(value=btn_idx),
|
| 368 |
generation_section["score_scale"],
|
| 369 |
results_section["current_batch_index"],
|
| 370 |
results_section["batch_queue"],
|
| 371 |
],
|
| 372 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
| 373 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
def generation_wrapper(*args):
|
| 375 |
yield from res_h.generate_with_batch_management(dit_handler, llm_handler, *args)
|
| 376 |
# ========== Generation Handler ==========
|
|
@@ -438,12 +468,6 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
|
|
| 438 |
results_section["generation_info"],
|
| 439 |
results_section["status_output"],
|
| 440 |
generation_section["seed"],
|
| 441 |
-
results_section["align_score_1"],
|
| 442 |
-
results_section["align_text_1"],
|
| 443 |
-
results_section["align_plot_1"],
|
| 444 |
-
results_section["align_score_2"],
|
| 445 |
-
results_section["align_text_2"],
|
| 446 |
-
results_section["align_plot_2"],
|
| 447 |
results_section["score_display_1"],
|
| 448 |
results_section["score_display_2"],
|
| 449 |
results_section["score_display_3"],
|
|
|
|
| 358 |
)
|
| 359 |
|
| 360 |
# ========== Score Calculation Handlers ==========
|
| 361 |
+
# Use default argument to capture btn_idx value at definition time (Python closure fix)
|
| 362 |
+
def make_score_handler(idx):
|
| 363 |
+
return lambda scale, batch_idx, queue: res_h.calculate_score_handler_with_selection(
|
| 364 |
+
llm_handler, idx, scale, batch_idx, queue
|
| 365 |
+
)
|
| 366 |
+
|
| 367 |
for btn_idx in range(1, 9):
|
| 368 |
results_section[f"score_btn_{btn_idx}"].click(
|
| 369 |
+
fn=make_score_handler(btn_idx),
|
|
|
|
|
|
|
| 370 |
inputs=[
|
|
|
|
| 371 |
generation_section["score_scale"],
|
| 372 |
results_section["current_batch_index"],
|
| 373 |
results_section["batch_queue"],
|
| 374 |
],
|
| 375 |
+
outputs=[
|
| 376 |
+
results_section[f"score_display_{btn_idx}"],
|
| 377 |
+
results_section[f"details_accordion_{btn_idx}"],
|
| 378 |
+
results_section["batch_queue"]
|
| 379 |
+
]
|
| 380 |
)
|
| 381 |
+
|
| 382 |
+
# ========== LRC Timestamp Handlers ==========
|
| 383 |
+
# Use default argument to capture btn_idx value at definition time (Python closure fix)
|
| 384 |
+
def make_lrc_handler(idx):
|
| 385 |
+
return lambda batch_idx, queue, vocal_lang, infer_steps: res_h.generate_lrc_handler(
|
| 386 |
+
dit_handler, idx, batch_idx, queue, vocal_lang, infer_steps
|
| 387 |
+
)
|
| 388 |
+
|
| 389 |
+
for btn_idx in range(1, 9):
|
| 390 |
+
results_section[f"lrc_btn_{btn_idx}"].click(
|
| 391 |
+
fn=make_lrc_handler(btn_idx),
|
| 392 |
+
inputs=[
|
| 393 |
+
results_section["current_batch_index"],
|
| 394 |
+
results_section["batch_queue"],
|
| 395 |
+
generation_section["vocal_language"],
|
| 396 |
+
generation_section["inference_steps"],
|
| 397 |
+
],
|
| 398 |
+
outputs=[
|
| 399 |
+
results_section[f"lrc_display_{btn_idx}"],
|
| 400 |
+
results_section[f"details_accordion_{btn_idx}"]
|
| 401 |
+
]
|
| 402 |
+
)
|
| 403 |
+
|
| 404 |
def generation_wrapper(*args):
|
| 405 |
yield from res_h.generate_with_batch_management(dit_handler, llm_handler, *args)
|
| 406 |
# ========== Generation Handler ==========
|
|
|
|
| 468 |
results_section["generation_info"],
|
| 469 |
results_section["status_output"],
|
| 470 |
generation_section["seed"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 471 |
results_section["score_display_1"],
|
| 472 |
results_section["score_display_2"],
|
| 473 |
results_section["score_display_3"],
|
acestep/gradio_ui/events/results_handlers.py
CHANGED
|
@@ -141,6 +141,7 @@ def store_batch_in_queue(
|
|
| 141 |
batch_size=2,
|
| 142 |
generation_params=None,
|
| 143 |
lm_generated_metadata=None,
|
|
|
|
| 144 |
status="completed"
|
| 145 |
):
|
| 146 |
"""Store batch results in queue with ALL generation parameters
|
|
@@ -152,6 +153,7 @@ def store_batch_in_queue(
|
|
| 152 |
batch_size: Batch size used for this batch
|
| 153 |
generation_params: Complete dictionary of ALL generation parameters used
|
| 154 |
lm_generated_metadata: LM-generated metadata for scoring (optional)
|
|
|
|
| 155 |
"""
|
| 156 |
batch_queue[batch_index] = {
|
| 157 |
"status": status,
|
|
@@ -164,6 +166,7 @@ def store_batch_in_queue(
|
|
| 164 |
"batch_size": batch_size, # Store batch size
|
| 165 |
"generation_params": generation_params if generation_params else {}, # Store ALL parameters
|
| 166 |
"lm_generated_metadata": lm_generated_metadata, # Store LM metadata for scoring
|
|
|
|
| 167 |
"timestamp": datetime.datetime.now().isoformat()
|
| 168 |
}
|
| 169 |
return batch_queue
|
|
@@ -355,12 +358,6 @@ def generate_with_progress(
|
|
| 355 |
audio_conversion_start_time = time_module.time()
|
| 356 |
total_auto_score_time = 0.0
|
| 357 |
|
| 358 |
-
align_score_1 = ""
|
| 359 |
-
align_text_1 = ""
|
| 360 |
-
align_plot_1 = None
|
| 361 |
-
align_score_2 = ""
|
| 362 |
-
align_text_2 = ""
|
| 363 |
-
align_plot_2 = None
|
| 364 |
updated_audio_codes = text2music_audio_code_string if not think_checkbox else ""
|
| 365 |
|
| 366 |
# Build initial generation_info (will be updated with post-processing times at the end)
|
|
@@ -373,7 +370,7 @@ def generate_with_progress(
|
|
| 373 |
)
|
| 374 |
|
| 375 |
if not result.success:
|
| 376 |
-
yield (None,) * 8 + (None, generation_info, result.status_message) + (gr.skip(),) *
|
| 377 |
return
|
| 378 |
|
| 379 |
audios = result.audios
|
|
@@ -421,8 +418,6 @@ def generate_with_progress(
|
|
| 421 |
generation_info,
|
| 422 |
status_message,
|
| 423 |
seed_value_for_ui,
|
| 424 |
-
# Align plot placeholders (assume no need to update in real time)
|
| 425 |
-
gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(),
|
| 426 |
# Scores
|
| 427 |
scores_ui_updates[0], scores_ui_updates[1], scores_ui_updates[2], scores_ui_updates[3], scores_ui_updates[4], scores_ui_updates[5], scores_ui_updates[6], scores_ui_updates[7],
|
| 428 |
updated_audio_codes,
|
|
@@ -431,6 +426,7 @@ def generate_with_progress(
|
|
| 431 |
audio_codes_ui_updates[4], audio_codes_ui_updates[5], audio_codes_ui_updates[6], audio_codes_ui_updates[7],
|
| 432 |
lm_generated_metadata,
|
| 433 |
is_format_caption,
|
|
|
|
| 434 |
)
|
| 435 |
else:
|
| 436 |
# If i exceeds the generated count (e.g., batch=2, i=2..7), do not yield
|
|
@@ -467,7 +463,6 @@ def generate_with_progress(
|
|
| 467 |
generation_info,
|
| 468 |
"Generation Complete",
|
| 469 |
seed_value_for_ui,
|
| 470 |
-
align_score_1, align_text_1, align_plot_1, align_score_2, align_text_2, align_plot_2,
|
| 471 |
final_scores_list[0], final_scores_list[1], final_scores_list[2], final_scores_list[3],
|
| 472 |
final_scores_list[4], final_scores_list[5], final_scores_list[6], final_scores_list[7],
|
| 473 |
updated_audio_codes,
|
|
@@ -475,6 +470,7 @@ def generate_with_progress(
|
|
| 475 |
final_codes_list[4], final_codes_list[5], final_codes_list[6], final_codes_list[7],
|
| 476 |
lm_generated_metadata,
|
| 477 |
is_format_caption,
|
|
|
|
| 478 |
)
|
| 479 |
|
| 480 |
|
|
@@ -595,7 +591,7 @@ def calculate_score_handler_with_selection(llm_handler, sample_idx, score_scale,
|
|
| 595 |
batch_queue: Batch queue containing historical generation data
|
| 596 |
"""
|
| 597 |
if current_batch_index not in batch_queue:
|
| 598 |
-
return
|
| 599 |
|
| 600 |
batch_data = batch_queue[current_batch_index]
|
| 601 |
params = batch_data.get("generation_params", {})
|
|
@@ -642,7 +638,106 @@ def calculate_score_handler_with_selection(llm_handler, sample_idx, score_scale,
|
|
| 642 |
batch_queue[current_batch_index]["scores"] = [""] * 8
|
| 643 |
batch_queue[current_batch_index]["scores"][sample_idx - 1] = score_display
|
| 644 |
|
| 645 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 646 |
|
| 647 |
|
| 648 |
def capture_current_params(
|
|
@@ -758,7 +853,9 @@ def generate_with_batch_management(
|
|
| 758 |
final_result_from_inner = partial_result
|
| 759 |
# current_batch_index, total_batches, batch_queue, next_params,
|
| 760 |
# batch_indicator_text, prev_btn, next_btn, next_status, restore_btn
|
| 761 |
-
|
|
|
|
|
|
|
| 762 |
gr.skip(), gr.skip(), gr.skip(), gr.skip(),
|
| 763 |
gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip()
|
| 764 |
)
|
|
@@ -766,21 +863,23 @@ def generate_with_batch_management(
|
|
| 766 |
all_audio_paths = result[8]
|
| 767 |
|
| 768 |
if all_audio_paths is None:
|
| 769 |
-
|
| 770 |
-
|
|
|
|
| 771 |
gr.skip(), gr.skip(), gr.skip(), gr.skip(),
|
| 772 |
gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip()
|
| 773 |
)
|
| 774 |
return
|
| 775 |
|
| 776 |
# Extract results from generation (使用 result 下标访问)
|
|
|
|
| 777 |
generation_info = result[9]
|
| 778 |
seed_value_for_ui = result[11]
|
| 779 |
-
lm_generated_metadata = result[
|
| 780 |
|
| 781 |
# Extract codes
|
| 782 |
-
generated_codes_single = result[26
|
| 783 |
-
generated_codes_batch = [result[
|
| 784 |
|
| 785 |
# Determine which codes to store based on mode
|
| 786 |
if allow_lm_batch and batch_size_input >= 2:
|
|
@@ -839,6 +938,9 @@ def generate_with_batch_management(
|
|
| 839 |
next_params["text2music_audio_code_string"] = ""
|
| 840 |
next_params["random_seed_checkbox"] = True
|
| 841 |
|
|
|
|
|
|
|
|
|
|
| 842 |
# Store current batch in queue
|
| 843 |
batch_queue = store_batch_in_queue(
|
| 844 |
batch_queue,
|
|
@@ -851,6 +953,7 @@ def generate_with_batch_management(
|
|
| 851 |
batch_size=int(batch_size_input),
|
| 852 |
generation_params=saved_params,
|
| 853 |
lm_generated_metadata=lm_generated_metadata,
|
|
|
|
| 854 |
status="completed"
|
| 855 |
)
|
| 856 |
|
|
@@ -870,7 +973,9 @@ def generate_with_batch_management(
|
|
| 870 |
|
| 871 |
# 4. Yield final result (includes Batch UI updates)
|
| 872 |
# The result here is already a tuple structure
|
| 873 |
-
|
|
|
|
|
|
|
| 874 |
current_batch_index,
|
| 875 |
total_batches,
|
| 876 |
batch_queue,
|
|
@@ -1040,14 +1145,15 @@ def generate_next_batch_background(
|
|
| 1040 |
final_result = partial_result
|
| 1041 |
|
| 1042 |
# Extract results from final_result
|
|
|
|
| 1043 |
all_audio_paths = final_result[8] # generated_audio_batch
|
| 1044 |
generation_info = final_result[9]
|
| 1045 |
seed_value_for_ui = final_result[11]
|
| 1046 |
-
lm_generated_metadata = final_result[
|
| 1047 |
|
| 1048 |
# Extract codes
|
| 1049 |
-
generated_codes_single = final_result[26
|
| 1050 |
-
generated_codes_batch = [final_result[
|
| 1051 |
|
| 1052 |
# Determine which codes to store
|
| 1053 |
batch_size = params.get("batch_size_input", 2)
|
|
@@ -1070,6 +1176,7 @@ def generate_next_batch_background(
|
|
| 1070 |
logger.info(f" - codes_to_store: STRING with {len(codes_to_store) if codes_to_store else 0} chars")
|
| 1071 |
|
| 1072 |
# Store next batch in queue with codes, batch settings, and ALL generation params
|
|
|
|
| 1073 |
batch_queue = store_batch_in_queue(
|
| 1074 |
batch_queue,
|
| 1075 |
next_batch_idx,
|
|
@@ -1081,6 +1188,7 @@ def generate_next_batch_background(
|
|
| 1081 |
batch_size=int(batch_size),
|
| 1082 |
generation_params=params,
|
| 1083 |
lm_generated_metadata=lm_generated_metadata,
|
|
|
|
| 1084 |
status="completed"
|
| 1085 |
)
|
| 1086 |
|
|
|
|
| 141 |
batch_size=2,
|
| 142 |
generation_params=None,
|
| 143 |
lm_generated_metadata=None,
|
| 144 |
+
extra_outputs=None,
|
| 145 |
status="completed"
|
| 146 |
):
|
| 147 |
"""Store batch results in queue with ALL generation parameters
|
|
|
|
| 153 |
batch_size: Batch size used for this batch
|
| 154 |
generation_params: Complete dictionary of ALL generation parameters used
|
| 155 |
lm_generated_metadata: LM-generated metadata for scoring (optional)
|
| 156 |
+
extra_outputs: Dictionary containing pred_latents, encoder_hidden_states, etc. for LRC generation
|
| 157 |
"""
|
| 158 |
batch_queue[batch_index] = {
|
| 159 |
"status": status,
|
|
|
|
| 166 |
"batch_size": batch_size, # Store batch size
|
| 167 |
"generation_params": generation_params if generation_params else {}, # Store ALL parameters
|
| 168 |
"lm_generated_metadata": lm_generated_metadata, # Store LM metadata for scoring
|
| 169 |
+
"extra_outputs": extra_outputs if extra_outputs else {}, # Store extra outputs for LRC generation
|
| 170 |
"timestamp": datetime.datetime.now().isoformat()
|
| 171 |
}
|
| 172 |
return batch_queue
|
|
|
|
| 358 |
audio_conversion_start_time = time_module.time()
|
| 359 |
total_auto_score_time = 0.0
|
| 360 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 361 |
updated_audio_codes = text2music_audio_code_string if not think_checkbox else ""
|
| 362 |
|
| 363 |
# Build initial generation_info (will be updated with post-processing times at the end)
|
|
|
|
| 370 |
)
|
| 371 |
|
| 372 |
if not result.success:
|
| 373 |
+
yield (None,) * 8 + (None, generation_info, result.status_message) + (gr.skip(),) * 20 + (None,) # +1 for extra_outputs
|
| 374 |
return
|
| 375 |
|
| 376 |
audios = result.audios
|
|
|
|
| 418 |
generation_info,
|
| 419 |
status_message,
|
| 420 |
seed_value_for_ui,
|
|
|
|
|
|
|
| 421 |
# Scores
|
| 422 |
scores_ui_updates[0], scores_ui_updates[1], scores_ui_updates[2], scores_ui_updates[3], scores_ui_updates[4], scores_ui_updates[5], scores_ui_updates[6], scores_ui_updates[7],
|
| 423 |
updated_audio_codes,
|
|
|
|
| 426 |
audio_codes_ui_updates[4], audio_codes_ui_updates[5], audio_codes_ui_updates[6], audio_codes_ui_updates[7],
|
| 427 |
lm_generated_metadata,
|
| 428 |
is_format_caption,
|
| 429 |
+
None, # Placeholder for extra_outputs (only filled in final yield)
|
| 430 |
)
|
| 431 |
else:
|
| 432 |
# If i exceeds the generated count (e.g., batch=2, i=2..7), do not yield
|
|
|
|
| 463 |
generation_info,
|
| 464 |
"Generation Complete",
|
| 465 |
seed_value_for_ui,
|
|
|
|
| 466 |
final_scores_list[0], final_scores_list[1], final_scores_list[2], final_scores_list[3],
|
| 467 |
final_scores_list[4], final_scores_list[5], final_scores_list[6], final_scores_list[7],
|
| 468 |
updated_audio_codes,
|
|
|
|
| 470 |
final_codes_list[4], final_codes_list[5], final_codes_list[6], final_codes_list[7],
|
| 471 |
lm_generated_metadata,
|
| 472 |
is_format_caption,
|
| 473 |
+
result.extra_outputs, # extra_outputs for LRC generation
|
| 474 |
)
|
| 475 |
|
| 476 |
|
|
|
|
| 591 |
batch_queue: Batch queue containing historical generation data
|
| 592 |
"""
|
| 593 |
if current_batch_index not in batch_queue:
|
| 594 |
+
return gr.skip(), gr.skip(), batch_queue
|
| 595 |
|
| 596 |
batch_data = batch_queue[current_batch_index]
|
| 597 |
params = batch_data.get("generation_params", {})
|
|
|
|
| 638 |
batch_queue[current_batch_index]["scores"] = [""] * 8
|
| 639 |
batch_queue[current_batch_index]["scores"][sample_idx - 1] = score_display
|
| 640 |
|
| 641 |
+
# Return: score_display (content + visible), accordion visible, batch_queue
|
| 642 |
+
return (
|
| 643 |
+
gr.update(value=score_display, visible=True), # score_display with content
|
| 644 |
+
gr.update(visible=True), # details_accordion
|
| 645 |
+
batch_queue
|
| 646 |
+
)
|
| 647 |
+
|
| 648 |
+
|
| 649 |
+
def generate_lrc_handler(dit_handler, sample_idx, current_batch_index, batch_queue, vocal_language, inference_steps):
|
| 650 |
+
"""
|
| 651 |
+
Generate LRC timestamps for a specific audio sample.
|
| 652 |
+
|
| 653 |
+
This function retrieves cached generation data from batch_queue and calls
|
| 654 |
+
the handler's get_lyric_timestamp method to generate LRC format lyrics.
|
| 655 |
+
|
| 656 |
+
Args:
|
| 657 |
+
dit_handler: DiT handler instance with get_lyric_timestamp method
|
| 658 |
+
sample_idx: Which sample to generate LRC for (1-8)
|
| 659 |
+
current_batch_index: Current batch index in batch_queue
|
| 660 |
+
batch_queue: Dictionary storing all batch generation data
|
| 661 |
+
vocal_language: Language code for lyrics
|
| 662 |
+
inference_steps: Number of inference steps used in generation
|
| 663 |
+
|
| 664 |
+
Returns:
|
| 665 |
+
LRC formatted string or error message
|
| 666 |
+
"""
|
| 667 |
+
import torch
|
| 668 |
+
|
| 669 |
+
if current_batch_index not in batch_queue:
|
| 670 |
+
return gr.skip(), gr.skip()
|
| 671 |
+
|
| 672 |
+
batch_data = batch_queue[current_batch_index]
|
| 673 |
+
extra_outputs = batch_data.get("extra_outputs", {})
|
| 674 |
+
|
| 675 |
+
# Check if required data is available
|
| 676 |
+
if not extra_outputs:
|
| 677 |
+
return gr.update(value=t("messages.lrc_no_extra_outputs"), visible=True), gr.update(visible=True)
|
| 678 |
+
|
| 679 |
+
pred_latents = extra_outputs.get("pred_latents")
|
| 680 |
+
encoder_hidden_states = extra_outputs.get("encoder_hidden_states")
|
| 681 |
+
encoder_attention_mask = extra_outputs.get("encoder_attention_mask")
|
| 682 |
+
context_latents = extra_outputs.get("context_latents")
|
| 683 |
+
lyric_token_idss = extra_outputs.get("lyric_token_idss")
|
| 684 |
+
|
| 685 |
+
if any(x is None for x in [pred_latents, encoder_hidden_states, encoder_attention_mask, context_latents, lyric_token_idss]):
|
| 686 |
+
return gr.update(value=t("messages.lrc_missing_tensors"), visible=True), gr.update(visible=True)
|
| 687 |
+
|
| 688 |
+
# Adjust sample_idx to 0-based
|
| 689 |
+
sample_idx_0based = sample_idx - 1
|
| 690 |
+
|
| 691 |
+
# Check if sample exists in batch
|
| 692 |
+
batch_size = pred_latents.shape[0]
|
| 693 |
+
if sample_idx_0based >= batch_size:
|
| 694 |
+
return gr.update(value=t("messages.lrc_sample_not_exist"), visible=True), gr.update(visible=True)
|
| 695 |
+
|
| 696 |
+
# Extract the specific sample's data
|
| 697 |
+
try:
|
| 698 |
+
# Get audio duration from batch data
|
| 699 |
+
params = batch_data.get("generation_params", {})
|
| 700 |
+
audio_duration = params.get("audio_duration", -1)
|
| 701 |
+
|
| 702 |
+
# Calculate duration from latents if not specified
|
| 703 |
+
if audio_duration is None or audio_duration <= 0:
|
| 704 |
+
# latent_length * frames_per_second_ratio ≈ audio_duration
|
| 705 |
+
# Assuming 25 Hz latent rate: latent_length / 25 = duration
|
| 706 |
+
latent_length = pred_latents.shape[1]
|
| 707 |
+
audio_duration = latent_length / 25.0 # 25 Hz latent rate
|
| 708 |
+
|
| 709 |
+
# Get the sample's data (keep batch dimension for handler)
|
| 710 |
+
sample_pred_latent = pred_latents[sample_idx_0based:sample_idx_0based+1]
|
| 711 |
+
sample_encoder_hidden_states = encoder_hidden_states[sample_idx_0based:sample_idx_0based+1]
|
| 712 |
+
sample_encoder_attention_mask = encoder_attention_mask[sample_idx_0based:sample_idx_0based+1]
|
| 713 |
+
sample_context_latents = context_latents[sample_idx_0based:sample_idx_0based+1]
|
| 714 |
+
sample_lyric_token_ids = lyric_token_idss[sample_idx_0based:sample_idx_0based+1]
|
| 715 |
+
|
| 716 |
+
# Call handler to generate timestamps
|
| 717 |
+
result = dit_handler.get_lyric_timestamp(
|
| 718 |
+
pred_latent=sample_pred_latent,
|
| 719 |
+
encoder_hidden_states=sample_encoder_hidden_states,
|
| 720 |
+
encoder_attention_mask=sample_encoder_attention_mask,
|
| 721 |
+
context_latents=sample_context_latents,
|
| 722 |
+
lyric_token_ids=sample_lyric_token_ids,
|
| 723 |
+
total_duration_seconds=float(audio_duration),
|
| 724 |
+
vocal_language=vocal_language or "en",
|
| 725 |
+
inference_steps=int(inference_steps),
|
| 726 |
+
seed=42, # Use fixed seed for reproducibility
|
| 727 |
+
)
|
| 728 |
+
|
| 729 |
+
if result.get("success"):
|
| 730 |
+
lrc_text = result.get("lrc_text", "")
|
| 731 |
+
if not lrc_text:
|
| 732 |
+
return gr.update(value=t("messages.lrc_empty_result"), visible=True), gr.update(visible=True)
|
| 733 |
+
return gr.update(value=lrc_text, visible=True), gr.update(visible=True)
|
| 734 |
+
else:
|
| 735 |
+
error_msg = result.get("error", "Unknown error")
|
| 736 |
+
return gr.update(value=f"❌ {error_msg}", visible=True), gr.update(visible=True)
|
| 737 |
+
|
| 738 |
+
except Exception as e:
|
| 739 |
+
logger.exception("[generate_lrc_handler] Error generating LRC")
|
| 740 |
+
return gr.update(value=f"❌ Error: {str(e)}", visible=True), gr.update(visible=True)
|
| 741 |
|
| 742 |
|
| 743 |
def capture_current_params(
|
|
|
|
| 853 |
final_result_from_inner = partial_result
|
| 854 |
# current_batch_index, total_batches, batch_queue, next_params,
|
| 855 |
# batch_indicator_text, prev_btn, next_btn, next_status, restore_btn
|
| 856 |
+
# Slice off extra_outputs (last item) before re-yielding to UI
|
| 857 |
+
ui_result = partial_result[:-1] if len(partial_result) > 31 else partial_result
|
| 858 |
+
yield ui_result + (
|
| 859 |
gr.skip(), gr.skip(), gr.skip(), gr.skip(),
|
| 860 |
gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip()
|
| 861 |
)
|
|
|
|
| 863 |
all_audio_paths = result[8]
|
| 864 |
|
| 865 |
if all_audio_paths is None:
|
| 866 |
+
# Slice off extra_outputs before yielding to UI
|
| 867 |
+
ui_result = result[:-1] if len(result) > 31 else result
|
| 868 |
+
yield ui_result + (
|
| 869 |
gr.skip(), gr.skip(), gr.skip(), gr.skip(),
|
| 870 |
gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip()
|
| 871 |
)
|
| 872 |
return
|
| 873 |
|
| 874 |
# Extract results from generation (使用 result 下标访问)
|
| 875 |
+
# New indices after removing 6 align_* items (was 12-17, now shifted down by 6)
|
| 876 |
generation_info = result[9]
|
| 877 |
seed_value_for_ui = result[11]
|
| 878 |
+
lm_generated_metadata = result[29] # was 35, now 29
|
| 879 |
|
| 880 |
# Extract codes
|
| 881 |
+
generated_codes_single = result[20] # was 26, now 20
|
| 882 |
+
generated_codes_batch = [result[21], result[22], result[23], result[24], result[25], result[26], result[27], result[28]] # was 27-34, now 21-28
|
| 883 |
|
| 884 |
# Determine which codes to store based on mode
|
| 885 |
if allow_lm_batch and batch_size_input >= 2:
|
|
|
|
| 938 |
next_params["text2music_audio_code_string"] = ""
|
| 939 |
next_params["random_seed_checkbox"] = True
|
| 940 |
|
| 941 |
+
# Extract extra_outputs from result tuple (index 31)
|
| 942 |
+
extra_outputs_from_result = result[31] if len(result) > 31 else {}
|
| 943 |
+
|
| 944 |
# Store current batch in queue
|
| 945 |
batch_queue = store_batch_in_queue(
|
| 946 |
batch_queue,
|
|
|
|
| 953 |
batch_size=int(batch_size_input),
|
| 954 |
generation_params=saved_params,
|
| 955 |
lm_generated_metadata=lm_generated_metadata,
|
| 956 |
+
extra_outputs=extra_outputs_from_result, # Store extra outputs for LRC generation
|
| 957 |
status="completed"
|
| 958 |
)
|
| 959 |
|
|
|
|
| 973 |
|
| 974 |
# 4. Yield final result (includes Batch UI updates)
|
| 975 |
# The result here is already a tuple structure
|
| 976 |
+
# Slice off extra_outputs (last item) before yielding to UI - it's already stored in batch_queue
|
| 977 |
+
ui_result = result[:-1] if len(result) > 31 else result
|
| 978 |
+
yield ui_result + (
|
| 979 |
current_batch_index,
|
| 980 |
total_batches,
|
| 981 |
batch_queue,
|
|
|
|
| 1145 |
final_result = partial_result
|
| 1146 |
|
| 1147 |
# Extract results from final_result
|
| 1148 |
+
# Indices shifted by -6 after removing align_* items
|
| 1149 |
all_audio_paths = final_result[8] # generated_audio_batch
|
| 1150 |
generation_info = final_result[9]
|
| 1151 |
seed_value_for_ui = final_result[11]
|
| 1152 |
+
lm_generated_metadata = final_result[29] # was 35, now 29
|
| 1153 |
|
| 1154 |
# Extract codes
|
| 1155 |
+
generated_codes_single = final_result[20] # was 26, now 20
|
| 1156 |
+
generated_codes_batch = [final_result[21], final_result[22], final_result[23], final_result[24], final_result[25], final_result[26], final_result[27], final_result[28]] # was 27-34, now 21-28
|
| 1157 |
|
| 1158 |
# Determine which codes to store
|
| 1159 |
batch_size = params.get("batch_size_input", 2)
|
|
|
|
| 1176 |
logger.info(f" - codes_to_store: STRING with {len(codes_to_store) if codes_to_store else 0} chars")
|
| 1177 |
|
| 1178 |
# Store next batch in queue with codes, batch settings, and ALL generation params
|
| 1179 |
+
# Note: extra_outputs not available for background batches (LRC not supported for auto-gen batches)
|
| 1180 |
batch_queue = store_batch_in_queue(
|
| 1181 |
batch_queue,
|
| 1182 |
next_batch_idx,
|
|
|
|
| 1188 |
batch_size=int(batch_size),
|
| 1189 |
generation_params=params,
|
| 1190 |
lm_generated_metadata=lm_generated_metadata,
|
| 1191 |
+
extra_outputs=None, # Not available for background batches
|
| 1192 |
status="completed"
|
| 1193 |
)
|
| 1194 |
|
acestep/gradio_ui/i18n/en.json
CHANGED
|
@@ -148,8 +148,6 @@
|
|
| 148 |
"cover_strength_info": "Control how many denoising steps use cover mode",
|
| 149 |
"score_sensitivity_label": "Quality Score Sensitivity",
|
| 150 |
"score_sensitivity_info": "Lower = more sensitive (default: 1.0). Adjusts how PMI maps to [0,1]",
|
| 151 |
-
"attention_focus_label": "Output Attention Focus Score (disabled)",
|
| 152 |
-
"attention_focus_info": "Output attention focus score analysis",
|
| 153 |
"think_label": "Think",
|
| 154 |
"parallel_thinking_label": "ParallelThinking",
|
| 155 |
"generate_btn": "🎵 Generate Music",
|
|
@@ -162,8 +160,12 @@
|
|
| 162 |
"send_to_src_btn": "🔗 Send To Src Audio",
|
| 163 |
"save_btn": "💾 Save",
|
| 164 |
"score_btn": "📊 Score",
|
|
|
|
| 165 |
"quality_score_label": "Quality Score (Sample {n})",
|
| 166 |
"quality_score_placeholder": "Click 'Score' to calculate perplexity-based quality score",
|
|
|
|
|
|
|
|
|
|
| 167 |
"generation_status": "Generation Status",
|
| 168 |
"current_batch": "Current Batch",
|
| 169 |
"batch_indicator": "Batch {current} / {total}",
|
|
@@ -173,11 +175,7 @@
|
|
| 173 |
"restore_params_btn": "↙️ Apply These Settings to UI (Restore Batch Parameters)",
|
| 174 |
"batch_results_title": "📁 Batch Results & Generation Details",
|
| 175 |
"all_files_label": "📁 All Generated Files (Download)",
|
| 176 |
-
"generation_details": "Generation Details"
|
| 177 |
-
"attention_analysis": "⚖️ Attention Focus Score Analysis",
|
| 178 |
-
"attention_score": "Attention Focus Score (Sample {n})",
|
| 179 |
-
"lyric_timestamps": "Lyric Timestamps (Sample {n})",
|
| 180 |
-
"attention_heatmap": "Attention Focus Score Heatmap (Sample {n})"
|
| 181 |
},
|
| 182 |
"messages": {
|
| 183 |
"no_audio_to_save": "❌ No audio to save",
|
|
@@ -206,6 +204,11 @@
|
|
| 206 |
"scoring_failed": "❌ Error: Batch data not found",
|
| 207 |
"no_codes": "❌ No audio codes available. Please generate music first.",
|
| 208 |
"score_failed": "❌ Scoring failed: {error}",
|
| 209 |
-
"score_error": "❌ Error calculating score: {error}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
}
|
| 211 |
}
|
|
|
|
| 148 |
"cover_strength_info": "Control how many denoising steps use cover mode",
|
| 149 |
"score_sensitivity_label": "Quality Score Sensitivity",
|
| 150 |
"score_sensitivity_info": "Lower = more sensitive (default: 1.0). Adjusts how PMI maps to [0,1]",
|
|
|
|
|
|
|
| 151 |
"think_label": "Think",
|
| 152 |
"parallel_thinking_label": "ParallelThinking",
|
| 153 |
"generate_btn": "🎵 Generate Music",
|
|
|
|
| 160 |
"send_to_src_btn": "🔗 Send To Src Audio",
|
| 161 |
"save_btn": "💾 Save",
|
| 162 |
"score_btn": "📊 Score",
|
| 163 |
+
"lrc_btn": "🎵 LRC",
|
| 164 |
"quality_score_label": "Quality Score (Sample {n})",
|
| 165 |
"quality_score_placeholder": "Click 'Score' to calculate perplexity-based quality score",
|
| 166 |
+
"lrc_label": "Lyrics Timestamps (Sample {n})",
|
| 167 |
+
"lrc_placeholder": "Click 'LRC' to generate timestamps",
|
| 168 |
+
"details_accordion": "📊 Score & LRC",
|
| 169 |
"generation_status": "Generation Status",
|
| 170 |
"current_batch": "Current Batch",
|
| 171 |
"batch_indicator": "Batch {current} / {total}",
|
|
|
|
| 175 |
"restore_params_btn": "↙️ Apply These Settings to UI (Restore Batch Parameters)",
|
| 176 |
"batch_results_title": "📁 Batch Results & Generation Details",
|
| 177 |
"all_files_label": "📁 All Generated Files (Download)",
|
| 178 |
+
"generation_details": "Generation Details"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
},
|
| 180 |
"messages": {
|
| 181 |
"no_audio_to_save": "❌ No audio to save",
|
|
|
|
| 204 |
"scoring_failed": "❌ Error: Batch data not found",
|
| 205 |
"no_codes": "❌ No audio codes available. Please generate music first.",
|
| 206 |
"score_failed": "❌ Scoring failed: {error}",
|
| 207 |
+
"score_error": "❌ Error calculating score: {error}",
|
| 208 |
+
"lrc_no_batch_data": "❌ No batch data found. Please generate music first.",
|
| 209 |
+
"lrc_no_extra_outputs": "❌ No extra outputs found. Condition tensors not available.",
|
| 210 |
+
"lrc_missing_tensors": "❌ Missing required tensors for LRC generation.",
|
| 211 |
+
"lrc_sample_not_exist": "❌ Sample does not exist in current batch.",
|
| 212 |
+
"lrc_empty_result": "⚠️ LRC generation produced empty result."
|
| 213 |
}
|
| 214 |
}
|
acestep/gradio_ui/i18n/ja.json
CHANGED
|
@@ -148,8 +148,6 @@
|
|
| 148 |
"cover_strength_info": "カバーモードを使用するデノイジングステップ数を制御",
|
| 149 |
"score_sensitivity_label": "品質スコア感度",
|
| 150 |
"score_sensitivity_info": "低い = より敏感(デフォルト: 1.0)。PMIが[0,1]にマッピングする方法を調整",
|
| 151 |
-
"attention_focus_label": "注意焦点スコアを出力(無効)",
|
| 152 |
-
"attention_focus_info": "注意焦点スコア分析を出力",
|
| 153 |
"think_label": "思考",
|
| 154 |
"parallel_thinking_label": "並列思考",
|
| 155 |
"generate_btn": "🎵 音楽を生成",
|
|
@@ -162,8 +160,12 @@
|
|
| 162 |
"send_to_src_btn": "🔗 ソースオーディオに送信",
|
| 163 |
"save_btn": "💾 保存",
|
| 164 |
"score_btn": "📊 スコア",
|
|
|
|
| 165 |
"quality_score_label": "品質スコア(サンプル {n})",
|
| 166 |
"quality_score_placeholder": "'スコア'をクリックしてパープレキシティベースの品質スコアを計算",
|
|
|
|
|
|
|
|
|
|
| 167 |
"generation_status": "生成ステータス",
|
| 168 |
"current_batch": "現在のバッチ",
|
| 169 |
"batch_indicator": "バッチ {current} / {total}",
|
|
@@ -173,11 +175,7 @@
|
|
| 173 |
"restore_params_btn": "↙️ これらの設定をUIに適用(バッチパラメータを復元)",
|
| 174 |
"batch_results_title": "📁 バッチ結果と生成詳細",
|
| 175 |
"all_files_label": "📁 すべての生成ファイル(ダウンロード)",
|
| 176 |
-
"generation_details": "生成詳細"
|
| 177 |
-
"attention_analysis": "⚖️ 注意焦点スコア分析",
|
| 178 |
-
"attention_score": "注意焦点スコア(サンプル {n})",
|
| 179 |
-
"lyric_timestamps": "歌詞タイムスタンプ(サンプル {n})",
|
| 180 |
-
"attention_heatmap": "注意焦点スコアヒートマップ(サンプル {n})"
|
| 181 |
},
|
| 182 |
"messages": {
|
| 183 |
"no_audio_to_save": "❌ 保存するオーディオがありません",
|
|
@@ -206,6 +204,11 @@
|
|
| 206 |
"scoring_failed": "❌ エラー: バッチデータが見つかりません",
|
| 207 |
"no_codes": "❌ 利用可能なオーディオコードがありません。最初に音楽を生成してください。",
|
| 208 |
"score_failed": "❌ スコアリングに失敗しました: {error}",
|
| 209 |
-
"score_error": "❌ スコア計算エラー: {error}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
}
|
| 211 |
}
|
|
|
|
| 148 |
"cover_strength_info": "カバーモードを使用するデノイジングステップ数を制御",
|
| 149 |
"score_sensitivity_label": "品質スコア感度",
|
| 150 |
"score_sensitivity_info": "低い = より敏感(デフォルト: 1.0)。PMIが[0,1]にマッピングする方法を調整",
|
|
|
|
|
|
|
| 151 |
"think_label": "思考",
|
| 152 |
"parallel_thinking_label": "並列思考",
|
| 153 |
"generate_btn": "🎵 音楽を生成",
|
|
|
|
| 160 |
"send_to_src_btn": "🔗 ソースオーディオに送信",
|
| 161 |
"save_btn": "💾 保存",
|
| 162 |
"score_btn": "📊 スコア",
|
| 163 |
+
"lrc_btn": "🎵 LRC",
|
| 164 |
"quality_score_label": "品質スコア(サンプル {n})",
|
| 165 |
"quality_score_placeholder": "'スコア'をクリックしてパープレキシティベースの品質スコアを計算",
|
| 166 |
+
"lrc_label": "歌詞タイムスタンプ(サンプル {n})",
|
| 167 |
+
"lrc_placeholder": "'LRC'をクリックしてタイムスタンプを生成",
|
| 168 |
+
"details_accordion": "📊 スコア & LRC",
|
| 169 |
"generation_status": "生成ステータス",
|
| 170 |
"current_batch": "現在のバッチ",
|
| 171 |
"batch_indicator": "バッチ {current} / {total}",
|
|
|
|
| 175 |
"restore_params_btn": "↙️ これらの設定をUIに適用(バッチパラメータを復元)",
|
| 176 |
"batch_results_title": "📁 バッチ結果と生成詳細",
|
| 177 |
"all_files_label": "📁 すべての生成ファイル(ダウンロード)",
|
| 178 |
+
"generation_details": "生成詳細"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
},
|
| 180 |
"messages": {
|
| 181 |
"no_audio_to_save": "❌ 保存するオーディオがありません",
|
|
|
|
| 204 |
"scoring_failed": "❌ エラー: バッチデータが見つかりません",
|
| 205 |
"no_codes": "❌ 利用可能なオーディオコードがありません。最初に音楽を生成してください。",
|
| 206 |
"score_failed": "❌ スコアリングに失敗しました: {error}",
|
| 207 |
+
"score_error": "❌ スコア計算エラー: {error}",
|
| 208 |
+
"lrc_no_batch_data": "❌ バッチデータが見つかりません。最初に音楽を生成してください。",
|
| 209 |
+
"lrc_no_extra_outputs": "❌ 追加出力が見つかりません。条件テンソルが利用できません。",
|
| 210 |
+
"lrc_missing_tensors": "❌ LRC生成に必要なテンソルがありません。",
|
| 211 |
+
"lrc_sample_not_exist": "❌ 現在のバッチにサンプルが存在しません。",
|
| 212 |
+
"lrc_empty_result": "⚠️ LRC生成の結果が空です。"
|
| 213 |
}
|
| 214 |
}
|
acestep/gradio_ui/i18n/zh.json
CHANGED
|
@@ -148,8 +148,6 @@
|
|
| 148 |
"cover_strength_info": "控制使用覆盖模式的去噪步骤数量",
|
| 149 |
"score_sensitivity_label": "质量评分敏感度",
|
| 150 |
"score_sensitivity_info": "更低 = 更敏感(默认: 1.0). 调整PMI如何映射到[0,1]",
|
| 151 |
-
"attention_focus_label": "输出注意力焦点分数(已禁用)",
|
| 152 |
-
"attention_focus_info": "输出注意力焦点分数分析",
|
| 153 |
"think_label": "思考",
|
| 154 |
"parallel_thinking_label": "并行思考",
|
| 155 |
"generate_btn": "🎵 生成音乐",
|
|
@@ -162,8 +160,12 @@
|
|
| 162 |
"send_to_src_btn": "🔗 发送到源音频",
|
| 163 |
"save_btn": "💾 保存",
|
| 164 |
"score_btn": "📊 评分",
|
|
|
|
| 165 |
"quality_score_label": "质量分数(样本 {n})",
|
| 166 |
"quality_score_placeholder": "点击'评分'以计算基于困惑度的质量分数",
|
|
|
|
|
|
|
|
|
|
| 167 |
"generation_status": "生成状态",
|
| 168 |
"current_batch": "当前批次",
|
| 169 |
"batch_indicator": "批次 {current} / {total}",
|
|
@@ -173,11 +175,7 @@
|
|
| 173 |
"restore_params_btn": "↙️ 将这些设置应用到UI(恢复批次参数)",
|
| 174 |
"batch_results_title": "📁 批量结果和生成详情",
|
| 175 |
"all_files_label": "📁 所有生成的文件(下载)",
|
| 176 |
-
"generation_details": "生成详情"
|
| 177 |
-
"attention_analysis": "⚖️ 注意力焦点分数分析",
|
| 178 |
-
"attention_score": "注意力焦点分数(样本 {n})",
|
| 179 |
-
"lyric_timestamps": "歌词时间戳(样本 {n})",
|
| 180 |
-
"attention_heatmap": "注意力焦点分数热图(样本 {n})"
|
| 181 |
},
|
| 182 |
"messages": {
|
| 183 |
"no_audio_to_save": "❌ 没有要保存的音频",
|
|
@@ -206,6 +204,11 @@
|
|
| 206 |
"scoring_failed": "❌ 错误: 未找到批次数据",
|
| 207 |
"no_codes": "❌ 没有可用的音频代码。请先生成音乐。",
|
| 208 |
"score_failed": "❌ 评分失败: {error}",
|
| 209 |
-
"score_error": "❌ 计算分数时出错: {error}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
}
|
| 211 |
}
|
|
|
|
| 148 |
"cover_strength_info": "控制使用覆盖模式的去噪步骤数量",
|
| 149 |
"score_sensitivity_label": "质量评分敏感度",
|
| 150 |
"score_sensitivity_info": "更低 = 更敏感(默认: 1.0). 调整PMI如何映射到[0,1]",
|
|
|
|
|
|
|
| 151 |
"think_label": "思考",
|
| 152 |
"parallel_thinking_label": "并行思考",
|
| 153 |
"generate_btn": "🎵 生成音乐",
|
|
|
|
| 160 |
"send_to_src_btn": "🔗 发送到源音频",
|
| 161 |
"save_btn": "💾 保存",
|
| 162 |
"score_btn": "📊 评分",
|
| 163 |
+
"lrc_btn": "🎵 LRC",
|
| 164 |
"quality_score_label": "质量分数(样本 {n})",
|
| 165 |
"quality_score_placeholder": "点击'评分'以计算基于困惑度的质量分数",
|
| 166 |
+
"lrc_label": "歌词时间戳(样本 {n})",
|
| 167 |
+
"lrc_placeholder": "点击'LRC'生成时间戳",
|
| 168 |
+
"details_accordion": "📊 评分与LRC",
|
| 169 |
"generation_status": "生成状态",
|
| 170 |
"current_batch": "当前批次",
|
| 171 |
"batch_indicator": "批次 {current} / {total}",
|
|
|
|
| 175 |
"restore_params_btn": "↙️ 将这些设置应用到UI(恢复批次参数)",
|
| 176 |
"batch_results_title": "📁 批量结果和生成详情",
|
| 177 |
"all_files_label": "📁 所有生成的文件(下载)",
|
| 178 |
+
"generation_details": "生成详情"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
},
|
| 180 |
"messages": {
|
| 181 |
"no_audio_to_save": "❌ 没有要保存的音频",
|
|
|
|
| 204 |
"scoring_failed": "❌ 错误: 未找到批次数据",
|
| 205 |
"no_codes": "❌ 没有可用的音频代码。请先生成音乐。",
|
| 206 |
"score_failed": "❌ 评分失败: {error}",
|
| 207 |
+
"score_error": "❌ 计算分数时出错: {error}",
|
| 208 |
+
"lrc_no_batch_data": "❌ 未找到批次数据。请先生成音乐。",
|
| 209 |
+
"lrc_no_extra_outputs": "❌ 未找到额外输出。条件张量不可用。",
|
| 210 |
+
"lrc_missing_tensors": "❌ 缺少LRC生成所需的张量。",
|
| 211 |
+
"lrc_sample_not_exist": "❌ 当前批次中不存在该样本。",
|
| 212 |
+
"lrc_empty_result": "⚠️ LRC生成结果为空。"
|
| 213 |
}
|
| 214 |
}
|
acestep/gradio_ui/interfaces/result.py
CHANGED
|
@@ -50,11 +50,24 @@ def create_results_section(dit_handler) -> dict:
|
|
| 50 |
size="sm",
|
| 51 |
scale=1
|
| 52 |
)
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
with gr.Column(visible=True) as audio_col_2:
|
| 59 |
generated_audio_2 = gr.Audio(
|
| 60 |
label=t("results.generated_music", n=2),
|
|
@@ -81,11 +94,24 @@ def create_results_section(dit_handler) -> dict:
|
|
| 81 |
size="sm",
|
| 82 |
scale=1
|
| 83 |
)
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
with gr.Column(visible=False) as audio_col_3:
|
| 90 |
generated_audio_3 = gr.Audio(
|
| 91 |
label=t("results.generated_music", n=3),
|
|
@@ -112,11 +138,24 @@ def create_results_section(dit_handler) -> dict:
|
|
| 112 |
size="sm",
|
| 113 |
scale=1
|
| 114 |
)
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
with gr.Column(visible=False) as audio_col_4:
|
| 121 |
generated_audio_4 = gr.Audio(
|
| 122 |
label=t("results.generated_music", n=4),
|
|
@@ -143,11 +182,24 @@ def create_results_section(dit_handler) -> dict:
|
|
| 143 |
size="sm",
|
| 144 |
scale=1
|
| 145 |
)
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
|
| 152 |
# Second row for batch size 5-8 (initially hidden)
|
| 153 |
with gr.Row(visible=False) as audio_row_5_8:
|
|
@@ -162,11 +214,19 @@ def create_results_section(dit_handler) -> dict:
|
|
| 162 |
send_to_src_btn_5 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
|
| 163 |
save_btn_5 = gr.Button(t("results.save_btn"), variant="primary", size="sm", scale=1)
|
| 164 |
score_btn_5 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1)
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
with gr.Column() as audio_col_6:
|
| 171 |
generated_audio_6 = gr.Audio(
|
| 172 |
label=t("results.generated_music", n=6),
|
|
@@ -178,11 +238,19 @@ def create_results_section(dit_handler) -> dict:
|
|
| 178 |
send_to_src_btn_6 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
|
| 179 |
save_btn_6 = gr.Button(t("results.save_btn"), variant="primary", size="sm", scale=1)
|
| 180 |
score_btn_6 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1)
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
with gr.Column() as audio_col_7:
|
| 187 |
generated_audio_7 = gr.Audio(
|
| 188 |
label=t("results.generated_music", n=7),
|
|
@@ -194,11 +262,19 @@ def create_results_section(dit_handler) -> dict:
|
|
| 194 |
send_to_src_btn_7 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
|
| 195 |
save_btn_7 = gr.Button(t("results.save_btn"), variant="primary", size="sm", scale=1)
|
| 196 |
score_btn_7 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1)
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
with gr.Column() as audio_col_8:
|
| 203 |
generated_audio_8 = gr.Audio(
|
| 204 |
label=t("results.generated_music", n=8),
|
|
@@ -210,11 +286,19 @@ def create_results_section(dit_handler) -> dict:
|
|
| 210 |
send_to_src_btn_8 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
|
| 211 |
save_btn_8 = gr.Button(t("results.save_btn"), variant="primary", size="sm", scale=1)
|
| 212 |
score_btn_8 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1)
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
|
| 219 |
status_output = gr.Textbox(label=t("results.generation_status"), interactive=False)
|
| 220 |
|
|
@@ -262,17 +346,6 @@ def create_results_section(dit_handler) -> dict:
|
|
| 262 |
interactive=False
|
| 263 |
)
|
| 264 |
generation_info = gr.Markdown(label=t("results.generation_details"))
|
| 265 |
-
|
| 266 |
-
with gr.Accordion(t("results.attention_analysis"), open=False):
|
| 267 |
-
with gr.Row():
|
| 268 |
-
with gr.Column():
|
| 269 |
-
align_score_1 = gr.Textbox(label=t("results.attention_score", n=1), interactive=False)
|
| 270 |
-
align_text_1 = gr.Textbox(label=t("results.lyric_timestamps", n=1), interactive=False, lines=10)
|
| 271 |
-
align_plot_1 = gr.Plot(label=t("results.attention_heatmap", n=1))
|
| 272 |
-
with gr.Column():
|
| 273 |
-
align_score_2 = gr.Textbox(label=t("results.attention_score", n=2), interactive=False)
|
| 274 |
-
align_text_2 = gr.Textbox(label=t("results.lyric_timestamps", n=2), interactive=False, lines=10)
|
| 275 |
-
align_plot_2 = gr.Plot(label=t("results.attention_heatmap", n=2))
|
| 276 |
|
| 277 |
return {
|
| 278 |
"lm_metadata_state": lm_metadata_state,
|
|
@@ -337,13 +410,31 @@ def create_results_section(dit_handler) -> dict:
|
|
| 337 |
"score_display_6": score_display_6,
|
| 338 |
"score_display_7": score_display_7,
|
| 339 |
"score_display_8": score_display_8,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
"generated_audio_batch": generated_audio_batch,
|
| 341 |
"generation_info": generation_info,
|
| 342 |
-
"align_score_1": align_score_1,
|
| 343 |
-
"align_text_1": align_text_1,
|
| 344 |
-
"align_plot_1": align_plot_1,
|
| 345 |
-
"align_score_2": align_score_2,
|
| 346 |
-
"align_text_2": align_text_2,
|
| 347 |
-
"align_plot_2": align_plot_2,
|
| 348 |
}
|
| 349 |
|
|
|
|
| 50 |
size="sm",
|
| 51 |
scale=1
|
| 52 |
)
|
| 53 |
+
lrc_btn_1 = gr.Button(
|
| 54 |
+
t("results.lrc_btn"),
|
| 55 |
+
variant="secondary",
|
| 56 |
+
size="sm",
|
| 57 |
+
scale=1
|
| 58 |
+
)
|
| 59 |
+
with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_1:
|
| 60 |
+
score_display_1 = gr.Textbox(
|
| 61 |
+
label=t("results.quality_score_label", n=1),
|
| 62 |
+
interactive=False,
|
| 63 |
+
visible=False
|
| 64 |
+
)
|
| 65 |
+
lrc_display_1 = gr.Textbox(
|
| 66 |
+
label=t("results.lrc_label", n=1),
|
| 67 |
+
interactive=False,
|
| 68 |
+
lines=8,
|
| 69 |
+
visible=False
|
| 70 |
+
)
|
| 71 |
with gr.Column(visible=True) as audio_col_2:
|
| 72 |
generated_audio_2 = gr.Audio(
|
| 73 |
label=t("results.generated_music", n=2),
|
|
|
|
| 94 |
size="sm",
|
| 95 |
scale=1
|
| 96 |
)
|
| 97 |
+
lrc_btn_2 = gr.Button(
|
| 98 |
+
t("results.lrc_btn"),
|
| 99 |
+
variant="secondary",
|
| 100 |
+
size="sm",
|
| 101 |
+
scale=1
|
| 102 |
+
)
|
| 103 |
+
with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_2:
|
| 104 |
+
score_display_2 = gr.Textbox(
|
| 105 |
+
label=t("results.quality_score_label", n=2),
|
| 106 |
+
interactive=False,
|
| 107 |
+
visible=False
|
| 108 |
+
)
|
| 109 |
+
lrc_display_2 = gr.Textbox(
|
| 110 |
+
label=t("results.lrc_label", n=2),
|
| 111 |
+
interactive=False,
|
| 112 |
+
lines=8,
|
| 113 |
+
visible=False
|
| 114 |
+
)
|
| 115 |
with gr.Column(visible=False) as audio_col_3:
|
| 116 |
generated_audio_3 = gr.Audio(
|
| 117 |
label=t("results.generated_music", n=3),
|
|
|
|
| 138 |
size="sm",
|
| 139 |
scale=1
|
| 140 |
)
|
| 141 |
+
lrc_btn_3 = gr.Button(
|
| 142 |
+
t("results.lrc_btn"),
|
| 143 |
+
variant="secondary",
|
| 144 |
+
size="sm",
|
| 145 |
+
scale=1
|
| 146 |
+
)
|
| 147 |
+
with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_3:
|
| 148 |
+
score_display_3 = gr.Textbox(
|
| 149 |
+
label=t("results.quality_score_label", n=3),
|
| 150 |
+
interactive=False,
|
| 151 |
+
visible=False
|
| 152 |
+
)
|
| 153 |
+
lrc_display_3 = gr.Textbox(
|
| 154 |
+
label=t("results.lrc_label", n=3),
|
| 155 |
+
interactive=False,
|
| 156 |
+
lines=8,
|
| 157 |
+
visible=False
|
| 158 |
+
)
|
| 159 |
with gr.Column(visible=False) as audio_col_4:
|
| 160 |
generated_audio_4 = gr.Audio(
|
| 161 |
label=t("results.generated_music", n=4),
|
|
|
|
| 182 |
size="sm",
|
| 183 |
scale=1
|
| 184 |
)
|
| 185 |
+
lrc_btn_4 = gr.Button(
|
| 186 |
+
t("results.lrc_btn"),
|
| 187 |
+
variant="secondary",
|
| 188 |
+
size="sm",
|
| 189 |
+
scale=1
|
| 190 |
+
)
|
| 191 |
+
with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_4:
|
| 192 |
+
score_display_4 = gr.Textbox(
|
| 193 |
+
label=t("results.quality_score_label", n=4),
|
| 194 |
+
interactive=False,
|
| 195 |
+
visible=False
|
| 196 |
+
)
|
| 197 |
+
lrc_display_4 = gr.Textbox(
|
| 198 |
+
label=t("results.lrc_label", n=4),
|
| 199 |
+
interactive=False,
|
| 200 |
+
lines=8,
|
| 201 |
+
visible=False
|
| 202 |
+
)
|
| 203 |
|
| 204 |
# Second row for batch size 5-8 (initially hidden)
|
| 205 |
with gr.Row(visible=False) as audio_row_5_8:
|
|
|
|
| 214 |
send_to_src_btn_5 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
|
| 215 |
save_btn_5 = gr.Button(t("results.save_btn"), variant="primary", size="sm", scale=1)
|
| 216 |
score_btn_5 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1)
|
| 217 |
+
lrc_btn_5 = gr.Button(t("results.lrc_btn"), variant="secondary", size="sm", scale=1)
|
| 218 |
+
with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_5:
|
| 219 |
+
score_display_5 = gr.Textbox(
|
| 220 |
+
label=t("results.quality_score_label", n=5),
|
| 221 |
+
interactive=False,
|
| 222 |
+
visible=False
|
| 223 |
+
)
|
| 224 |
+
lrc_display_5 = gr.Textbox(
|
| 225 |
+
label=t("results.lrc_label", n=5),
|
| 226 |
+
interactive=False,
|
| 227 |
+
lines=8,
|
| 228 |
+
visible=False
|
| 229 |
+
)
|
| 230 |
with gr.Column() as audio_col_6:
|
| 231 |
generated_audio_6 = gr.Audio(
|
| 232 |
label=t("results.generated_music", n=6),
|
|
|
|
| 238 |
send_to_src_btn_6 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
|
| 239 |
save_btn_6 = gr.Button(t("results.save_btn"), variant="primary", size="sm", scale=1)
|
| 240 |
score_btn_6 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1)
|
| 241 |
+
lrc_btn_6 = gr.Button(t("results.lrc_btn"), variant="secondary", size="sm", scale=1)
|
| 242 |
+
with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_6:
|
| 243 |
+
score_display_6 = gr.Textbox(
|
| 244 |
+
label=t("results.quality_score_label", n=6),
|
| 245 |
+
interactive=False,
|
| 246 |
+
visible=False
|
| 247 |
+
)
|
| 248 |
+
lrc_display_6 = gr.Textbox(
|
| 249 |
+
label=t("results.lrc_label", n=6),
|
| 250 |
+
interactive=False,
|
| 251 |
+
lines=8,
|
| 252 |
+
visible=False
|
| 253 |
+
)
|
| 254 |
with gr.Column() as audio_col_7:
|
| 255 |
generated_audio_7 = gr.Audio(
|
| 256 |
label=t("results.generated_music", n=7),
|
|
|
|
| 262 |
send_to_src_btn_7 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
|
| 263 |
save_btn_7 = gr.Button(t("results.save_btn"), variant="primary", size="sm", scale=1)
|
| 264 |
score_btn_7 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1)
|
| 265 |
+
lrc_btn_7 = gr.Button(t("results.lrc_btn"), variant="secondary", size="sm", scale=1)
|
| 266 |
+
with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_7:
|
| 267 |
+
score_display_7 = gr.Textbox(
|
| 268 |
+
label=t("results.quality_score_label", n=7),
|
| 269 |
+
interactive=False,
|
| 270 |
+
visible=False
|
| 271 |
+
)
|
| 272 |
+
lrc_display_7 = gr.Textbox(
|
| 273 |
+
label=t("results.lrc_label", n=7),
|
| 274 |
+
interactive=False,
|
| 275 |
+
lines=8,
|
| 276 |
+
visible=False
|
| 277 |
+
)
|
| 278 |
with gr.Column() as audio_col_8:
|
| 279 |
generated_audio_8 = gr.Audio(
|
| 280 |
label=t("results.generated_music", n=8),
|
|
|
|
| 286 |
send_to_src_btn_8 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
|
| 287 |
save_btn_8 = gr.Button(t("results.save_btn"), variant="primary", size="sm", scale=1)
|
| 288 |
score_btn_8 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1)
|
| 289 |
+
lrc_btn_8 = gr.Button(t("results.lrc_btn"), variant="secondary", size="sm", scale=1)
|
| 290 |
+
with gr.Accordion(t("results.details_accordion"), open=False, visible=False) as details_accordion_8:
|
| 291 |
+
score_display_8 = gr.Textbox(
|
| 292 |
+
label=t("results.quality_score_label", n=8),
|
| 293 |
+
interactive=False,
|
| 294 |
+
visible=False
|
| 295 |
+
)
|
| 296 |
+
lrc_display_8 = gr.Textbox(
|
| 297 |
+
label=t("results.lrc_label", n=8),
|
| 298 |
+
interactive=False,
|
| 299 |
+
lines=8,
|
| 300 |
+
visible=False
|
| 301 |
+
)
|
| 302 |
|
| 303 |
status_output = gr.Textbox(label=t("results.generation_status"), interactive=False)
|
| 304 |
|
|
|
|
| 346 |
interactive=False
|
| 347 |
)
|
| 348 |
generation_info = gr.Markdown(label=t("results.generation_details"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
|
| 350 |
return {
|
| 351 |
"lm_metadata_state": lm_metadata_state,
|
|
|
|
| 410 |
"score_display_6": score_display_6,
|
| 411 |
"score_display_7": score_display_7,
|
| 412 |
"score_display_8": score_display_8,
|
| 413 |
+
"lrc_btn_1": lrc_btn_1,
|
| 414 |
+
"lrc_btn_2": lrc_btn_2,
|
| 415 |
+
"lrc_btn_3": lrc_btn_3,
|
| 416 |
+
"lrc_btn_4": lrc_btn_4,
|
| 417 |
+
"lrc_btn_5": lrc_btn_5,
|
| 418 |
+
"lrc_btn_6": lrc_btn_6,
|
| 419 |
+
"lrc_btn_7": lrc_btn_7,
|
| 420 |
+
"lrc_btn_8": lrc_btn_8,
|
| 421 |
+
"lrc_display_1": lrc_display_1,
|
| 422 |
+
"lrc_display_2": lrc_display_2,
|
| 423 |
+
"lrc_display_3": lrc_display_3,
|
| 424 |
+
"lrc_display_4": lrc_display_4,
|
| 425 |
+
"lrc_display_5": lrc_display_5,
|
| 426 |
+
"lrc_display_6": lrc_display_6,
|
| 427 |
+
"lrc_display_7": lrc_display_7,
|
| 428 |
+
"lrc_display_8": lrc_display_8,
|
| 429 |
+
"details_accordion_1": details_accordion_1,
|
| 430 |
+
"details_accordion_2": details_accordion_2,
|
| 431 |
+
"details_accordion_3": details_accordion_3,
|
| 432 |
+
"details_accordion_4": details_accordion_4,
|
| 433 |
+
"details_accordion_5": details_accordion_5,
|
| 434 |
+
"details_accordion_6": details_accordion_6,
|
| 435 |
+
"details_accordion_7": details_accordion_7,
|
| 436 |
+
"details_accordion_8": details_accordion_8,
|
| 437 |
"generated_audio_batch": generated_audio_batch,
|
| 438 |
"generation_info": generation_info,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
}
|
| 440 |
|
acestep/handler.py
CHANGED
|
@@ -31,6 +31,7 @@ from acestep.constants import (
|
|
| 31 |
SFT_GEN_PROMPT,
|
| 32 |
DEFAULT_DIT_INSTRUCTION,
|
| 33 |
)
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
warnings.filterwarnings("ignore")
|
|
@@ -65,13 +66,7 @@ class AceStepHandler:
|
|
| 65 |
self.batch_size = 2
|
| 66 |
|
| 67 |
# Custom layers config
|
| 68 |
-
self.custom_layers_config = {
|
| 69 |
-
2: [6, 7],
|
| 70 |
-
3: [10, 11],
|
| 71 |
-
4: [3],
|
| 72 |
-
5: [8, 9, 11],
|
| 73 |
-
6: [8]
|
| 74 |
-
}
|
| 75 |
self.offload_to_cpu = False
|
| 76 |
self.offload_dit_to_cpu = False
|
| 77 |
self.current_offload_cost = 0.0
|
|
@@ -1953,6 +1948,23 @@ class AceStepHandler:
|
|
| 1953 |
}
|
| 1954 |
logger.info("[service_generate] Generating audio...")
|
| 1955 |
with self._load_model_context("model"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1956 |
outputs = self.model.generate_audio(**generate_kwargs)
|
| 1957 |
|
| 1958 |
# Add intermediate information to outputs for extra_outputs
|
|
@@ -1962,6 +1974,12 @@ class AceStepHandler:
|
|
| 1962 |
outputs["spans"] = spans
|
| 1963 |
outputs["latent_masks"] = batch.get("latent_masks") # Latent masks for valid length
|
| 1964 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1965 |
return outputs
|
| 1966 |
|
| 1967 |
def tiled_decode(self, latents, chunk_size=512, overlap=64):
|
|
@@ -2268,16 +2286,27 @@ class AceStepHandler:
|
|
| 2268 |
spans = outputs.get("spans", []) # List of tuples
|
| 2269 |
latent_masks = outputs.get("latent_masks") # [batch, T]
|
| 2270 |
|
| 2271 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2272 |
extra_outputs = {
|
| 2273 |
-
"pred_latents": pred_latents.cpu() if pred_latents is not None else None,
|
| 2274 |
-
"target_latents": target_latents_input.cpu() if target_latents_input is not None else None,
|
| 2275 |
-
"src_latents": src_latents.cpu() if src_latents is not None else None,
|
| 2276 |
-
"chunk_masks": chunk_masks.cpu() if chunk_masks is not None else None,
|
| 2277 |
-
"latent_masks": latent_masks.cpu() if latent_masks is not None else None,
|
| 2278 |
"spans": spans,
|
| 2279 |
"time_costs": time_costs,
|
| 2280 |
"seed_value": seed_value_for_ui,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2281 |
}
|
| 2282 |
|
| 2283 |
# Build audios list with tensor data (no file paths, no UUIDs, handled outside)
|
|
@@ -2307,3 +2336,220 @@ class AceStepHandler:
|
|
| 2307 |
"success": False,
|
| 2308 |
"error": str(e),
|
| 2309 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
SFT_GEN_PROMPT,
|
| 32 |
DEFAULT_DIT_INSTRUCTION,
|
| 33 |
)
|
| 34 |
+
from acestep.dit_alignment_score import MusicStampsAligner
|
| 35 |
|
| 36 |
|
| 37 |
warnings.filterwarnings("ignore")
|
|
|
|
| 66 |
self.batch_size = 2
|
| 67 |
|
| 68 |
# Custom layers config
|
| 69 |
+
self.custom_layers_config = {2: [6], 3: [10, 11], 4: [3], 5: [8, 9], 6: [8]}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
self.offload_to_cpu = False
|
| 71 |
self.offload_dit_to_cpu = False
|
| 72 |
self.current_offload_cost = 0.0
|
|
|
|
| 1948 |
}
|
| 1949 |
logger.info("[service_generate] Generating audio...")
|
| 1950 |
with self._load_model_context("model"):
|
| 1951 |
+
# Prepare condition tensors first (for LRC timestamp generation)
|
| 1952 |
+
encoder_hidden_states, encoder_attention_mask, context_latents = self.model.prepare_condition(
|
| 1953 |
+
text_hidden_states=text_hidden_states,
|
| 1954 |
+
text_attention_mask=text_attention_mask,
|
| 1955 |
+
lyric_hidden_states=lyric_hidden_states,
|
| 1956 |
+
lyric_attention_mask=lyric_attention_mask,
|
| 1957 |
+
refer_audio_acoustic_hidden_states_packed=refer_audio_acoustic_hidden_states_packed,
|
| 1958 |
+
refer_audio_order_mask=refer_audio_order_mask,
|
| 1959 |
+
hidden_states=src_latents,
|
| 1960 |
+
attention_mask=torch.ones(src_latents.shape[0], src_latents.shape[1], device=src_latents.device, dtype=src_latents.dtype),
|
| 1961 |
+
silence_latent=self.silence_latent,
|
| 1962 |
+
src_latents=src_latents,
|
| 1963 |
+
chunk_masks=chunk_mask,
|
| 1964 |
+
is_covers=is_covers,
|
| 1965 |
+
precomputed_lm_hints_25Hz=precomputed_lm_hints_25Hz,
|
| 1966 |
+
)
|
| 1967 |
+
|
| 1968 |
outputs = self.model.generate_audio(**generate_kwargs)
|
| 1969 |
|
| 1970 |
# Add intermediate information to outputs for extra_outputs
|
|
|
|
| 1974 |
outputs["spans"] = spans
|
| 1975 |
outputs["latent_masks"] = batch.get("latent_masks") # Latent masks for valid length
|
| 1976 |
|
| 1977 |
+
# Add condition tensors for LRC timestamp generation
|
| 1978 |
+
outputs["encoder_hidden_states"] = encoder_hidden_states
|
| 1979 |
+
outputs["encoder_attention_mask"] = encoder_attention_mask
|
| 1980 |
+
outputs["context_latents"] = context_latents
|
| 1981 |
+
outputs["lyric_token_idss"] = lyric_token_idss
|
| 1982 |
+
|
| 1983 |
return outputs
|
| 1984 |
|
| 1985 |
def tiled_decode(self, latents, chunk_size=512, overlap=64):
|
|
|
|
| 2286 |
spans = outputs.get("spans", []) # List of tuples
|
| 2287 |
latent_masks = outputs.get("latent_masks") # [batch, T]
|
| 2288 |
|
| 2289 |
+
# Extract condition tensors for LRC timestamp generation
|
| 2290 |
+
encoder_hidden_states = outputs.get("encoder_hidden_states")
|
| 2291 |
+
encoder_attention_mask = outputs.get("encoder_attention_mask")
|
| 2292 |
+
context_latents = outputs.get("context_latents")
|
| 2293 |
+
lyric_token_idss = outputs.get("lyric_token_idss")
|
| 2294 |
+
|
| 2295 |
+
# Move all tensors to CPU to save VRAM (detach to release computation graph)
|
| 2296 |
extra_outputs = {
|
| 2297 |
+
"pred_latents": pred_latents.detach().cpu() if pred_latents is not None else None,
|
| 2298 |
+
"target_latents": target_latents_input.detach().cpu() if target_latents_input is not None else None,
|
| 2299 |
+
"src_latents": src_latents.detach().cpu() if src_latents is not None else None,
|
| 2300 |
+
"chunk_masks": chunk_masks.detach().cpu() if chunk_masks is not None else None,
|
| 2301 |
+
"latent_masks": latent_masks.detach().cpu() if latent_masks is not None else None,
|
| 2302 |
"spans": spans,
|
| 2303 |
"time_costs": time_costs,
|
| 2304 |
"seed_value": seed_value_for_ui,
|
| 2305 |
+
# Condition tensors for LRC timestamp generation
|
| 2306 |
+
"encoder_hidden_states": encoder_hidden_states.detach().cpu() if encoder_hidden_states is not None else None,
|
| 2307 |
+
"encoder_attention_mask": encoder_attention_mask.detach().cpu() if encoder_attention_mask is not None else None,
|
| 2308 |
+
"context_latents": context_latents.detach().cpu() if context_latents is not None else None,
|
| 2309 |
+
"lyric_token_idss": lyric_token_idss.detach().cpu() if lyric_token_idss is not None else None,
|
| 2310 |
}
|
| 2311 |
|
| 2312 |
# Build audios list with tensor data (no file paths, no UUIDs, handled outside)
|
|
|
|
| 2336 |
"success": False,
|
| 2337 |
"error": str(e),
|
| 2338 |
}
|
| 2339 |
+
|
| 2340 |
+
@torch.no_grad()
|
| 2341 |
+
def get_lyric_timestamp(
|
| 2342 |
+
self,
|
| 2343 |
+
pred_latent: torch.Tensor,
|
| 2344 |
+
encoder_hidden_states: torch.Tensor,
|
| 2345 |
+
encoder_attention_mask: torch.Tensor,
|
| 2346 |
+
context_latents: torch.Tensor,
|
| 2347 |
+
lyric_token_ids: torch.Tensor,
|
| 2348 |
+
total_duration_seconds: float,
|
| 2349 |
+
vocal_language: str = "en",
|
| 2350 |
+
inference_steps: int = 8,
|
| 2351 |
+
seed: int = 42,
|
| 2352 |
+
custom_layers_config: Optional[Dict] = None,
|
| 2353 |
+
) -> Dict[str, Any]:
|
| 2354 |
+
"""
|
| 2355 |
+
Generate lyrics timestamps from generated audio latents using cross-attention alignment.
|
| 2356 |
+
|
| 2357 |
+
This method adds noise to the final pred_latent and re-infers one step to get
|
| 2358 |
+
cross-attention matrices, then uses DTW to align lyrics tokens with audio frames.
|
| 2359 |
+
|
| 2360 |
+
Args:
|
| 2361 |
+
pred_latent: Generated latent tensor [batch, T, D]
|
| 2362 |
+
encoder_hidden_states: Cached encoder hidden states
|
| 2363 |
+
encoder_attention_mask: Cached encoder attention mask
|
| 2364 |
+
context_latents: Cached context latents
|
| 2365 |
+
lyric_token_ids: Tokenized lyrics tensor [batch, seq_len]
|
| 2366 |
+
total_duration_seconds: Total audio duration in seconds
|
| 2367 |
+
vocal_language: Language code for lyrics header parsing
|
| 2368 |
+
inference_steps: Number of inference steps (for noise level calculation)
|
| 2369 |
+
seed: Random seed for noise generation
|
| 2370 |
+
custom_layers_config: Dict mapping layer indices to head indices
|
| 2371 |
+
|
| 2372 |
+
Returns:
|
| 2373 |
+
Dict containing:
|
| 2374 |
+
- lrc_text: LRC formatted lyrics with timestamps
|
| 2375 |
+
- sentence_timestamps: List of SentenceTimestamp objects
|
| 2376 |
+
- token_timestamps: List of TokenTimestamp objects
|
| 2377 |
+
- success: Whether generation succeeded
|
| 2378 |
+
- error: Error message if failed
|
| 2379 |
+
"""
|
| 2380 |
+
from transformers.cache_utils import EncoderDecoderCache, DynamicCache
|
| 2381 |
+
|
| 2382 |
+
if self.model is None:
|
| 2383 |
+
return {
|
| 2384 |
+
"lrc_text": "",
|
| 2385 |
+
"sentence_timestamps": [],
|
| 2386 |
+
"token_timestamps": [],
|
| 2387 |
+
"success": False,
|
| 2388 |
+
"error": "Model not initialized"
|
| 2389 |
+
}
|
| 2390 |
+
|
| 2391 |
+
if custom_layers_config is None:
|
| 2392 |
+
custom_layers_config = self.custom_layers_config
|
| 2393 |
+
|
| 2394 |
+
try:
|
| 2395 |
+
# Move tensors to device
|
| 2396 |
+
device = self.device
|
| 2397 |
+
dtype = self.dtype
|
| 2398 |
+
|
| 2399 |
+
pred_latent = pred_latent.to(device=device, dtype=dtype)
|
| 2400 |
+
encoder_hidden_states = encoder_hidden_states.to(device=device, dtype=dtype)
|
| 2401 |
+
encoder_attention_mask = encoder_attention_mask.to(device=device, dtype=dtype)
|
| 2402 |
+
context_latents = context_latents.to(device=device, dtype=dtype)
|
| 2403 |
+
|
| 2404 |
+
bsz = pred_latent.shape[0]
|
| 2405 |
+
|
| 2406 |
+
# Calculate noise level: t_last = 1.0 / inference_steps
|
| 2407 |
+
t_last_val = 1.0 / inference_steps
|
| 2408 |
+
t_curr_tensor = torch.tensor([t_last_val] * bsz, device=device, dtype=dtype)
|
| 2409 |
+
|
| 2410 |
+
x1 = pred_latent
|
| 2411 |
+
|
| 2412 |
+
# Generate noise
|
| 2413 |
+
if seed is None:
|
| 2414 |
+
x0 = torch.randn_like(x1)
|
| 2415 |
+
else:
|
| 2416 |
+
generator = torch.Generator(device=device).manual_seed(int(seed))
|
| 2417 |
+
x0 = torch.randn(x1.shape, generator=generator, device=device, dtype=dtype)
|
| 2418 |
+
|
| 2419 |
+
# Add noise to pred_latent: xt = t * noise + (1 - t) * x1
|
| 2420 |
+
xt = t_last_val * x0 + (1.0 - t_last_val) * x1
|
| 2421 |
+
|
| 2422 |
+
xt_in = xt
|
| 2423 |
+
t_in = t_curr_tensor
|
| 2424 |
+
|
| 2425 |
+
# Get null condition embedding
|
| 2426 |
+
encoder_hidden_states_in = encoder_hidden_states
|
| 2427 |
+
encoder_attention_mask_in = encoder_attention_mask
|
| 2428 |
+
context_latents_in = context_latents
|
| 2429 |
+
latent_length = x1.shape[1]
|
| 2430 |
+
attention_mask = torch.ones(bsz, latent_length, device=device, dtype=dtype)
|
| 2431 |
+
attention_mask_in = attention_mask
|
| 2432 |
+
past_key_values = None
|
| 2433 |
+
|
| 2434 |
+
# Run decoder with output_attentions=True
|
| 2435 |
+
with self._load_model_context("model"):
|
| 2436 |
+
decoder = self.model.decoder
|
| 2437 |
+
decoder_outputs = decoder(
|
| 2438 |
+
hidden_states=xt_in,
|
| 2439 |
+
timestep=t_in,
|
| 2440 |
+
timestep_r=t_in,
|
| 2441 |
+
attention_mask=attention_mask_in,
|
| 2442 |
+
encoder_hidden_states=encoder_hidden_states_in,
|
| 2443 |
+
use_cache=False,
|
| 2444 |
+
past_key_values=past_key_values,
|
| 2445 |
+
encoder_attention_mask=encoder_attention_mask_in,
|
| 2446 |
+
context_latents=context_latents_in,
|
| 2447 |
+
output_attentions=True,
|
| 2448 |
+
custom_layers_config=custom_layers_config,
|
| 2449 |
+
enable_early_exit=True
|
| 2450 |
+
)
|
| 2451 |
+
|
| 2452 |
+
# Extract cross-attention matrices
|
| 2453 |
+
if decoder_outputs[2] is None:
|
| 2454 |
+
return {
|
| 2455 |
+
"lrc_text": "",
|
| 2456 |
+
"sentence_timestamps": [],
|
| 2457 |
+
"token_timestamps": [],
|
| 2458 |
+
"success": False,
|
| 2459 |
+
"error": "Model did not return attentions"
|
| 2460 |
+
}
|
| 2461 |
+
|
| 2462 |
+
cross_attns = decoder_outputs[2] # Tuple of tensors (some may be None)
|
| 2463 |
+
|
| 2464 |
+
captured_layers_list = []
|
| 2465 |
+
for layer_attn in cross_attns:
|
| 2466 |
+
# Skip None values (layers that didn't return attention)
|
| 2467 |
+
if layer_attn is None:
|
| 2468 |
+
continue
|
| 2469 |
+
# Only take conditional part (first half of batch)
|
| 2470 |
+
cond_attn = layer_attn[:bsz]
|
| 2471 |
+
layer_matrix = cond_attn.transpose(-1, -2)
|
| 2472 |
+
captured_layers_list.append(layer_matrix)
|
| 2473 |
+
|
| 2474 |
+
if not captured_layers_list:
|
| 2475 |
+
return {
|
| 2476 |
+
"lrc_text": "",
|
| 2477 |
+
"sentence_timestamps": [],
|
| 2478 |
+
"token_timestamps": [],
|
| 2479 |
+
"success": False,
|
| 2480 |
+
"error": "No valid attention layers returned"
|
| 2481 |
+
}
|
| 2482 |
+
|
| 2483 |
+
stacked = torch.stack(captured_layers_list)
|
| 2484 |
+
if bsz == 1:
|
| 2485 |
+
all_layers_matrix = stacked.squeeze(1)
|
| 2486 |
+
else:
|
| 2487 |
+
all_layers_matrix = stacked
|
| 2488 |
+
|
| 2489 |
+
# Process lyric token IDs to extract pure lyrics
|
| 2490 |
+
if isinstance(lyric_token_ids, torch.Tensor):
|
| 2491 |
+
raw_lyric_ids = lyric_token_ids[0].tolist()
|
| 2492 |
+
else:
|
| 2493 |
+
raw_lyric_ids = lyric_token_ids
|
| 2494 |
+
|
| 2495 |
+
# Parse header to find lyrics start position
|
| 2496 |
+
header_str = f"# Languages\n{vocal_language}\n\n# Lyric\n"
|
| 2497 |
+
header_ids = self.text_tokenizer.encode(header_str, add_special_tokens=False)
|
| 2498 |
+
start_idx = len(header_ids)
|
| 2499 |
+
|
| 2500 |
+
# Find end of lyrics (before endoftext token)
|
| 2501 |
+
try:
|
| 2502 |
+
end_idx = raw_lyric_ids.index(151643) # <|endoftext|> token
|
| 2503 |
+
except ValueError:
|
| 2504 |
+
end_idx = len(raw_lyric_ids)
|
| 2505 |
+
|
| 2506 |
+
pure_lyric_ids = raw_lyric_ids[start_idx:end_idx]
|
| 2507 |
+
pure_lyric_matrix = all_layers_matrix[:, :, start_idx:end_idx, :]
|
| 2508 |
+
|
| 2509 |
+
# Create aligner and generate timestamps
|
| 2510 |
+
aligner = MusicStampsAligner(self.text_tokenizer)
|
| 2511 |
+
|
| 2512 |
+
align_info = aligner.stamps_align_info(
|
| 2513 |
+
attention_matrix=pure_lyric_matrix,
|
| 2514 |
+
lyrics_tokens=pure_lyric_ids,
|
| 2515 |
+
total_duration_seconds=total_duration_seconds,
|
| 2516 |
+
custom_config=custom_layers_config,
|
| 2517 |
+
return_matrices=False,
|
| 2518 |
+
violence_level=2.0,
|
| 2519 |
+
medfilt_width=1,
|
| 2520 |
+
)
|
| 2521 |
+
|
| 2522 |
+
if align_info.get("calc_matrix") is None:
|
| 2523 |
+
return {
|
| 2524 |
+
"lrc_text": "",
|
| 2525 |
+
"sentence_timestamps": [],
|
| 2526 |
+
"token_timestamps": [],
|
| 2527 |
+
"success": False,
|
| 2528 |
+
"error": align_info.get("error", "Failed to process attention matrix")
|
| 2529 |
+
}
|
| 2530 |
+
|
| 2531 |
+
# Generate timestamps
|
| 2532 |
+
result = aligner.get_timestamps_and_lrc(
|
| 2533 |
+
calc_matrix=align_info["calc_matrix"],
|
| 2534 |
+
lyrics_tokens=pure_lyric_ids,
|
| 2535 |
+
total_duration_seconds=total_duration_seconds
|
| 2536 |
+
)
|
| 2537 |
+
|
| 2538 |
+
return {
|
| 2539 |
+
"lrc_text": result["lrc_text"],
|
| 2540 |
+
"sentence_timestamps": result["sentence_timestamps"],
|
| 2541 |
+
"token_timestamps": result["token_timestamps"],
|
| 2542 |
+
"success": True,
|
| 2543 |
+
"error": None
|
| 2544 |
+
}
|
| 2545 |
+
|
| 2546 |
+
except Exception as e:
|
| 2547 |
+
error_msg = f"Error generating timestamps: {str(e)}"
|
| 2548 |
+
logger.exception("[get_lyric_timestamp] Failed")
|
| 2549 |
+
return {
|
| 2550 |
+
"lrc_text": "",
|
| 2551 |
+
"sentence_timestamps": [],
|
| 2552 |
+
"token_timestamps": [],
|
| 2553 |
+
"success": False,
|
| 2554 |
+
"error": error_msg
|
| 2555 |
+
}
|
pyproject.toml
CHANGED
|
@@ -30,7 +30,7 @@ dependencies = [
|
|
| 30 |
"uvicorn[standard]>=0.27.0",
|
| 31 |
|
| 32 |
# Local third-party packages
|
| 33 |
-
"nano-vllm @
|
| 34 |
]
|
| 35 |
|
| 36 |
[project.scripts]
|
|
@@ -41,8 +41,8 @@ acestep-api = "acestep.api_server:main"
|
|
| 41 |
requires = ["hatchling"]
|
| 42 |
build-backend = "hatchling.build"
|
| 43 |
|
| 44 |
-
[
|
| 45 |
-
dev
|
| 46 |
|
| 47 |
[[tool.uv.index]]
|
| 48 |
name = "pytorch"
|
|
|
|
| 30 |
"uvicorn[standard]>=0.27.0",
|
| 31 |
|
| 32 |
# Local third-party packages
|
| 33 |
+
"nano-vllm @ {root:uri}/acestep/third_parts/nano-vllm",
|
| 34 |
]
|
| 35 |
|
| 36 |
[project.scripts]
|
|
|
|
| 41 |
requires = ["hatchling"]
|
| 42 |
build-backend = "hatchling.build"
|
| 43 |
|
| 44 |
+
[dependency-groups]
|
| 45 |
+
dev = []
|
| 46 |
|
| 47 |
[[tool.uv.index]]
|
| 48 |
name = "pytorch"
|