keylxiao committed on
Commit
5ab4485
·
1 Parent(s): 745e016

feat :sparkles: : add lyrics alignment scores

Browse files
acestep/dit_alignment_score.py CHANGED
@@ -11,7 +11,7 @@ import torch
11
  import numpy as np
12
  import torch.nn.functional as F
13
  from dataclasses import dataclass, asdict
14
- from typing import List, Dict, Any, Optional
15
 
16
 
17
  # ================= Data Classes =================
@@ -545,3 +545,326 @@ class MusicStampsAligner:
545
  "lrc_text": lrc_text
546
  }
547
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  import numpy as np
12
  import torch.nn.functional as F
13
  from dataclasses import dataclass, asdict
14
+ from typing import List, Dict, Any, Optional, Tuple, Union
15
 
16
 
17
  # ================= Data Classes =================
 
545
  "lrc_text": lrc_text
546
  }
547
 
548
+
549
class MusicLyricScorer:
    """Scorer for evaluating lyrics-to-audio alignment quality.

    Computes alignment quality metrics (Coverage, Monotonicity, Confidence)
    from cross-attention matrices using tensor operations, so the math can
    run on GPU and remain potentially differentiable.
    """

    def __init__(self, tokenizer: Any):
        """Initialize the scorer.

        Args:
            tokenizer: Tokenizer instance (must implement .decode()).
        """
        self.tokenizer = tokenizer

    def _generate_token_type_mask(self, token_ids: List[int]) -> np.ndarray:
        """Generate a mask distinguishing lyrics (1) from structural tags (0).

        Tokens inside square brackets (e.g. "[verse]") are treated as
        structural tags. Uses self.tokenizer to decode tokens one at a time.

        Args:
            token_ids: List of token IDs.

        Returns:
            Numpy int32 array of shape [len(token_ids)] with 1 (lyrics) or 0 (tag).
        """
        decoded_tokens = [self.tokenizer.decode([tid]) for tid in token_ids]
        mask = np.ones(len(token_ids), dtype=np.int32)
        in_bracket = False

        for i, token_str in enumerate(decoded_tokens):
            if '[' in token_str:
                in_bracket = True
            if in_bracket:
                mask[i] = 0
            if ']' in token_str:
                in_bracket = False
                # The token carrying the closing bracket is itself a tag token.
                mask[i] = 0
        return mask

    def _preprocess_attention(
        self,
        attention_matrix: Union[torch.Tensor, np.ndarray],
        custom_config: Dict[int, List[int]],
        medfilt_width: int = 1
    ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[torch.Tensor]]:
        """Extract and normalize the attention matrix.

        Logic V4: uses Min-Max normalization to highlight energy differences.

        Args:
            attention_matrix: Raw attention tensor [Layers, Heads, Tokens, Frames].
            custom_config: Config mapping layer indices to lists of head indices.
            medfilt_width: Width for median filtering.

        Returns:
            Tuple of (calc_matrix, energy_matrix, avg_weights_tensor); all three
            are None when the config selects no valid heads.
        """
        # 1. Prepare tensor on CPU in float32 (clone to avoid mutating the input).
        if not isinstance(attention_matrix, torch.Tensor):
            weights = torch.tensor(attention_matrix)
        else:
            weights = attention_matrix.clone()
        weights = weights.cpu().float()

        # 2. Select heads based on config; out-of-range entries are skipped silently.
        selected_tensors = []
        for layer_idx, head_indices in custom_config.items():
            for head_idx in head_indices:
                if layer_idx < weights.shape[0] and head_idx < weights.shape[1]:
                    selected_tensors.append(weights[layer_idx, head_idx])

        if not selected_tensors:
            return None, None, None

        weights_stack = torch.stack(selected_tensors, dim=0)

        # 3. Average selected heads -> [Tokens, Frames]
        avg_weights = weights_stack.mean(dim=0)

        # 4. Median filter, then Min-Max normalize to [0, 1].
        # `median_filter` is a module-level helper defined elsewhere in this file.
        energy_tensor = median_filter(avg_weights, filter_width=medfilt_width)
        energy_matrix = energy_tensor.numpy()

        e_min, e_max = energy_matrix.min(), energy_matrix.max()

        if e_max - e_min > 1e-9:
            energy_matrix = (energy_matrix - e_min) / (e_max - e_min)
        else:
            # Constant matrix: no usable signal, zero it out.
            energy_matrix = np.zeros_like(energy_matrix)

        # Contrast enhancement for DTW pathfinding:
        # calc_matrix drives pathfinding, energy_matrix drives scoring.
        calc_matrix = energy_matrix ** 2

        return calc_matrix, energy_matrix, avg_weights

    def _compute_alignment_metrics(
        self,
        energy_matrix: torch.Tensor,
        path_coords: torch.Tensor,
        type_mask: torch.Tensor,
        time_weight: float = 0.01,
        overlap_frames: float = 9.0,
        instrumental_weight: float = 1.0
    ) -> Tuple[float, float, float]:
        """Core metric calculation using high-precision tensor operations.

        Args:
            energy_matrix: Normalized energy [Rows, Cols].
            path_coords: DTW path coordinates [Steps, 2] as (row, col) pairs.
            type_mask: Token type mask [Rows] (1=Lyrics, 0=Tags).
            time_weight: Minimum energy threshold for monotonicity.
            overlap_frames: Allowed backward overlap for the monotonicity check.
            instrumental_weight: Weight for non-lyric tokens in confidence calc.

        Returns:
            Tuple of (coverage, monotonicity, confidence) as Python floats.
        """
        # Ensure high precision for internal calculation.
        energy_matrix = energy_matrix.to(dtype=torch.float64)
        path_coords = path_coords.long()
        type_mask = type_mask.long()

        device = energy_matrix.device
        rows, cols = energy_matrix.shape

        is_lyrics_row = (type_mask == 1)

        # ================= A. Coverage Score =================
        # Ratio of lyric rows that have a significant energy peak.
        row_max_energies = energy_matrix.max(dim=1).values
        total_sung_rows = is_lyrics_row.sum().double()

        coverage_threshold = 0.1
        valid_sung_mask = is_lyrics_row & (row_max_energies > coverage_threshold)
        valid_sung_rows = valid_sung_mask.sum().double()

        if total_sung_rows > 0:
            coverage_score = valid_sung_rows / total_sung_rows
        else:
            # No lyric rows at all: vacuously covered.
            coverage_score = torch.tensor(1.0, device=device, dtype=torch.float64)

        # ================= B. Monotonicity Score =================
        # Check whether the "center of mass" of lyric rows moves forward in time.
        col_indices = torch.arange(cols, device=device, dtype=torch.float64)

        # Zero out low-energy noise below the time_weight threshold.
        weights = torch.where(
            energy_matrix > time_weight,
            energy_matrix,
            torch.zeros_like(energy_matrix)
        )

        sum_w = weights.sum(dim=1)
        sum_t = (weights * col_indices).sum(dim=1)

        # Calculate per-row centroids; -1 marks rows with no usable energy.
        centroids = torch.full((rows,), -1.0, device=device, dtype=torch.float64)
        valid_w_mask = sum_w > 1e-9
        centroids[valid_w_mask] = sum_t[valid_w_mask] / sum_w[valid_w_mask]

        # Extract the sequence of valid lyric centroids.
        valid_sequence_mask = is_lyrics_row & (centroids >= 0)
        sung_centroids = centroids[valid_sequence_mask]

        cnt = sung_centroids.shape[0]
        if cnt > 1:
            curr_c = sung_centroids[:-1]
            next_c = sung_centroids[1:]

            # Check non-decreasing order with overlap tolerance.
            non_decreasing = (next_c >= (curr_c - overlap_frames)).double().sum()
            pairs = torch.tensor(cnt - 1, device=device, dtype=torch.float64)
            monotonicity_score = non_decreasing / pairs
        else:
            # Fewer than two centroids: trivially monotonic.
            monotonicity_score = torch.tensor(1.0, device=device, dtype=torch.float64)

        # ================= C. Path Confidence =================
        # Weighted average energy along the optimal DTW path.
        if path_coords.shape[0] > 0:
            p_rows = path_coords[:, 0]
            p_cols = path_coords[:, 1]

            path_energies = energy_matrix[p_rows, p_cols]
            step_weights = torch.ones_like(path_energies)

            # Lower weight for instrumental/tag steps.
            is_inst_step = (type_mask[p_rows] == 0)
            step_weights[is_inst_step] = instrumental_weight

            total_energy = (path_energies * step_weights).sum()
            total_steps = step_weights.sum()

            if total_steps > 0:
                path_confidence = total_energy / total_steps
            else:
                path_confidence = torch.tensor(0.0, device=device, dtype=torch.float64)
        else:
            path_confidence = torch.tensor(0.0, device=device, dtype=torch.float64)

        return coverage_score.item(), monotonicity_score.item(), path_confidence.item()

    def lyrics_alignment_info(
        self,
        attention_matrix: Union[torch.Tensor, np.ndarray],
        token_ids: List[int],
        custom_config: Dict[int, List[int]],
        return_matrices: bool = False,
        medfilt_width: int = 1
    ) -> Dict[str, Any]:
        """Generate the DTW alignment path and processed matrices.

        Args:
            attention_matrix: Input attention tensor.
            token_ids: Corresponding token IDs.
            custom_config: Layer/Head configuration.
            return_matrices: If True, also returns calc/vis matrices in the output.
            medfilt_width: Median filter width.

        Returns:
            Dict with "path_coords", "type_mask" and "energy_matrix" on success,
            or {"calc_matrix": None, "error": ...} when no valid heads were found.
        """
        calc_matrix, energy_matrix, vis_matrix = self._preprocess_attention(
            attention_matrix, custom_config, medfilt_width
        )

        if calc_matrix is None:
            return {
                "calc_matrix": None,
                "error": "No valid attention heads found"
            }

        # 1. Generate semantic mask (1=Lyrics, 0=Tags); uses self.tokenizer internally.
        type_mask = self._generate_token_type_mask(token_ids)

        # Safety check for shape mismatch.
        if len(type_mask) != energy_matrix.shape[0]:
            # Fallback to all lyrics if shapes don't align.
            type_mask = np.ones(energy_matrix.shape[0], dtype=np.int32)

        # 2. DTW pathfinding. Negative calc_matrix because DTW minimizes cost.
        # `dtw_cpu` is a module-level helper defined elsewhere in this file.
        text_indices, time_indices = dtw_cpu(-calc_matrix.astype(np.float32))
        path_coords = np.stack([text_indices, time_indices], axis=1)

        return_dict = {
            "path_coords": path_coords,
            "type_mask": type_mask,
            "energy_matrix": energy_matrix
        }
        if return_matrices:
            return_dict['calc_matrix'] = calc_matrix
            return_dict['vis_matrix'] = vis_matrix

        return return_dict

    def calculate_score(
        self,
        energy_matrix: Union[torch.Tensor, np.ndarray],
        type_mask: Union[torch.Tensor, np.ndarray],
        path_coords: Union[torch.Tensor, np.ndarray],
        time_weight: float = 0.01,
        overlap_frames: float = 9.0,
        instrumental_weight: float = 1.0
    ) -> Dict[str, Any]:
        """Calculate the final alignment score from pre-computed components.

        Args:
            energy_matrix: Processed energy matrix.
            type_mask: Token type mask.
            path_coords: DTW path coordinates.
            time_weight: Minimum energy threshold for monotonicity.
            overlap_frames: Allowed backward movement in frames.
            instrumental_weight: Weight for non-lyric path steps.

        Returns:
            Dict with "lyrics_score": final score rounded to 4 decimals,
            computed as Coverage^2 * Monotonicity^2 * Confidence, clipped to [0, 1].
        """
        # Ensure inputs are tensors on a consistent device.
        if not isinstance(energy_matrix, torch.Tensor):
            # FIX: do not hard-code 'cuda' here — fall back to CPU when CUDA
            # is unavailable so scoring works on CPU-only machines.
            target_device = 'cuda' if torch.cuda.is_available() else 'cpu'
            energy_matrix = torch.tensor(
                energy_matrix, device=target_device, dtype=torch.float32
            )

        device = energy_matrix.device

        if not isinstance(type_mask, torch.Tensor):
            type_mask = torch.tensor(type_mask, device=device, dtype=torch.long)
        else:
            type_mask = type_mask.to(device=device, dtype=torch.long)

        if not isinstance(path_coords, torch.Tensor):
            path_coords = torch.tensor(path_coords, device=device, dtype=torch.long)
        else:
            path_coords = path_coords.to(device=device, dtype=torch.long)

        # Compute metrics.
        coverage, monotonicity, confidence = self._compute_alignment_metrics(
            energy_matrix=energy_matrix,
            path_coords=path_coords,
            type_mask=type_mask,
            time_weight=time_weight,
            overlap_frames=overlap_frames,
            instrumental_weight=instrumental_weight
        )

        # Final score: (Cov^2 * Mono^2 * Conf), clipped to [0, 1].
        final_score = (coverage ** 2) * (monotonicity ** 2) * confidence
        final_score = float(np.clip(final_score, 0.0, 1.0))

        return {
            "lyrics_score": round(final_score, 4)
        }
acestep/gradio_ui/events/__init__.py CHANGED
@@ -336,7 +336,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
336
  # Use default argument to capture btn_idx value at definition time (Python closure fix)
337
  def make_score_handler(idx):
338
  return lambda scale, batch_idx, queue: res_h.calculate_score_handler_with_selection(
339
- llm_handler, idx, scale, batch_idx, queue
340
  )
341
 
342
  for btn_idx in range(1, 9):
 
336
  # Use default argument to capture btn_idx value at definition time (Python closure fix)
337
  def make_score_handler(idx):
338
  return lambda scale, batch_idx, queue: res_h.calculate_score_handler_with_selection(
339
+ dit_handler, llm_handler, idx, scale, batch_idx, queue
340
  )
341
 
342
  for btn_idx in range(1, 9):
acestep/gradio_ui/events/results_handlers.py CHANGED
@@ -714,7 +714,22 @@ def generate_with_progress(
714
 
715
 
716
 
717
- def calculate_score_handler(llm_handler, audio_codes_str, caption, lyrics, lm_metadata, bpm, key_scale, time_signature, audio_duration, vocal_language, score_scale):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
718
  """
719
  Calculate PMI-based quality score for generated audio.
720
 
@@ -733,6 +748,9 @@ def calculate_score_handler(llm_handler, audio_codes_str, caption, lyrics, lm_me
733
  audio_duration: Audio duration value
734
  vocal_language: Vocal language value
735
  score_scale: Sensitivity scale parameter
 
 
 
736
 
737
  Returns:
738
  Score display string
@@ -791,7 +809,37 @@ def calculate_score_handler(llm_handler, audio_codes_str, caption, lyrics, lm_me
791
  topk=10,
792
  score_scale=score_scale
793
  )
794
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
795
  # Format display string with per-condition breakdown
796
  if global_score == 0.0 and not scores_per_condition:
797
  return t("messages.score_failed", error=status)
@@ -804,12 +852,17 @@ def calculate_score_handler(llm_handler, audio_codes_str, caption, lyrics, lm_me
804
  )
805
 
806
  conditions_display = "\n".join(condition_lines) if condition_lines else " (no conditions)"
807
-
808
- return (
809
  f"✅ Global Quality Score: {global_score:.4f} (0-1, higher=better)\n\n"
810
- f"📊 Per-Condition Scores (0-1):\n{conditions_display}\n\n"
811
- f"Note: Metadata uses Top-k Recall, Caption/Lyrics use PMI\n"
812
  )
 
 
 
 
 
 
813
 
814
  except Exception as e:
815
  import traceback
@@ -817,12 +870,19 @@ def calculate_score_handler(llm_handler, audio_codes_str, caption, lyrics, lm_me
817
  return error_msg
818
 
819
 
820
- def calculate_score_handler_with_selection(llm_handler, sample_idx, score_scale, current_batch_index, batch_queue):
 
 
 
 
 
 
821
  """
822
  Calculate PMI-based quality score - REFACTORED to read from batch_queue only.
823
  This ensures scoring uses the actual generation parameters, not current UI values.
824
 
825
  Args:
 
826
  llm_handler: LLM handler instance
827
  sample_idx: Which sample to score (1-8)
828
  score_scale: Sensitivity scale parameter (tool setting, can be from UI)
@@ -843,6 +903,7 @@ def calculate_score_handler_with_selection(llm_handler, sample_idx, score_scale,
843
  time_signature = params.get("time_signature", "")
844
  audio_duration = params.get("audio_duration", -1)
845
  vocal_language = params.get("vocal_language", "")
 
846
 
847
  # Get LM metadata from batch_data (if it was saved during generation)
848
  lm_metadata = batch_data.get("lm_generated_metadata", None)
@@ -862,13 +923,51 @@ def calculate_score_handler_with_selection(llm_handler, sample_idx, score_scale,
862
  else:
863
  # Single mode: all samples use same codes
864
  audio_codes_str = stored_codes if isinstance(stored_codes, str) else ""
865
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
866
  # Calculate score using historical parameters
867
  score_display = calculate_score_handler(
868
  llm_handler,
869
  audio_codes_str, caption, lyrics, lm_metadata,
870
  bpm, key_scale, time_signature, audio_duration, vocal_language,
871
- score_scale
 
 
 
872
  )
873
 
874
  # Update batch_queue with the calculated score
 
714
 
715
 
716
 
717
+ def calculate_score_handler(
718
+ llm_handler,
719
+ audio_codes_str,
720
+ caption,
721
+ lyrics,
722
+ lm_metadata,
723
+ bpm,
724
+ key_scale,
725
+ time_signature,
726
+ audio_duration,
727
+ vocal_language,
728
+ score_scale,
729
+ dit_handler,
730
+ extra_tensor_data,
731
+ inference_steps,
732
+ ):
733
  """
734
  Calculate PMI-based quality score for generated audio.
735
 
 
748
  audio_duration: Audio duration value
749
  vocal_language: Vocal language value
750
  score_scale: Sensitivity scale parameter
751
+ dit_handler: DiT handler instance (for alignment scoring)
752
+ extra_tensor_data: Dictionary containing tensors for the specific sample
753
+ inference_steps: Number of inference steps used
754
 
755
  Returns:
756
  Score display string
 
809
  topk=10,
810
  score_scale=score_scale
811
  )
812
+
813
+ alignment_report = ""
814
+
815
+ # Only calculate if we have the handler, tensor data, and actual lyrics
816
+ if dit_handler and extra_tensor_data and lyrics and lyrics.strip():
817
+ try:
818
+ align_result = dit_handler.get_lyric_score(
819
+ pred_latent=extra_tensor_data.get('pred_latent'),
820
+ encoder_hidden_states=extra_tensor_data.get('encoder_hidden_states'),
821
+ encoder_attention_mask=extra_tensor_data.get('encoder_attention_mask'),
822
+ context_latents=extra_tensor_data.get('context_latents'),
823
+ lyric_token_ids=extra_tensor_data.get('lyric_token_ids'),
824
+ vocal_language=vocal_language or "en",
825
+ inference_steps=int(inference_steps),
826
+ seed=42,
827
+ )
828
+
829
+ if align_result.get("success"):
830
+ lm_align_score = align_result.get("lm_score", 0.0)
831
+ dit_align_score = align_result.get("dit_score", 0.0)
832
+ alignment_report = (
833
+ f" • llm lyrics alignment score: {lm_align_score:.4f}\n"
834
+ f" • dit lyrics alignment score: {dit_align_score:.4f}\n"
835
+ "\n(Measures how well lyrics timestamps match audio energy using Cross-Attention)"
836
+ )
837
+ else:
838
+ align_err = align_result.get("error", "Unknown error")
839
+ alignment_report = f"\n⚠️ Alignment Score Failed: {align_err}"
840
+ except Exception as e:
841
+ alignment_report = f"\n⚠️ Alignment Score Error: {str(e)}"
842
+
843
  # Format display string with per-condition breakdown
844
  if global_score == 0.0 and not scores_per_condition:
845
  return t("messages.score_failed", error=status)
 
852
  )
853
 
854
  conditions_display = "\n".join(condition_lines) if condition_lines else " (no conditions)"
855
+
856
+ final_output = (
857
  f"✅ Global Quality Score: {global_score:.4f} (0-1, higher=better)\n\n"
858
+ f"📊 Per-Condition Scores (0-1):\n{conditions_display}\n"
 
859
  )
860
+
861
+ if alignment_report:
862
+ final_output += alignment_report + "\n"
863
+
864
+ final_output += "Note: Metadata uses Top-k Recall, Caption/Lyrics use PMI"
865
+ return final_output
866
 
867
  except Exception as e:
868
  import traceback
 
870
  return error_msg
871
 
872
 
873
+ def calculate_score_handler_with_selection(
874
+ dit_handler,
875
+ llm_handler,
876
+ sample_idx,
877
+ score_scale,
878
+ current_batch_index,
879
+ batch_queue):
880
  """
881
  Calculate PMI-based quality score - REFACTORED to read from batch_queue only.
882
  This ensures scoring uses the actual generation parameters, not current UI values.
883
 
884
  Args:
885
+ dit_handler: DiT Handler
886
  llm_handler: LLM handler instance
887
  sample_idx: Which sample to score (1-8)
888
  score_scale: Sensitivity scale parameter (tool setting, can be from UI)
 
903
  time_signature = params.get("time_signature", "")
904
  audio_duration = params.get("audio_duration", -1)
905
  vocal_language = params.get("vocal_language", "")
906
+ inference_steps = params.get("inference_steps", 8)
907
 
908
  # Get LM metadata from batch_data (if it was saved during generation)
909
  lm_metadata = batch_data.get("lm_generated_metadata", None)
 
923
  else:
924
  # Single mode: all samples use same codes
925
  audio_codes_str = stored_codes if isinstance(stored_codes, str) else ""
926
+
927
+ # Extract Tensor Data for Alignment Score (Extra Outputs)
928
+ extra_tensor_data = None
929
+ extra_outputs = batch_data.get("extra_outputs", {})
930
+
931
+ # Only proceed if we have tensors and a valid index
932
+ if extra_outputs and dit_handler:
933
+ pred_latents = extra_outputs.get("pred_latents")
934
+ # Ensure we have the critical tensor to check batch size
935
+ if pred_latents is not None:
936
+ sample_idx_0based = sample_idx - 1
937
+ batch_size = pred_latents.shape[0]
938
+
939
+ if 0 <= sample_idx_0based < batch_size:
940
+ # Slice tensors for this specific sample (keep dimension [1, ...])
941
+ # We assume all stored tensors are aligned in batch dim 0
942
+ try:
943
+ extra_tensor_data = {
944
+ "pred_latent": pred_latents[sample_idx_0based:sample_idx_0based + 1],
945
+ "encoder_hidden_states": extra_outputs.get("encoder_hidden_states")[
946
+ sample_idx_0based:sample_idx_0based + 1],
947
+ "encoder_attention_mask": extra_outputs.get("encoder_attention_mask")[
948
+ sample_idx_0based:sample_idx_0based + 1],
949
+ "context_latents": extra_outputs.get("context_latents")[
950
+ sample_idx_0based:sample_idx_0based + 1],
951
+ "lyric_token_ids": extra_outputs.get("lyric_token_idss")[
952
+ sample_idx_0based:sample_idx_0based + 1]
953
+ }
954
+
955
+ # Verify no None values in the sliced dict
956
+ if any(v is None for v in extra_tensor_data.values()):
957
+ extra_tensor_data = None
958
+ except Exception as e:
959
+ print(f"Error slicing tensor data for score: {e}")
960
+ extra_tensor_data = None
961
+
962
  # Calculate score using historical parameters
963
  score_display = calculate_score_handler(
964
  llm_handler,
965
  audio_codes_str, caption, lyrics, lm_metadata,
966
  bpm, key_scale, time_signature, audio_duration, vocal_language,
967
+ score_scale,
968
+ dit_handler,
969
+ extra_tensor_data,
970
+ inference_steps,
971
  )
972
 
973
  # Update batch_queue with the calculated score
acestep/handler.py CHANGED
@@ -31,7 +31,7 @@ from acestep.constants import (
31
  SFT_GEN_PROMPT,
32
  DEFAULT_DIT_INSTRUCTION,
33
  )
34
- from acestep.dit_alignment_score import MusicStampsAligner
35
 
36
 
37
  warnings.filterwarnings("ignore")
@@ -2553,3 +2553,229 @@ class AceStepHandler:
2553
  "success": False,
2554
  "error": error_msg
2555
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  SFT_GEN_PROMPT,
32
  DEFAULT_DIT_INSTRUCTION,
33
  )
34
+ from acestep.dit_alignment_score import MusicStampsAligner, MusicLyricScorer
35
 
36
 
37
  warnings.filterwarnings("ignore")
 
2553
  "success": False,
2554
  "error": error_msg
2555
  }
2556
+
2557
+ @torch.no_grad()
2558
+ def get_lyric_score(
2559
+ self,
2560
+ pred_latent: torch.Tensor,
2561
+ encoder_hidden_states: torch.Tensor,
2562
+ encoder_attention_mask: torch.Tensor,
2563
+ context_latents: torch.Tensor,
2564
+ lyric_token_ids: torch.Tensor,
2565
+ vocal_language: str = "en",
2566
+ inference_steps: int = 8,
2567
+ seed: int = 42,
2568
+ custom_layers_config: Optional[Dict] = None,
2569
+ ) -> Dict[str, Any]:
2570
+ """
2571
+ Calculate both LM and DiT alignment scores in one pass.
2572
+
2573
+ - lm_score: Checks structural alignment using pure noise at t=1.0.
2574
+ - dit_score: Checks denoising alignment using regressed latents at t=1/steps.
2575
+
2576
+ Args:
2577
+ pred_latent: Generated latent tensor [batch, T, D]
2578
+ encoder_hidden_states: Cached encoder hidden states
2579
+ encoder_attention_mask: Cached encoder attention mask
2580
+ context_latents: Cached context latents
2581
+ lyric_token_ids: Tokenized lyrics tensor [batch, seq_len]
2582
+ vocal_language: Language code for lyrics header parsing
2583
+ inference_steps: Number of inference steps (for noise level calculation)
2584
+ seed: Random seed for noise generation
2585
+ custom_layers_config: Dict mapping layer indices to head indices
2586
+
2587
+ Returns:
2588
+ Dict containing:
2589
+ - lm_score: float
2590
+ - dit_score: float
2591
+ - success: Whether generation succeeded
2592
+ - error: Error message if failed
2593
+ """
2594
+ from transformers.cache_utils import EncoderDecoderCache, DynamicCache
2595
+
2596
+ if self.model is None:
2597
+ return {
2598
+ "lm_score": 0.0,
2599
+ "dit_score": 0.0,
2600
+ "success": False,
2601
+ "error": "Model not initialized"
2602
+ }
2603
+
2604
+ if custom_layers_config is None:
2605
+ custom_layers_config = self.custom_layers_config
2606
+
2607
+ try:
2608
+ # Move tensors to device
2609
+ device = self.device
2610
+ dtype = self.dtype
2611
+
2612
+ pred_latent = pred_latent.to(device=device, dtype=dtype)
2613
+ encoder_hidden_states = encoder_hidden_states.to(device=device, dtype=dtype)
2614
+ encoder_attention_mask = encoder_attention_mask.to(device=device, dtype=dtype)
2615
+ context_latents = context_latents.to(device=device, dtype=dtype)
2616
+
2617
+ bsz = pred_latent.shape[0]
2618
+
2619
+ if seed is None:
2620
+ x0 = torch.randn_like(pred_latent)
2621
+ else:
2622
+ generator = torch.Generator(device=device).manual_seed(int(seed))
2623
+ x0 = torch.randn(pred_latent.shape, generator=generator, device=device, dtype=dtype)
2624
+
2625
+ # --- Input A: LM Score ---
2626
+ # t = 1.0, xt = Pure Noise
2627
+ t_lm = torch.tensor([1.0] * bsz, device=device, dtype=dtype)
2628
+ xt_lm = x0
2629
+
2630
+ # --- Input B: DiT Score ---
2631
+ # t = 1.0/steps, xt = Regressed Latent
2632
+ t_last_val = 1.0 / inference_steps
2633
+ t_dit = torch.tensor([t_last_val] * bsz, device=device, dtype=dtype)
2634
+ # Flow Matching Regression: xt = t*x0 + (1-t)*x1
2635
+ xt_dit = t_last_val * x0 + (1.0 - t_last_val) * pred_latent
2636
+
2637
+ # Order: [Think_Batch, DiT_Batch]
2638
+ xt_in = torch.cat([xt_lm, xt_dit], dim=0)
2639
+ t_in = torch.cat([t_lm, t_dit], dim=0)
2640
+
2641
+ # Duplicate conditions
2642
+ encoder_hidden_states_in = torch.cat([encoder_hidden_states, encoder_hidden_states], dim=0)
2643
+ encoder_attention_mask_in = torch.cat([encoder_attention_mask, encoder_attention_mask], dim=0)
2644
+ context_latents_in = torch.cat([context_latents, context_latents], dim=0)
2645
+
2646
+ # Prepare Attention Mask
2647
+ latent_length = xt_in.shape[1]
2648
+ attention_mask_in = torch.ones(2 * bsz, latent_length, device=device, dtype=dtype)
2649
+ past_key_values = None
2650
+
2651
+ # Run decoder with output_attentions=True
2652
+ with self._load_model_context("model"):
2653
+ decoder = self.model.decoder
2654
+ if hasattr(decoder, 'eval'):
2655
+ decoder.eval()
2656
+
2657
+ decoder_outputs = decoder(
2658
+ hidden_states=xt_in,
2659
+ timestep=t_in,
2660
+ timestep_r=t_in,
2661
+ attention_mask=attention_mask_in,
2662
+ encoder_hidden_states=encoder_hidden_states_in,
2663
+ use_cache=False,
2664
+ past_key_values=past_key_values,
2665
+ encoder_attention_mask=encoder_attention_mask_in,
2666
+ context_latents=context_latents_in,
2667
+ output_attentions=True,
2668
+ custom_layers_config=custom_layers_config,
2669
+ enable_early_exit=True
2670
+ )
2671
+
2672
+ # Extract cross-attention matrices
2673
+ if decoder_outputs[2] is None:
2674
+ return {
2675
+ "lm_score": 0.0,
2676
+ "dit_score": 0.0,
2677
+ "success": False,
2678
+ "error": "Model did not return attentions"
2679
+ }
2680
+
2681
+ cross_attns = decoder_outputs[2] # Tuple of tensors (some may be None)
2682
+
2683
+ captured_layers_list = []
2684
+ for layer_attn in cross_attns:
2685
+ if layer_attn is None:
2686
+ continue
2687
+
2688
+ # Only take conditional part (first half of batch)
2689
+ layer_matrix = layer_attn.transpose(-1, -2)
2690
+ captured_layers_list.append(layer_matrix)
2691
+
2692
+ if not captured_layers_list:
2693
+ return {
2694
+ "lm_score": 0.0,
2695
+ "dit_score": 0.0,
2696
+ "success": False,
2697
+ "error": "No valid attention layers returned"
2698
+ }
2699
+
2700
+ stacked = torch.stack(captured_layers_list)
2701
+
2702
+ all_layers_matrix_lm = stacked[:, :bsz, ...]
2703
+ all_layers_matrix_dit = stacked[:, bsz:, ...]
2704
+
2705
+ if bsz == 1:
2706
+ all_layers_matrix_lm = all_layers_matrix_lm.squeeze(1)
2707
+ all_layers_matrix_dit = all_layers_matrix_dit.squeeze(1)
2708
+ else:
2709
+ pass
2710
+
2711
+ # Process lyric token IDs to extract pure lyrics
2712
+ if isinstance(lyric_token_ids, torch.Tensor):
2713
+ raw_lyric_ids = lyric_token_ids[0].tolist()
2714
+ else:
2715
+ raw_lyric_ids = lyric_token_ids
2716
+
2717
+ # Parse header to find lyrics start position
2718
+ header_str = f"# Languages\n{vocal_language}\n\n# Lyric\n"
2719
+ header_ids = self.text_tokenizer.encode(header_str, add_special_tokens=False)
2720
+ start_idx = len(header_ids)
2721
+
2722
+ # Find end of lyrics (before endoftext token)
2723
+ try:
2724
+ end_idx = raw_lyric_ids.index(151643) # <|endoftext|> token
2725
+ except ValueError:
2726
+ end_idx = len(raw_lyric_ids)
2727
+
2728
+ pure_lyric_ids = raw_lyric_ids[start_idx:end_idx]
2729
+ if start_idx >= all_layers_matrix_lm.shape[-2]: # Check text dim
2730
+ return {
2731
+ "lm_score": 0.0,
2732
+ "dit_score": 0.0,
2733
+ "success": False,
2734
+ "error": "Lyrics indices out of bounds"
2735
+ }
2736
+
2737
+ pure_matrix_lm = all_layers_matrix_lm[..., start_idx:end_idx, :]
2738
+ pure_matrix_dit = all_layers_matrix_dit[..., start_idx:end_idx, :]
2739
+
2740
+ # Create aligner and calculate alignment info
2741
+ aligner = MusicLyricScorer(self.text_tokenizer)
2742
+
2743
+ def calculate_single_score(matrix):
2744
+ """Helper to run aligner on a matrix"""
2745
+ info = aligner.lyrics_alignment_info(
2746
+ attention_matrix=matrix,
2747
+ token_ids=pure_lyric_ids,
2748
+ custom_config=custom_layers_config,
2749
+ return_matrices=False,
2750
+ medfilt_width=1,
2751
+ )
2752
+ if info.get("energy_matrix") is None:
2753
+ return 0.0
2754
+
2755
+ res = aligner.calculate_score(
2756
+ energy_matrix=info["energy_matrix"],
2757
+ type_mask=info["type_mask"],
2758
+ path_coords=info["path_coords"],
2759
+ )
2760
+ # Return the final score (check return key)
2761
+ return res.get("lyrics_score", res.get("final_score", 0.0))
2762
+
2763
+ lm_score = calculate_single_score(pure_matrix_lm)
2764
+ dit_score = calculate_single_score(pure_matrix_dit)
2765
+
2766
+ return {
2767
+ "lm_score": lm_score,
2768
+ "dit_score": dit_score,
2769
+ "success": True,
2770
+ "error": None
2771
+ }
2772
+
2773
+ except Exception as e:
2774
+ error_msg = f"Error generating score: {str(e)}"
2775
+ logger.exception("[get_lyric_score] Failed")
2776
+ return {
2777
+ "lm_score": 0.0,
2778
+ "dit_score": 0.0,
2779
+ "success": False,
2780
+ "error": error_msg
2781
+ }