Spaces:

ACE-Step
/

Ace-Step-v1.5

Running on Zero

App Files Files Community

ChuxiJ committed on Jan 8

Commit

0659e3b

1 Parent(s): a161649

support test time scaling & auto score & next batch

Browse files

Files changed (3) hide show

acestep/gradio_ui.py +0 -0
acestep/llm_inference.py +341 -4
acestep/test_time_scaling.py +274 -216

acestep/gradio_ui.py CHANGED Viewed

The diff for this file is too large to render. See raw diff

acestep/llm_inference.py CHANGED Viewed

@@ -337,6 +337,155 @@ class LLMHandler:
             output_text = str(outputs)
         return output_text
     def _run_pt_from_formatted(
         self,
@@ -573,6 +722,8 @@ class LLMHandler:
             use_cot_caption: Whether to generate caption in CoT (default True).
             use_cot_language: Whether to generate language in CoT (default True).
         """
         infer_type = (infer_type or "").strip().lower()
         if infer_type not in {"dit", "llm_dit"}:
             return {}, "", f"❌ invalid infer_type: {infer_type!r} (expected 'dit' or 'llm_dit')"
@@ -581,10 +732,15 @@ class LLMHandler:
         audio_codes = ""
         has_all_metas = self.has_all_metas(user_metadata)
         # ========== PHASE 1: CoT Generation ==========
         # Always generate CoT unless all metadata are user-provided
         if not has_all_metas or not is_format_caption:
             logger.info("Phase 1: Generating CoT metadata...")
             # Build formatted prompt for CoT phase
             formatted_prompt = self.build_formatted_prompt(caption, lyrics, generation_phase="cot")
@@ -615,12 +771,14 @@ class LLMHandler:
                 stop_at_reasoning=True,  # Always stop at </think> in Phase 1
             )
             if not cot_output_text:
                 return {}, "", status
             # Parse metadata from CoT output
             metadata, _ = self.parse_lm_output(cot_output_text)
-            logger.info(f"Phase 1 completed. Generated metadata: {list(metadata.keys())}")
         else:
             # Use user-provided metadata
             logger.info("Phase 1: Using user-provided metadata (skipping generation)")
@@ -628,11 +786,12 @@ class LLMHandler:
         # If infer_type is 'dit', stop here and return only metadata
         if infer_type == "dit":
-            status_msg = f"✅ Generated CoT metadata successfully\nFields: {', '.join(metadata.keys())}"
             return metadata, "", status_msg
         # ========== PHASE 2: Audio Codes Generation ==========
         logger.info("Phase 2: Generating audio codes...")
         # Format metadata as CoT using YAML (matching training format)
         cot_text = self._format_metadata_as_cot(metadata)
@@ -668,14 +827,192 @@ class LLMHandler:
         if not codes_output_text:
             return metadata, "", status
         # Parse audio codes from output (metadata should be same as Phase 1)
         _, audio_codes = self.parse_lm_output(codes_output_text)
         codes_count = len(audio_codes.split('<|audio_code_')) - 1 if audio_codes else 0
-        logger.info(f"Phase 2 completed. Generated {codes_count} audio codes")
-        status_msg = f"✅ Generated successfully (2-phase)\nPhase 1: CoT metadata\nPhase 2: {codes_count} audio codes"
         return metadata, audio_codes, status_msg
     def build_formatted_prompt(self, caption: str, lyrics: str = "", is_negative_prompt: bool = False, generation_phase: str = "cot", negative_prompt: str = "NO USER INPUT") -> str:
         """

             output_text = str(outputs)
         return output_text
def _run_vllm_batch(
    self,
    formatted_prompts: List[str],
    temperature: float,
    cfg_scale: float,
    negative_prompt: str,
    top_k: Optional[int],
    top_p: Optional[float],
    repetition_penalty: float,
    use_constrained_decoding: bool = True,
    constrained_decoding_debug: bool = False,
    target_duration: Optional[float] = None,
    generation_phase: str = "codes",
    caption: str = "",
    lyrics: str = "",
    cot_text: str = "",
    seeds: Optional[List[int]] = None,
) -> List[str]:
    """Generate one completion per prompt through the vllm backend.

    Args:
        formatted_prompts: Fully formatted prompts, one per batch item.
        temperature: Sampling temperature forwarded to ``SamplingParams``.
        cfg_scale: Guidance scale; values above 1.0 additionally pass one
            unconditional (negative) prompt per item to ``self.llm.generate``.
        negative_prompt: Text used to build the unconditional prompt.
        top_k: Forwarded to ``SamplingParams``.
        top_p: Forwarded to ``SamplingParams``.
        repetition_penalty: Forwarded to ``SamplingParams``.
        use_constrained_decoding: When true, ``self.constrained_processor`` is
            reset, reconfigured and installed as the logits processor.
        constrained_decoding_debug: Copied onto the processor's ``debug`` flag.
        target_duration: Passed to the processor's ``set_target_duration``.
        generation_phase: Passed to the processor's ``set_generation_phase``.
        caption: Used only to build the unconditional prompt.
        lyrics: Used only to build the unconditional prompt.
        cot_text: Used only to build the unconditional prompt.
        seeds: Accepted for signature parity with ``_run_pt_batch``; this
            method never reads it.

    Returns:
        One text string per element yielded by ``self.llm.generate``, in the
        same order.
    """
    from nanovllm import SamplingParams

    def _text_of(result) -> str:
        # Accept the different result shapes a backend may hand back.
        if hasattr(result, "outputs") and len(result.outputs) > 0:
            return result.outputs[0].text
        if hasattr(result, "text"):
            return result.text
        if isinstance(result, dict) and "text" in result:
            return result["text"]
        return str(result)

    # NOTE: a single processor instance is shared by every item in the batch.
    processor = None
    if use_constrained_decoding:
        processor = self.constrained_processor
        # Start from a clean state before applying this call's settings.
        processor.reset()
        processor.enabled = use_constrained_decoding
        processor.debug = constrained_decoding_debug
        processor.metadata_temperature = None
        processor.codes_temperature = None
        processor.set_target_duration(target_duration)
        processor.set_user_metadata(None)
        processor.set_stop_at_reasoning(False)
        processor.set_skip_genres(True)
        processor.set_skip_caption(True)
        processor.set_skip_language(True)
        processor.set_generation_phase(generation_phase)

    sampling_params = SamplingParams(
        max_tokens=self.max_model_len - 64,
        temperature=temperature,
        cfg_scale=cfg_scale,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        logits_processor=processor,
        logits_processor_update_state=processor.update_state if processor else None,
    )

    # Classifier-free guidance needs a matching unconditional prompt per item.
    extra_generate_args = {}
    if cfg_scale > 1.0:
        unconditional_prompt = self.build_formatted_prompt_with_cot(
            caption, lyrics, cot_text, is_negative_prompt=True, negative_prompt=negative_prompt
        )
        extra_generate_args["unconditional_prompts"] = (
            [unconditional_prompt] * len(formatted_prompts)
        )

    results = self.llm.generate(formatted_prompts, sampling_params, **extra_generate_args)
    return [_text_of(result) for result in results]
+    def _run_pt_batch(
+        self,
+        formatted_prompts: List[str],
+        temperature: float,
+        cfg_scale: float,
+        negative_prompt: str,
+        top_k: Optional[int],
+        top_p: Optional[float],
+        repetition_penalty: float,
+        use_constrained_decoding: bool = True,
+        constrained_decoding_debug: bool = False,
+        target_duration: Optional[float] = None,
+        generation_phase: str = "codes",
+        caption: str = "",
+        lyrics: str = "",
+        cot_text: str = "",
+        seeds: Optional[List[int]] = None,
+    ) -> List[str]:
+        """Batch generation using PyTorch backend"""
+        import random
+        batch_size = len(formatted_prompts)
+        output_texts = []
+        # Generate each item sequentially with different seeds
+        # (PyTorch backend doesn't support true batching efficiently)
+        for i, formatted_prompt in enumerate(formatted_prompts):
+            # Set seed for this item if provided
+            if seeds and i < len(seeds):
+                torch.manual_seed(seeds[i])
+                if torch.cuda.is_available():
+                    torch.cuda.manual_seed_all(seeds[i])
+            # Generate using single-item method
+            output_text = self._run_pt_from_formatted(
+                formatted_prompt=formatted_prompt,
+                temperature=temperature,
+                cfg_scale=cfg_scale,
+                negative_prompt=negative_prompt,
+                top_k=top_k,
+                top_p=top_p,
+                repetition_penalty=repetition_penalty,
+                use_constrained_decoding=use_constrained_decoding,
+                constrained_decoding_debug=constrained_decoding_debug,
+                target_duration=target_duration,
+                user_metadata=None,
+                stop_at_reasoning=False,
+                skip_genres=True,
+                skip_caption=True,
+                skip_language=True,
+                generation_phase=generation_phase,
+                caption=caption,
+                lyrics=lyrics,
+                cot_text=cot_text,
+            )
+            output_texts.append(output_text)
+        return output_texts
     def _run_pt_from_formatted(
         self,
             use_cot_caption: Whether to generate caption in CoT (default True).
             use_cot_language: Whether to generate language in CoT (default True).
         """
+        import time
         infer_type = (infer_type or "").strip().lower()
         if infer_type not in {"dit", "llm_dit"}:
             return {}, "", f"❌ invalid infer_type: {infer_type!r} (expected 'dit' or 'llm_dit')"
         audio_codes = ""
         has_all_metas = self.has_all_metas(user_metadata)
+        # Timing variables
+        phase1_time = 0.0
+        phase2_time = 0.0
         # ========== PHASE 1: CoT Generation ==========
         # Always generate CoT unless all metadata are user-provided
         if not has_all_metas or not is_format_caption:
             logger.info("Phase 1: Generating CoT metadata...")
+            phase1_start = time.time()
             # Build formatted prompt for CoT phase
             formatted_prompt = self.build_formatted_prompt(caption, lyrics, generation_phase="cot")
                 stop_at_reasoning=True,  # Always stop at </think> in Phase 1
             )
+            phase1_time = time.time() - phase1_start
             if not cot_output_text:
                 return {}, "", status
             # Parse metadata from CoT output
             metadata, _ = self.parse_lm_output(cot_output_text)
+            logger.info(f"Phase 1 completed in {phase1_time:.2f}s. Generated metadata: {list(metadata.keys())}")
         else:
             # Use user-provided metadata
             logger.info("Phase 1: Using user-provided metadata (skipping generation)")
         # If infer_type is 'dit', stop here and return only metadata
         if infer_type == "dit":
+            status_msg = f"✅ Generated CoT metadata successfully\nFields: {', '.join(metadata.keys())}\nPhase1: {phase1_time:.2f}s"
             return metadata, "", status_msg
         # ========== PHASE 2: Audio Codes Generation ==========
         logger.info("Phase 2: Generating audio codes...")
+        phase2_start = time.time()
         # Format metadata as CoT using YAML (matching training format)
         cot_text = self._format_metadata_as_cot(metadata)
         if not codes_output_text:
             return metadata, "", status
+        phase2_time = time.time() - phase2_start
         # Parse audio codes from output (metadata should be same as Phase 1)
         _, audio_codes = self.parse_lm_output(codes_output_text)
         codes_count = len(audio_codes.split('<|audio_code_')) - 1 if audio_codes else 0
+        logger.info(f"Phase 2 completed in {phase2_time:.2f}s. Generated {codes_count} audio codes")
+        status_msg = f"✅ Generated successfully (2-phase)\nPhase 1: CoT metadata\nPhase 2: {codes_count} audio codes\nPhase1: {phase1_time:.2f}s, Phase2: {phase2_time:.2f}s"
         return metadata, audio_codes, status_msg
def generate_with_stop_condition_batch(
    self,
    caption: str,
    lyrics: str,
    batch_size: int,
    infer_type: str = "llm_dit",
    temperature: float = 0.85,
    cfg_scale: float = 1.0,
    negative_prompt: str = "NO USER INPUT",
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    repetition_penalty: float = 1.0,
    use_constrained_decoding: bool = True,
    constrained_decoding_debug: bool = False,
    target_duration: Optional[float] = None,
    user_metadata: Optional[Dict[str, Optional[str]]] = None,
    use_cot_caption: bool = True,
    use_cot_language: bool = True,
    is_format_caption: bool = False,
    seeds: Optional[List[int]] = None,
) -> Tuple[List[Dict[str, Any]], List[str], str]:
    """Batch version of ``generate_with_stop_condition``.

    CoT metadata is produced once (Phase 1) and shared by every item; audio
    codes are then generated ``batch_size`` times from the same prompt
    (Phase 2).

    Args:
        caption: Caption shared by all items.
        lyrics: Lyrics shared by all items.
        batch_size: Number of items to generate; must be at least 1.
        infer_type: ``"dit"`` stops after Phase 1, ``"llm_dit"`` also runs
            Phase 2. Case and surrounding whitespace are ignored.
        seeds: Optional per-item seeds. The list is truncated to
            ``batch_size`` or padded with random 32-bit seeds, then handed
            to the backend batch method.
        temperature, cfg_scale, negative_prompt, top_k, top_p,
        repetition_penalty, use_constrained_decoding,
        constrained_decoding_debug, target_duration: Forwarded to both the
            Phase 1 call and the backend batch method.
        user_metadata, use_cot_caption, use_cot_language, is_format_caption:
            Forwarded to the Phase 1 call. Phase 1 is skipped only when
            ``self.has_all_metas(user_metadata)`` is true and
            ``is_format_caption`` is set; the non-``None`` user values are
            then used as the metadata.

    Returns:
        Tuple ``(metadata_list, audio_codes_list, status_message)``:
        - metadata_list: one copy of the shared metadata dict per item.
        - audio_codes_list: one audio code string per item (empty strings
          when ``infer_type`` is ``"dit"``).
        - status_message: human-readable status.
        Both lists are empty when validation, Phase 1 or Phase 2 fails.
    """
    import random
    import time

    infer_type = (infer_type or "").strip().lower()
    if infer_type not in {"dit", "llm_dit"}:
        return [], [], f"❌ invalid infer_type: {infer_type!r} (expected 'dit' or 'llm_dit')"
    # Reject empty batches up front so no LLM call is spent on them.
    if batch_size < 1:
        return [], [], f"❌ invalid batch_size: {batch_size!r} (expected a positive integer)"

    # Normalize seeds to exactly one per item: truncate extras, pad with random ones.
    seeds = list(seeds[:batch_size]) if seeds is not None else []
    seeds.extend(random.randint(0, 2**32 - 1) for _ in range(batch_size - len(seeds)))

    # Stays 0.0 when Phase 1 is skipped.
    phase1_time = 0.0

    # ========== PHASE 1: CoT Generation (ONCE for all items) ==========
    has_all_metas = self.has_all_metas(user_metadata)
    if not has_all_metas or not is_format_caption:
        logger.info("Batch Phase 1: Generating CoT metadata (once for all items)...")
        phase1_start = time.time()
        metadata, _, status = self.generate_with_stop_condition(
            caption=caption,
            lyrics=lyrics,
            infer_type="dit",  # Only generate metadata
            temperature=temperature,
            cfg_scale=cfg_scale,
            negative_prompt=negative_prompt,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            use_constrained_decoding=use_constrained_decoding,
            constrained_decoding_debug=constrained_decoding_debug,
            target_duration=target_duration,
            user_metadata=user_metadata,
            use_cot_caption=use_cot_caption,
            use_cot_language=use_cot_language,
            is_format_caption=is_format_caption,
        )
        phase1_time = time.time() - phase1_start
        if not metadata:
            return [], [], status
        logger.info(f"Batch Phase 1 completed in {phase1_time:.2f}s. Generated metadata: {list(metadata.keys())}")
    else:
        logger.info("Batch Phase 1: Using user-provided metadata (skipping generation)")
        metadata = {k: v for k, v in user_metadata.items() if v is not None}

    # If infer_type is 'dit', stop here and return only metadata
    if infer_type == "dit":
        metadata_list = [metadata.copy() for _ in range(batch_size)]
        status_msg = f"✅ Generated CoT metadata successfully (batch mode)\nFields: {', '.join(metadata.keys())}\nPhase1: {phase1_time:.2f}s"
        return metadata_list, [""] * batch_size, status_msg

    # ========== PHASE 2: Audio Codes Generation (BATCH) ==========
    logger.info(f"Batch Phase 2: Generating audio codes for {batch_size} items...")
    phase2_start = time.time()

    cot_text = self._format_metadata_as_cot(metadata)
    formatted_prompt = self.build_formatted_prompt_with_cot(caption, lyrics, cot_text)
    # Every item uses the same prompt; only the seeds differ.
    formatted_prompts = [formatted_prompt] * batch_size

    try:
        # Both backend methods take the same keyword arguments.
        run_batch = self._run_vllm_batch if self.llm_backend == "vllm" else self._run_pt_batch
        codes_outputs = run_batch(
            formatted_prompts=formatted_prompts,
            temperature=temperature,
            cfg_scale=cfg_scale,
            negative_prompt=negative_prompt,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            use_constrained_decoding=use_constrained_decoding,
            constrained_decoding_debug=constrained_decoding_debug,
            target_duration=target_duration,
            generation_phase="codes",
            caption=caption,
            lyrics=lyrics,
            cot_text=cot_text,
            seeds=seeds,
        )
    except Exception as e:
        error_msg = f"❌ Error in batch codes generation: {str(e)}"
        # logger.exception keeps the traceback, which logger.error discarded.
        logger.exception(error_msg)
        return [], [], error_msg

    # Only the audio codes are taken from each output; metadata is shared.
    audio_codes_list = [self.parse_lm_output(output_text)[1] for output_text in codes_outputs]
    metadata_list = [metadata.copy() for _ in audio_codes_list]
    phase2_time = time.time() - phase2_start

    codes_counts = [len(codes.split('<|audio_code_')) - 1 if codes else 0 for codes in audio_codes_list]
    logger.info(f"Batch Phase 2 completed in {phase2_time:.2f}s. Generated codes: {codes_counts}")
    status_msg = f"✅ Batch generation completed ({batch_size} items)\nPhase 1: CoT metadata\nPhase 2: {sum(codes_counts)} total codes ({codes_counts})\nPhase1: {phase1_time:.2f}s, Phase2: {phase2_time:.2f}s"
    return metadata_list, audio_codes_list, status_msg
     def build_formatted_prompt(self, caption: str, lyrics: str = "", is_negative_prompt: bool = False, generation_phase: str = "cot", negative_prompt: str = "NO USER INPUT") -> str:
         """

acestep/test_time_scaling.py CHANGED Viewed

@@ -4,258 +4,316 @@ Implements perplexity-based scoring for generated audio codes
 """
 import torch
 import torch.nn.functional as F
-from typing import Tuple, Optional, Dict, Any
 from loguru import logger
 import yaml
-def perplexity_to_score(perplexity: float, scale: float = 100.0) -> float:
     """
-    Convert perplexity to a normalized score in [0, 1] range.
-    Lower perplexity = higher score (better quality)
-    Uses exponential decay: score = exp(-perplexity / scale)
     Args:
-        perplexity: Perplexity value (typically 1 to 1000+)
-        scale: Scale parameter to control score distribution (default 100.0)
-               - Smaller scale: more sensitive to perplexity changes
-               - Larger scale: less sensitive to perplexity changes
     Returns:
-        Score in [0, 1] range, where 1 is perfect and 0 is worst
-    Examples:
-        perplexity=1   → score≈0.99  (excellent)
-        perplexity=50  → score≈0.61  (good if scale=100)
-        perplexity=100 → score≈0.37  (medium if scale=100)
-        perplexity=500 → score≈0.01  (poor if scale=100)
     """
-    import math
-    return math.exp(-perplexity / scale)
-def calculate_perplexity(
-    llm_handler,
-    audio_codes: str,
-    caption: str = "",
-    lyrics: str = "",
-    metadata: Optional[Dict[str, Any]] = None,
-    temperature: float = 1.0,
-) -> Tuple[float, str]:
     """
-    Calculate perplexity of generated audio codes conditioned on caption/lyrics/metadata.
-    This reverses the generation task: given audio codes as input, measure how well
-    the model can predict the CoT metadata and lyrics that should generate those codes.
-    Lower perplexity = model is less surprised = better quality generation
-    Score = -perplexity (higher is better)
-    The understanding task format is:
-    Input: <|audio_code_123|><|audio_code_456|>...
-    Output: <think>\nmetadata_yaml\n</think>\n\n# Lyric\nlyrics_text
     Args:
-        llm_handler: LLM handler instance with initialized model
-        audio_codes: Generated audio code string (e.g., "<|audio_code_123|><|audio_code_456|>...")
-        caption: Caption text used for generation
-        lyrics: Lyrics text used for generation
-        metadata: Dictionary with CoT metadata fields (bpm, duration, keyscale, language, timesignature, etc.)
-        temperature: Temperature for probability scaling (default 1.0)
     Returns:
-        Tuple of (perplexity_value, status_message)
-    Example:
-        metadata = {'bpm': 120, 'duration': 30, 'keyscale': 'C major', 'language': 'en', 'timesignature': '4'}
-        perplexity, status = calculate_perplexity(
-            llm_handler,
-            audio_codes="<|audio_code_123|>...",
-            caption="calm piano",
-            lyrics="verse 1...",
-            metadata=metadata
-        )
-        score = -perplexity  # Higher score = better quality
     """
-    if not llm_handler.llm_initialized:
-        return float('inf'), "❌ LLM not initialized"
-    if not audio_codes or not audio_codes.strip():
-        return float('inf'), "❌ No audio codes provided"
-    try:
-        # Build the understanding prompt: codes as input
-        # The model should generate: <think>metadata</think>\n# Lyric\n...
-        formatted_prompt = llm_handler.build_formatted_prompt_for_understanding(
-            audio_codes=audio_codes,
-            is_negative_prompt=False
-        )
-        logger.info(f"Calculating perplexity for {len(audio_codes)} character audio codes")
-        # Build the expected output (target sequence) following understanding task format
-        # Format: <think>\nmetadata_yaml\n</think>\n\n# Lyric\nlyrics_text
-        target_parts = []
-        # Build CoT section with metadata
-        if metadata and isinstance(metadata, dict):
-            # Filter out None values and format as YAML (sorted keys)
-            cot_items = {}
-            for key in ['bpm', 'caption', 'duration', 'genres', 'keyscale', 'language', 'timesignature']:
-                if key in metadata and metadata[key] is not None:
-                    cot_items[key] = metadata[key]
-            if cot_items:
-                cot_yaml = yaml.dump(cot_items, allow_unicode=True, sort_keys=True).strip()
-                target_parts.append(f"<think>\n{cot_yaml}\n</think>\n")
-        # Add Lyric section (note: understanding task uses "# Lyric" not "# Caption")
-        if lyrics:
-            target_parts.append(f"\n# Lyric\n{lyrics}\n")
-        target_text = "".join(target_parts)
-        if not target_text.strip():
-            return float('inf'), "❌ No target text to evaluate (lyrics or metadata required)"
-        logger.debug(f"Target text (first 200 chars): {target_text[:200]}...")
-        # Calculate perplexity using appropriate backend
-        if llm_handler.llm_backend == "vllm":
-            perplexity = _calculate_perplexity_vllm(
-                llm_handler,
-                formatted_prompt,
-                target_text,
-                temperature
-            )
-        else:  # pt backend
-            perplexity = _calculate_perplexity_pt(
-                llm_handler,
-                formatted_prompt,
-                target_text,
-                temperature
-            )
-        status_msg = f"✅ Perplexity calculated: {perplexity:.4f}"
-        logger.info(status_msg)
-        return perplexity, status_msg
-    except Exception as e:
-        error_msg = f"❌ Error calculating perplexity: {str(e)}"
-        logger.error(error_msg)
-        import traceback
-        logger.error(traceback.format_exc())
-        return float('inf'), error_msg
-def _calculate_perplexity_pt(
-    llm_handler,
-    formatted_prompt: str,
-    target_text: str,
-    temperature: float
-) -> float:
     """
-    Calculate perplexity using PyTorch backend.
-    For vllm backend, this uses a shared-weight HuggingFace model.
-    For pt backend, this uses the original model.
     Args:
-        llm_handler: LLM handler with pt or vllm backend
-        formatted_prompt: Formatted input prompt (audio codes)
-        target_text: Expected output text (CoT metadata + lyrics)
-        temperature: Temperature for probability scaling
     Returns:
-        Perplexity value
     """
-    # Get model for scoring (handles both pt and vllm backends)
     model = llm_handler.get_hf_model_for_scoring()
     tokenizer = llm_handler.llm_tokenizer
     device = llm_handler.device if llm_handler.llm_backend == "pt" else next(model.parameters()).device
-    # Tokenize prompt and target separately
-    prompt_tokens = tokenizer(
-        formatted_prompt,
-        return_tensors="pt",
-        padding=False,
-        truncation=True,
-    )
-    target_tokens = tokenizer(
-        target_text,
-        return_tensors="pt",
-        padding=False,
-        truncation=True,
-    )
-    # Concatenate prompt + target for full sequence
-    full_input_ids = torch.cat([
-        prompt_tokens['input_ids'],
-        target_tokens['input_ids']
-    ], dim=1).to(device)
-    # Create attention mask
-    attention_mask = torch.ones_like(full_input_ids)
-    # Forward pass to get logits
     with torch.no_grad():
         with llm_handler._load_model_context():
-            outputs = model(
-                input_ids=full_input_ids,
-                attention_mask=attention_mask
-            )
-            logits = outputs.logits  # [batch_size, seq_len, vocab_size]
-    # Get the logits for predicting target tokens
-    # Shift logits and labels: logits[i] predicts token[i+1]
-    prompt_len = prompt_tokens['input_ids'].shape[1]
-    target_len = target_tokens['input_ids'].shape[1]
-    # Extract logits for positions that predict target tokens
-    # logits at positions [prompt_len-1 : prompt_len+target_len-1] predict target tokens
-    pred_logits = logits[0, prompt_len-1:prompt_len+target_len-1, :]  # [target_len, vocab_size]
-    target_ids = target_tokens['input_ids'][0]  # [target_len]
-    # Apply temperature scaling
-    if temperature != 1.0:
-        pred_logits = pred_logits / temperature
-    # Calculate cross-entropy loss for each position
     log_probs = F.log_softmax(pred_logits, dim=-1)  # [target_len, vocab_size]
-    # Gather log probabilities of target tokens
-    target_log_probs = log_probs[torch.arange(target_len), target_ids]  # [target_len]
-    # Calculate perplexity: exp(-mean(log_probs))
-    mean_neg_log_prob = -target_log_probs.mean()
-    perplexity = torch.exp(mean_neg_log_prob).item()
-    return perplexity
-def _calculate_perplexity_vllm(
     llm_handler,
-    formatted_prompt: str,
-    target_text: str,
-    temperature: float
-) -> float:
     """
-    Calculate perplexity using vllm backend.
-    Uses shared-weight HuggingFace model for perplexity calculation.
-    This avoids the complexity of nanovllm's context management.
-    Args:
-        llm_handler: LLM handler with vllm backend
-        formatted_prompt: Formatted input prompt (audio codes)
-        target_text: Expected output text (CoT metadata + lyrics)
-        temperature: Temperature for probability scaling
-    Returns:
-        Perplexity value
     """
-    logger.debug("Using vllm backend with shared-weight HuggingFace model for perplexity")
-    # Delegate to pt backend implementation which now handles both backends
-    return _calculate_perplexity_pt(llm_handler, formatted_prompt, target_text, temperature)

 """
 import torch
 import torch.nn.functional as F
+from typing import Tuple, Optional, Dict, Any, List
 from loguru import logger
 import yaml
+import math
+import re
def pmi_score(log_prob_conditional: float, log_prob_unconditional: float) -> float:
    """Return the Pointwise Mutual Information (PMI) of a condition and codes.

    PMI = log P(condition|codes) - log P(condition)
        = log [P(codes|condition) / P(codes)]

    Subtracting the unconditional term removes the bias of P(condition), so
    the result reflects how much the codes help predict the condition.

    Args:
        log_prob_conditional: Average log probability of the condition given the codes.
        log_prob_unconditional: Average log probability of the condition without the codes.

    Returns:
        The PMI value; higher is better and the sign is meaningful:
        - Positive: the codes improve the prediction (good match).
        - Zero: the codes make no difference (no correlation).
        - Negative: the codes hurt the prediction (poor match).
    """
    gain_from_codes = log_prob_conditional - log_prob_unconditional
    return gain_from_codes
def pmi_to_normalized_score(pmi: float, scale: float = 0.1) -> float:
    """Map a PMI score into the [0, 1] range with a sigmoid.

    score = sigmoid(PMI / scale) = 1 / (1 + exp(-PMI / scale))

    The sigmoid is evaluated in a numerically stable form so that strongly
    negative ``pmi / scale`` values saturate to 0.0 instead of raising
    ``OverflowError`` from ``math.exp``.

    Args:
        pmi: PMI score (can be positive or negative).
        scale: Sensitivity control (default 0.1).
               - Smaller scale: more sensitive to PMI changes.
               - Larger scale: less sensitive to PMI changes.

    Returns:
        Normalized score in [0, 1], where:
        - PMI > 0 → score > 0.5 (good match)
        - PMI = 0 → score = 0.5 (neutral)
        - PMI < 0 → score < 0.5 (poor match)

    Raises:
        ZeroDivisionError: If ``scale`` is 0.

    Examples (computed with scale=1.0, not the default):
        PMI=2.0  → score≈0.88  (excellent)
        PMI=1.0  → score≈0.73  (good)
        PMI=0.0  → score=0.50  (neutral)
        PMI=-1.0 → score≈0.27  (poor)
        PMI=-2.0 → score≈0.12  (bad)
    """
    z = pmi / scale
    if z >= 0:
        # exp(-z) is at most 1 here, so it cannot overflow.
        return 1.0 / (1.0 + math.exp(-z))
    # For negative z use the equivalent form exp(z) / (1 + exp(z)); exp(z) < 1
    # and simply underflows to 0.0 for very negative z.
    exp_z = math.exp(z)
    return exp_z / (1.0 + exp_z)
+def _get_logits_and_target_for_scoring(llm_handler, formatted_prompt: str,
+                                       target_text: str) -> Tuple[torch.Tensor, torch.Tensor]:
     """
     Args:
+        llm_handler: The handler containing the model and tokenizer.
+        formatted_prompt: The input context.
+        target_text: The text we want to calculate probability/recall for.
     Returns:
+        Tuple of (target_logits, target_ids)
+        - target_logits: Logits used to predict the target tokens.
+        - target_ids: The ground truth token IDs of the target.
     """
     model = llm_handler.get_hf_model_for_scoring()
     tokenizer = llm_handler.llm_tokenizer
     device = llm_handler.device if llm_handler.llm_backend == "pt" else next(model.parameters()).device
+    # 1. Tokenize prompt ONLY to get its length (used for slicing later).
+    #    We must ensure special tokens are added to count the offset correctly.
+    prompt_tokens_temp = tokenizer(formatted_prompt, return_tensors="pt", add_special_tokens=True)
+    prompt_len = prompt_tokens_temp['input_ids'].shape[1]
+    # 2. Tokenize the FULL text (Prompt + Target).
+    #    This ensures subword merging at boundaries is handled correctly by the tokenizer.
+    full_text = formatted_prompt + target_text
+    full_tokens = tokenizer(full_text, return_tensors="pt", padding=False, truncation=True, add_special_tokens=True).to(device)
+    input_ids = full_tokens['input_ids']
+    # Safety check: if target was empty or truncated entirely
+    if input_ids.shape[1] <= prompt_len:
+        return torch.empty(0, device=device), torch.empty(0, device=device)
+    # 3. Forward Pass (Teacher Forcing)
     with torch.no_grad():
         with llm_handler._load_model_context():
+            outputs = model(input_ids=input_ids, attention_mask=full_tokens['attention_mask'])
+            all_logits = outputs.logits  # [1, seq_len, vocab_size]
+    # 4. Extract Logits and Labels
+    #    We need to predict `input_ids[i]`. The logit for this is at `all_logits[i-1]`.
+    #    Target starts at index `prompt_len`.
+    #    So we need logits from `prompt_len - 1` up to the second to last position.
+    target_logits = all_logits[0, prompt_len - 1:-1, :]  # [target_len, vocab_size]
+    target_ids = input_ids[0, prompt_len:]  # [target_len]
+    return target_logits, target_ids
+# ==============================================================================
+# Scoring Logic
+# ==============================================================================
def _calculate_topk_recall(llm_handler,
                           formatted_prompt: str,
                           target_text: str,
                           topk: int = 10) -> Tuple[float, Dict[int, float]]:
    """
    Calculate top-k recall for target text given prompt.

    For every target position, checks whether the ground truth token is among
    the model's top-k predictions and, if so, at which rank.

    Args:
        llm_handler: The handler containing the model and tokenizer.
        formatted_prompt: The input context.
        target_text: The text whose tokens are scored.
        topk: Number of top predictions considered per position.

    Returns:
        Tuple of (position_weighted_score, recall_per_k)
        - position_weighted_score: Mean over target positions of
          ``1 - (rank - 1) / topk`` (rank 1 scores 1.0; positions whose ground
          truth is outside the top-k score 0.0).
        - recall_per_k: ``{k: fraction of positions whose ground truth is in
          the top-k}`` for ``k = 1..topk``.
        Returns ``(0.0, {})`` when there are no target tokens to score.
    """
    # Aligned logits/labels for the target span.
    pred_logits, target_ids = _get_logits_and_target_for_scoring(llm_handler, formatted_prompt, target_text)
    target_len = target_ids.shape[0]
    if target_len == 0:
        return 0.0, {}

    # topk_indices: [target_len, k_eff], sorted by descending logit.
    # k is clamped to the vocabulary size so torch.topk cannot fail on small vocabs.
    _, topk_indices = torch.topk(pred_logits, k=min(topk, pred_logits.shape[-1]), dim=-1)

    # Plain Python lists are faster to iterate than tensors on the CPU.
    target_ids_list = target_ids.tolist()
    topk_indices_list = topk_indices.tolist()

    # Find each ground-truth token's 1-based rank exactly once. Everything
    # else (recall@k for every k, and the weighted score) derives from it,
    # instead of re-slicing and re-searching the candidate list for each k.
    hits_by_rank = [0] * (max(topk, 0) + 1)  # hits_by_rank[r] = positions hit at rank r
    position_scores = []
    for gt_token, candidates in zip(target_ids_list, topk_indices_list):
        try:
            rank = candidates.index(gt_token) + 1
        except ValueError:
            # Ground truth not in the top-k at this position.
            position_scores.append(0.0)
            continue
        hits_by_rank[rank] += 1
        # Rank 1 = 1.0, rank topk = small positive.
        position_scores.append(1.0 - (rank - 1) / topk)

    # recall@k is the cumulative share of positions hit at rank <= k.
    recall_per_k = {}
    cumulative_hits = 0
    for k in range(1, topk + 1):
        cumulative_hits += hits_by_rank[k]
        recall_per_k[k] = cumulative_hits / target_len

    average_recall = sum(position_scores) / len(position_scores) if position_scores else 0.0
    return average_recall, recall_per_k
def _calculate_metadata_recall(llm_handler,
                               formatted_prompt: str,
                               fields_dict: Dict[str, Any],
                               topk: int = 10) -> Dict[str, float]:
    """
    Score each metadata field independently with top-k recall.

    Every field is rendered as its own one-entry YAML document wrapped in a
    ``<think>`` block and scored against the prompt.

    Args:
        llm_handler: The handler containing the model and tokenizer.
        formatted_prompt: The input context the fields are predicted from.
        fields_dict: Dictionary of {field_name: field_value}
        topk: Number of top predictions considered per token position.

    Returns:
        Dictionary of {field_name: recall score}; empty when ``fields_dict``
        is empty.
    """
    per_field = {}
    if not fields_dict:
        return per_field

    # Fields are visited in sorted name order.
    for name in sorted(fields_dict):
        # Target for this single field, e.g. <think>\nbpm: 120\n</think>\n
        single_field = {name: fields_dict[name]}
        rendered = yaml.dump(single_field, allow_unicode=True, sort_keys=True).strip()
        target = f"<think>\n{rendered}\n</think>\n"

        # Only the averaged score is kept; the per-k breakdown is discarded.
        recall, _ = _calculate_topk_recall(llm_handler, formatted_prompt, target, topk=topk)
        per_field[name] = recall
        logger.debug(f"Recall for {name}: {recall:.4f}")

    return per_field
def _calculate_log_prob(
        llm_handler,
        formatted_prompt: str,
        target_text: str,
        temperature: float = 1.0  # Kept for API compatibility, but ignored for scoring
) -> float:
    """
    Calculate average log probability of target text given prompt.

    Args:
        llm_handler: The handler containing the model and tokenizer.
        formatted_prompt: The input context.
        target_text: The text whose tokens are scored.
        temperature: Unused. Accepted only so existing callers keep working;
            log probabilities used for PMI must not be temperature-scaled.

    Returns:
        Mean per-token log probability of the target tokens, or ``-inf`` when
        there are no target tokens to score.
    """
    pred_logits, target_ids = _get_logits_and_target_for_scoring(llm_handler, formatted_prompt, target_text)
    if target_ids.shape[0] == 0:
        return float('-inf')

    # Do not divide by temperature: log-probability for PMI must be exact.
    # The softmax is computed in float32 regardless of the logits' dtype so
    # that reduced-precision logits do not round away the small differences
    # PMI relies on.
    log_probs = F.log_softmax(pred_logits, dim=-1, dtype=torch.float32)  # [target_len, vocab_size]

    # Pick the log probability of each ground truth token. gather keeps the
    # index on the same device as the logits.
    target_log_probs = log_probs.gather(-1, target_ids.unsqueeze(-1)).squeeze(-1)

    return target_log_probs.mean().item()
+# ==============================================================================
+# Main Public API
+# ==============================================================================
def _normalized_pmi_for_target(llm_handler, prompt_cond: str, prompt_uncond: str,
                               target_text: str, scale: float) -> float:
    """Return the sigmoid-normalized PMI of `target_text` between the conditional and unconditional prompts."""
    log_prob_cond = _calculate_log_prob(llm_handler, prompt_cond, target_text)
    log_prob_uncond = _calculate_log_prob(llm_handler, prompt_uncond, target_text)
    return pmi_to_normalized_score(pmi_score(log_prob_cond, log_prob_uncond), scale=scale)


def calculate_pmi_score_per_condition(
    llm_handler,
    audio_codes: str,
    caption: str = "",
    lyrics: str = "",
    metadata: Optional[Dict[str, Any]] = None,
    temperature: float = 1.0,
    topk: int = 10,
    score_scale: float = 0.1,
) -> Tuple[Dict[str, float], float, str]:
    """
    Calculate quality score separately for each condition.
    - Metadata: Uses Top-k Recall.
    - Caption/Lyrics: Uses PMI (Normalized).

    Args:
        llm_handler: The handler containing the model and tokenizer.
        audio_codes: Audio codes the conditions are scored against.
        caption: Caption text; used when ``metadata`` has no ``"caption"`` key.
        lyrics: Lyrics text; skipped when empty.
        metadata: Optional metadata fields. ``None`` (or a non-dict) is treated
            as empty. The caller's dictionary is never modified.
        temperature: Unused; accepted for API compatibility.
        topk: Number of top predictions considered for the recall metric.
        score_scale: Scale passed to the PMI sigmoid normalization.

    Returns:
        Tuple of (scores, global_score, status)
        - scores: ``{condition_name: score in [0, 1]}``.
        - global_score: Unweighted mean of all per-condition scores.
        - status: Human-readable report, or an error message.
        On a validation failure returns ``({}, 0.0, message)``; if scoring
        raises, returns ``({}, -inf, message)``.
    """
    if not llm_handler.llm_initialized:
        return {}, 0.0, "❌ LLM not initialized"
    if not audio_codes or not audio_codes.strip():
        return {}, 0.0, "❌ No audio codes provided"

    # Work on a private copy: tolerates metadata=None (the declared default)
    # and avoids writing the injected caption back into the caller's dict.
    metadata = dict(metadata) if isinstance(metadata, dict) else {}
    if "caption" not in metadata:
        metadata['caption'] = caption

    # Which fields use which metric. Bound up front so the status report
    # below can always consult them.
    metadata_recall_keys = ['bpm', 'duration', 'genres', 'keyscale', 'language', 'timesignature']
    metadata_pmi_keys = ['caption']
    scores: Dict[str, float] = {}

    formatted_prompt = llm_handler.build_formatted_prompt_for_understanding(audio_codes=audio_codes, is_negative_prompt=False)
    # The unconditional baseline replaces the codes with a fixed placeholder;
    # it is built once and shared by the caption and lyrics PMI.
    prompt_uncond = llm_handler.build_formatted_prompt_for_understanding(audio_codes="NO USER INPUT", is_negative_prompt=False)
    try:
        # 1. Calculate Recall for Metadata Fields
        for key in metadata_recall_keys:
            if metadata.get(key) is not None:
                field_scores = _calculate_metadata_recall(
                    llm_handler, formatted_prompt, {key: metadata[key]}, topk=topk)
                scores.update(field_scores)

        # 2. Calculate PMI for Caption
        for key in metadata_pmi_keys:
            if metadata.get(key) is not None:
                cot_yaml = yaml.dump({key: metadata[key]}, allow_unicode=True, sort_keys=True).strip()
                target_text = f"<think>\n{cot_yaml}\n</think>\n"
                scores[key] = _normalized_pmi_for_target(
                    llm_handler, formatted_prompt, prompt_uncond, target_text, score_scale)

        # 3. Calculate PMI for Lyrics
        if lyrics:
            target_text = f"<think>\n</think>\n# Lyric\n{lyrics}\n"
            scores['lyrics'] = _normalized_pmi_for_target(
                llm_handler, formatted_prompt, prompt_uncond, target_text, score_scale)

        if not scores:
            return {}, 0.0, "❌ No conditions to evaluate"

        # 4. Global Score
        global_score = sum(scores.values()) / len(scores)

        # Status Message
        status_lines = ["✅ Per-condition scores (0-1):"]
        for key, score in sorted(scores.items()):
            metric = "Top-k Recall" if key in metadata_recall_keys else "PMI (Norm)"
            status_lines.append(f"  {key}: {score:.4f} ({metric})")
        status_lines.append(f"Global score: {global_score:.4f}")
        logger.info(f"Calculated scores: {global_score:.4f}")
        return scores, global_score, "\n".join(status_lines)
    except Exception as e:
        # Boundary handler: scoring failures are reported through the status
        # string (with -inf as the score) rather than propagated to the caller.
        import traceback
        error_msg = f"❌ Error: {str(e)}"
        logger.error(error_msg)
        logger.error(traceback.format_exc())
        return {}, float('-inf'), error_msg