Spaces:

ACE-Step
/

Ace-Step-v1.5

Running on Zero

App Files Files Community

ChuxiJ committed on Jan 26

Commit

9fe9970

1 Parent(s): 0b990cd

refact ui (fix repaint/cover score bug)

Browse files

Files changed (4) hide show

Dockerfile +1 -1
acestep/gradio_ui/events/__init__.py +3 -0
acestep/gradio_ui/events/generation_handlers.py +9 -0
acestep/gradio_ui/events/results_handlers.py +103 -72

Dockerfile CHANGED Viewed

@@ -44,7 +44,7 @@ USER user
 RUN pip install --no-cache-dir --user -r requirements.txt
 # Install nano-vllm with --no-deps since all dependencies are already installed
-RUN pip install ./acestep/third_parts/nano-vllm
 # Copy the rest of the application
 COPY --chown=user:user . .

 RUN pip install --no-cache-dir --user -r requirements.txt
 # Install nano-vllm with --no-deps since all dependencies are already installed
+RUN pip install --no-deps ./acestep/third_parts/nano-vllm
 # Copy the rest of the application
 COPY --chown=user:user . .

acestep/gradio_ui/events/__init__.py CHANGED Viewed

@@ -286,6 +286,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
             generation_section["simple_sample_created"],
             generation_section["src_audio_group"],
             generation_section["audio_cover_strength"],
         ]
     )
@@ -680,6 +681,8 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
                 args_list[11] = result.duration  # audio_duration
             # Enable thinking for Simple mode
             args_list[28] = True  # think_checkbox
         # Determine which handler to use
         active_handler = dit_handler  # Default to primary handler

             generation_section["simple_sample_created"],
             generation_section["src_audio_group"],
             generation_section["audio_cover_strength"],
+            generation_section["think_checkbox"],  # Disable thinking for cover/repaint modes
         ]
     )
                 args_list[11] = result.duration  # audio_duration
             # Enable thinking for Simple mode
             args_list[28] = True  # think_checkbox
+            # Mark as formatted caption (LM-generated sample)
+            args_list[36] = True  # is_format_caption_state
         # Determine which handler to use
         active_handler = dit_handler  # Default to primary handler

acestep/gradio_ui/events/generation_handlers.py CHANGED Viewed

@@ -710,6 +710,7 @@ def handle_generation_mode_change(mode: str):
         - simple_sample_created (reset state)
         - src_audio_group (visibility) - shown for cover and repaint
         - audio_cover_strength (visibility) - shown only for cover mode
     """
     is_simple = mode == "simple"
     is_custom = mode == "custom"
@@ -725,6 +726,13 @@ def handle_generation_mode_change(mode: str):
     }
     task_type_value = task_type_map.get(mode, "text2music")
     return (
         gr.update(visible=is_simple),  # simple_mode_group
         gr.update(visible=not is_simple),  # custom_mode_content - visible for custom/cover/repaint
@@ -735,6 +743,7 @@ def handle_generation_mode_change(mode: str):
         False,  # simple_sample_created - reset to False on mode change
         gr.update(visible=is_cover or is_repaint),  # src_audio_group - shown for cover and repaint
         gr.update(visible=is_cover),  # audio_cover_strength - only shown for cover mode
     )

         - simple_sample_created (reset state)
         - src_audio_group (visibility) - shown for cover and repaint
         - audio_cover_strength (visibility) - shown only for cover mode
+        - think_checkbox (value and interactive) - disabled for cover/repaint modes
     """
     is_simple = mode == "simple"
     is_custom = mode == "custom"
     }
     task_type_value = task_type_map.get(mode, "text2music")
+    # think_checkbox: disabled and set to False for cover/repaint modes
+    # (these modes don't use LM thinking, they use source audio codes)
+    if is_cover or is_repaint:
+        think_checkbox_update = gr.update(value=False, interactive=False)
+    else:
+        think_checkbox_update = gr.update(value=True, interactive=True)
     return (
         gr.update(visible=is_simple),  # simple_mode_group
         gr.update(visible=not is_simple),  # custom_mode_content - visible for custom/cover/repaint
         False,  # simple_sample_created - reset to False on mode change
         gr.update(visible=is_cover or is_repaint),  # src_audio_group - shown for cover and repaint
         gr.update(visible=is_cover),  # audio_cover_strength - only shown for cover mode
+        think_checkbox_update,  # think_checkbox - disabled for cover/repaint modes
     )

acestep/gradio_ui/events/results_handlers.py CHANGED Viewed

@@ -277,7 +277,7 @@ def _build_generation_info(
             avg_per_song = generation_total / num_audios if num_audios > 0 else 0
             gen_lines = [
                 f"**🎵 Total generation time {songs_label}: {generation_total:.2f}s**",
-                f"**{avg_per_song:.2f}s per song**",
             ]
             if lm_total > 0:
                 gen_lines.append(f"- LM phase {songs_label}: {lm_total:.2f}s")
@@ -874,6 +874,9 @@ def calculate_score_handler(
     PMI (Pointwise Mutual Information) removes condition bias:
     score = log P(condition|codes) - log P(condition)
     Args:
         llm_handler: LLM handler instance
         audio_codes_str: Generated audio codes string
@@ -895,63 +898,74 @@ def calculate_score_handler(
     """
     from acestep.test_time_scaling import calculate_pmi_score_per_condition
-    if not llm_handler.llm_initialized:
-        return t("messages.lm_not_initialized")
-    if not audio_codes_str or not audio_codes_str.strip():
         return t("messages.no_codes")
     try:
-        # Build metadata dictionary from both LM metadata and user inputs
-        metadata = {}
-        # Priority 1: Use LM-generated metadata if available
-        if lm_metadata and isinstance(lm_metadata, dict):
-            metadata.update(lm_metadata)
-        # Priority 2: Add user-provided metadata (if not already in LM metadata)
-        if bpm is not None and 'bpm' not in metadata:
-            try:
-                metadata['bpm'] = int(bpm)
-            except:
-                pass
-        if caption and 'caption' not in metadata:
-            metadata['caption'] = caption
-        if audio_duration is not None and audio_duration > 0 and 'duration' not in metadata:
-            try:
-                metadata['duration'] = int(audio_duration)
-            except:
-                pass
-        if key_scale and key_scale.strip() and 'keyscale' not in metadata:
-            metadata['keyscale'] = key_scale.strip()
-        if vocal_language and vocal_language.strip() and 'language' not in metadata:
-            metadata['language'] = vocal_language.strip()
-        if time_signature and time_signature.strip() and 'timesignature' not in metadata:
-            metadata['timesignature'] = time_signature.strip()
-        # Calculate per-condition scores with appropriate metrics
-        # - Metadata fields (bpm, duration, etc.): Top-k recall
-        # - Caption and lyrics: PMI (normalized)
-        scores_per_condition, global_score, status = calculate_pmi_score_per_condition(
-            llm_handler=llm_handler,
-            audio_codes=audio_codes_str,
-            caption=caption or "",
-            lyrics=lyrics or "",
-            metadata=metadata if metadata else None,
-            temperature=1.0,
-            topk=10,
-            score_scale=score_scale
-        )
         alignment_report = ""
-        # Only calculate if we have the handler, tensor data, and actual lyrics
-        if dit_handler and extra_tensor_data and lyrics and lyrics.strip():
             try:
                 align_result = dit_handler.get_lyric_score(
                     pred_latent=extra_tensor_data.get('pred_latent'),
@@ -978,29 +992,46 @@ def calculate_score_handler(
             except Exception as e:
                 alignment_report = f"\n⚠️ Alignment Score Error: {str(e)}"
-        # Format display string with per-condition breakdown
-        if global_score == 0.0 and not scores_per_condition:
-            return t("messages.score_failed", error=status)
-        else:
-            # Build per-condition scores display
-            condition_lines = []
-            for condition_name, score_value in sorted(scores_per_condition.items()):
-                condition_lines.append(
-                    f"  • {condition_name}: {score_value:.4f}"
-                )
-            conditions_display = "\n".join(condition_lines) if condition_lines else "  (no conditions)"
-            final_output = (
-                f"✅ Global Quality Score: {global_score:.4f} (0-1, higher=better)\n\n"
-                f"📊 Per-Condition Scores (0-1):\n{conditions_display}\n"
-            )
-            if alignment_report:
-                final_output += alignment_report + "\n"
-            final_output += "Note: Metadata uses Top-k Recall, Caption/Lyrics use PMI"
-            return final_output
     except Exception as e:
         import traceback

             avg_per_song = generation_total / num_audios if num_audios > 0 else 0
             gen_lines = [
                 f"**🎵 Total generation time {songs_label}: {generation_total:.2f}s**",
+                f"\n**{avg_per_song:.2f}s per song**",
             ]
             if lm_total > 0:
                 gen_lines.append(f"- LM phase {songs_label}: {lm_total:.2f}s")
     PMI (Pointwise Mutual Information) removes condition bias:
     score = log P(condition|codes) - log P(condition)
+    For Cover/Repaint modes where audio_codes may not be available,
+    falls back to DiT alignment scoring only.
     Args:
         llm_handler: LLM handler instance
         audio_codes_str: Generated audio codes string
     """
     from acestep.test_time_scaling import calculate_pmi_score_per_condition
+    has_audio_codes = audio_codes_str and audio_codes_str.strip()
+    has_dit_alignment_data = dit_handler and extra_tensor_data and lyrics and lyrics.strip()
+    # Check if we can compute any scores
+    if not has_audio_codes and not has_dit_alignment_data:
+        # No audio codes and no DiT alignment data - can't compute any score
         return t("messages.no_codes")
     try:
+        scores_per_condition = {}
+        global_score = 0.0
         alignment_report = ""
+        # PMI-based scoring (requires audio codes and LLM)
+        if has_audio_codes:
+            if not llm_handler.llm_initialized:
+                # Can still try DiT alignment if available
+                if not has_dit_alignment_data:
+                    return t("messages.lm_not_initialized")
+            else:
+                # Build metadata dictionary from both LM metadata and user inputs
+                metadata = {}
+                # Priority 1: Use LM-generated metadata if available
+                if lm_metadata and isinstance(lm_metadata, dict):
+                    metadata.update(lm_metadata)
+                # Priority 2: Add user-provided metadata (if not already in LM metadata)
+                if bpm is not None and 'bpm' not in metadata:
+                    try:
+                        metadata['bpm'] = int(bpm)
+                    except:
+                        pass
+                if caption and 'caption' not in metadata:
+                    metadata['caption'] = caption
+                if audio_duration is not None and audio_duration > 0 and 'duration' not in metadata:
+                    try:
+                        metadata['duration'] = int(audio_duration)
+                    except:
+                        pass
+                if key_scale and key_scale.strip() and 'keyscale' not in metadata:
+                    metadata['keyscale'] = key_scale.strip()
+                if vocal_language and vocal_language.strip() and 'language' not in metadata:
+                    metadata['language'] = vocal_language.strip()
+                if time_signature and time_signature.strip() and 'timesignature' not in metadata:
+                    metadata['timesignature'] = time_signature.strip()
+                # Calculate per-condition scores with appropriate metrics
+                # - Metadata fields (bpm, duration, etc.): Top-k recall
+                # - Caption and lyrics: PMI (normalized)
+                scores_per_condition, global_score, status = calculate_pmi_score_per_condition(
+                    llm_handler=llm_handler,
+                    audio_codes=audio_codes_str,
+                    caption=caption or "",
+                    lyrics=lyrics or "",
+                    metadata=metadata if metadata else None,
+                    temperature=1.0,
+                    topk=10,
+                    score_scale=score_scale
+                )
+        # DiT alignment scoring (works even without audio codes - for Cover/Repaint modes)
+        if has_dit_alignment_data:
             try:
                 align_result = dit_handler.get_lyric_score(
                     pred_latent=extra_tensor_data.get('pred_latent'),
             except Exception as e:
                 alignment_report = f"\n⚠️ Alignment Score Error: {str(e)}"
+        # Format display string
+        if has_audio_codes and llm_handler.llm_initialized:
+            # Full scoring with PMI + alignment
+            if global_score == 0.0 and not scores_per_condition:
+                # PMI scoring failed but we might have alignment
+                if alignment_report and not alignment_report.startswith("\n⚠️"):
+                    final_output = "📊 DiT Alignment Scores (LM codes not available):\n"
+                    final_output += alignment_report
+                    return final_output
+                return t("messages.score_failed", error="PMI scoring returned no results")
+            else:
+                # Build per-condition scores display
+                condition_lines = []
+                for condition_name, score_value in sorted(scores_per_condition.items()):
+                    condition_lines.append(
+                        f"  • {condition_name}: {score_value:.4f}"
+                    )
+                conditions_display = "\n".join(condition_lines) if condition_lines else "  (no conditions)"
+                final_output = (
+                    f"✅ Global Quality Score: {global_score:.4f} (0-1, higher=better)\n\n"
+                    f"📊 Per-Condition Scores (0-1):\n{conditions_display}\n"
+                )
+                if alignment_report:
+                    final_output += alignment_report + "\n"
+                final_output += "Note: Metadata uses Top-k Recall, Caption/Lyrics use PMI"
+                return final_output
+        else:
+            # Only DiT alignment available (Cover/Repaint mode fallback)
+            if alignment_report and not alignment_report.startswith("\n⚠️"):
+                final_output = "📊 DiT Alignment Scores (LM codes not available for Cover/Repaint mode):\n"
+                final_output += alignment_report
+                return final_output
+            elif alignment_report:
+                return alignment_report
+            else:
+                return "⚠️ No scoring data available"
     except Exception as e:
         import traceback