ChuxiJ committed on
Commit
def62ce
·
1 Parent(s): 7d44db3

fix api server no duration

Browse files
acestep/api_server.py CHANGED
@@ -125,7 +125,7 @@ class GenerateMusicRequest(BaseModel):
125
  is_format_caption: bool = False
126
 
127
  lm_temperature: float = 0.85
128
- lm_cfg_scale: float = 2.0
129
  lm_top_k: Optional[int] = None
130
  lm_top_p: Optional[float] = 0.9
131
  lm_repetition_penalty: float = 1.0
@@ -137,7 +137,7 @@ class GenerateMusicRequest(BaseModel):
137
 
138
 
139
  _LM_DEFAULT_TEMPERATURE = 0.85
140
- _LM_DEFAULT_CFG_SCALE = 2.0
141
  _LM_DEFAULT_TOP_P = 0.9
142
  _DEFAULT_DIT_INSTRUCTION = DEFAULT_DIT_INSTRUCTION
143
  _DEFAULT_LM_INSTRUCTION = DEFAULT_LM_INSTRUCTION
@@ -728,16 +728,33 @@ def create_app() -> FastAPI:
728
  print(f"[api_server] Sample generated: caption_len={len(caption)}, lyrics_len={len(lyrics)}, bpm={bpm}, duration={audio_duration}")
729
 
730
  # Apply format_sample() if use_format is True and caption/lyrics are provided
 
 
 
731
  if req.use_format and (caption or lyrics):
732
  print(f"[api_server] Applying format_sample to enhance input...")
733
  _ensure_llm_ready()
734
  if getattr(app.state, "_llm_init_error", None):
735
  raise RuntimeError(f"5Hz LM init failed (needed for format): {app.state._llm_init_error}")
736
 
 
 
 
 
 
 
 
 
 
 
 
 
 
737
  format_result = format_sample(
738
  llm_handler=llm,
739
  caption=caption,
740
  lyrics=lyrics,
 
741
  temperature=req.lm_temperature,
742
  top_k=lm_top_k if lm_top_k > 0 else None,
743
  top_p=lm_top_p if lm_top_p < 1.0 else None,
@@ -745,9 +762,20 @@ def create_app() -> FastAPI:
745
  )
746
 
747
  if format_result.success:
748
- caption = format_result.caption
749
- lyrics = format_result.lyrics
750
- print(f"[api_server] Format applied: new caption_len={len(caption)}, lyrics_len={len(lyrics)}")
 
 
 
 
 
 
 
 
 
 
 
751
  else:
752
  print(f"[api_server] Warning: format_sample failed: {format_result.error}, using original input")
753
 
@@ -811,7 +839,12 @@ def create_app() -> FastAPI:
811
  lm_top_k=lm_top_k,
812
  lm_top_p=lm_top_p,
813
  lm_negative_prompt=req.lm_negative_prompt,
814
- use_cot_metas=not sample_mode, # Sample mode already generated metas, don't regenerate
 
 
 
 
 
815
  use_cot_caption=req.use_cot_caption,
816
  use_cot_language=req.use_cot_language,
817
  use_constrained_decoding=req.constrained_decoding,
 
125
  is_format_caption: bool = False
126
 
127
  lm_temperature: float = 0.85
128
+ lm_cfg_scale: float = 2.5
129
  lm_top_k: Optional[int] = None
130
  lm_top_p: Optional[float] = 0.9
131
  lm_repetition_penalty: float = 1.0
 
137
 
138
 
139
  _LM_DEFAULT_TEMPERATURE = 0.85
140
+ _LM_DEFAULT_CFG_SCALE = 2.5
141
  _LM_DEFAULT_TOP_P = 0.9
142
  _DEFAULT_DIT_INSTRUCTION = DEFAULT_DIT_INSTRUCTION
143
  _DEFAULT_LM_INSTRUCTION = DEFAULT_LM_INSTRUCTION
 
728
  print(f"[api_server] Sample generated: caption_len={len(caption)}, lyrics_len={len(lyrics)}, bpm={bpm}, duration={audio_duration}")
729
 
730
  # Apply format_sample() if use_format is True and caption/lyrics are provided
731
+ # Track whether format_sample generated duration (to decide if Phase 1 is needed)
732
+ format_has_duration = False
733
+
734
  if req.use_format and (caption or lyrics):
735
  print(f"[api_server] Applying format_sample to enhance input...")
736
  _ensure_llm_ready()
737
  if getattr(app.state, "_llm_init_error", None):
738
  raise RuntimeError(f"5Hz LM init failed (needed for format): {app.state._llm_init_error}")
739
 
740
+ # Build user_metadata from request params (matching bot.py behavior)
741
+ user_metadata_for_format = {}
742
+ if bpm is not None:
743
+ user_metadata_for_format['bpm'] = bpm
744
+ if audio_duration is not None and audio_duration > 0:
745
+ user_metadata_for_format['duration'] = int(audio_duration)
746
+ if key_scale:
747
+ user_metadata_for_format['keyscale'] = key_scale
748
+ if time_signature:
749
+ user_metadata_for_format['timesignature'] = time_signature
750
+ if req.vocal_language and req.vocal_language != "unknown":
751
+ user_metadata_for_format['language'] = req.vocal_language
752
+
753
  format_result = format_sample(
754
  llm_handler=llm,
755
  caption=caption,
756
  lyrics=lyrics,
757
+ user_metadata=user_metadata_for_format if user_metadata_for_format else None,
758
  temperature=req.lm_temperature,
759
  top_k=lm_top_k if lm_top_k > 0 else None,
760
  top_p=lm_top_p if lm_top_p < 1.0 else None,
 
762
  )
763
 
764
  if format_result.success:
765
+ # Extract all formatted data (matching bot.py behavior)
766
+ caption = format_result.caption or caption
767
+ lyrics = format_result.lyrics or lyrics
768
+ if format_result.duration:
769
+ audio_duration = format_result.duration
770
+ format_has_duration = True
771
+ if format_result.bpm:
772
+ bpm = format_result.bpm
773
+ if format_result.keyscale:
774
+ key_scale = format_result.keyscale
775
+ if format_result.timesignature:
776
+ time_signature = format_result.timesignature
777
+
778
+ print(f"[api_server] Format applied: new caption_len={len(caption)}, lyrics_len={len(lyrics)}, bpm={bpm}, duration={audio_duration}, has_duration={format_has_duration}")
779
  else:
780
  print(f"[api_server] Warning: format_sample failed: {format_result.error}, using original input")
781
 
 
839
  lm_top_k=lm_top_k,
840
  lm_top_p=lm_top_p,
841
  lm_negative_prompt=req.lm_negative_prompt,
842
+ # use_cot_metas logic:
843
+ # - sample_mode: metas already generated, skip Phase 1
844
+ # - format with duration: metas already generated, skip Phase 1
845
+ # - format without duration: need Phase 1 to generate duration
846
+ # - no format: need Phase 1 to generate all metas
847
+ use_cot_metas=not sample_mode and not format_has_duration,
848
  use_cot_caption=req.use_cot_caption,
849
  use_cot_language=req.use_cot_language,
850
  use_constrained_decoding=req.constrained_decoding,
acestep/gradio_ui/events/generation_handlers.py CHANGED
@@ -70,7 +70,7 @@ def load_metadata(file_obj):
70
  """Load generation parameters from a JSON file"""
71
  if file_obj is None:
72
  gr.Warning(t("messages.no_file_selected"))
73
- return [None] * 34 + [False] # Return None for all fields, False for is_format_caption
74
 
75
  try:
76
  # Read the uploaded file
@@ -115,7 +115,7 @@ def load_metadata(file_obj):
115
  inference_steps = metadata.get('inference_steps', 8)
116
  guidance_scale = metadata.get('guidance_scale', 7.0)
117
  seed = metadata.get('seed', '-1')
118
- random_seed = metadata.get('random_seed', True)
119
  use_adg = metadata.get('use_adg', False)
120
  cfg_interval_start = metadata.get('cfg_interval_start', 0.0)
121
  cfg_interval_end = metadata.get('cfg_interval_end', 1.0)
@@ -137,6 +137,9 @@ def load_metadata(file_obj):
137
  complete_track_classes = metadata.get('complete_track_classes', [])
138
  shift = metadata.get('shift', 3.0) # Default 3.0 for base models
139
  infer_method = metadata.get('infer_method', 'ode') # Default 'ode' for diffusion inference
 
 
 
140
  instrumental = metadata.get('instrumental', False) # Added: read instrumental
141
 
142
  gr.Info(t("messages.params_loaded", filename=os.path.basename(filepath)))
@@ -144,8 +147,9 @@ def load_metadata(file_obj):
144
  return (
145
  task_type, captions, lyrics, vocal_language, bpm, key_scale, time_signature,
146
  audio_duration, batch_size, inference_steps, guidance_scale, seed, random_seed,
147
- use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method, audio_format,
148
- lm_temperature, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
 
149
  use_cot_metas, use_cot_caption, use_cot_language, audio_cover_strength,
150
  think, audio_codes, repainting_start, repainting_end,
151
  track_name, complete_track_classes, instrumental,
@@ -154,10 +158,10 @@ def load_metadata(file_obj):
154
 
155
  except json.JSONDecodeError as e:
156
  gr.Warning(t("messages.invalid_json", error=str(e)))
157
- return [None] * 35 + [False]
158
  except Exception as e:
159
  gr.Warning(t("messages.load_error", error=str(e)))
160
- return [None] * 35 + [False]
161
 
162
 
163
  def load_random_example(task_type: str):
@@ -429,7 +433,7 @@ def init_service_wrapper(dit_handler, llm_handler, checkpoint, config_path, devi
429
 
430
  # Check if model is initialized - if so, collapse the accordion
431
  is_model_initialized = dit_handler.model is not None
432
- accordion_state = gr.update(open=not is_model_initialized)
433
 
434
  # Get model type settings based on actual loaded model
435
  is_turbo = dit_handler.is_turbo_model()
@@ -446,12 +450,12 @@ def init_service_wrapper(dit_handler, llm_handler, checkpoint, config_path, devi
446
  def get_model_type_ui_settings(is_turbo: bool):
447
  """Get UI settings based on whether the model is turbo or base"""
448
  if is_turbo:
449
- # Turbo model: max 8 steps, hide CFG/ADG/shift, only show text2music/repaint/cover
450
  return (
451
- gr.update(value=8, maximum=8, minimum=1), # inference_steps
452
  gr.update(visible=False), # guidance_scale
453
  gr.update(visible=False), # use_adg
454
- gr.update(value=1.0, visible=False), # shift (not effective for turbo)
455
  gr.update(visible=False), # cfg_interval_start
456
  gr.update(visible=False), # cfg_interval_end
457
  gr.update(choices=TASK_TYPES_TURBO), # task_type
@@ -603,7 +607,7 @@ def reset_format_caption_flag():
603
  def update_audio_uploads_accordion(reference_audio, src_audio):
604
  """Update Audio Uploads accordion open state based on whether audio files are present"""
605
  has_audio = (reference_audio is not None) or (src_audio is not None)
606
- return gr.update(open=has_audio)
607
 
608
 
609
  def handle_instrumental_checkbox(instrumental_checked, current_lyrics):
@@ -708,11 +712,11 @@ def handle_generation_mode_change(mode: str):
708
 
709
  return (
710
  gr.update(visible=is_simple), # simple_mode_group
711
- gr.update(open=not is_simple), # caption_accordion - collapsed in simple, open in custom
712
- gr.update(open=not is_simple), # lyrics_accordion - collapsed in simple, open in custom
713
  gr.update(interactive=not is_simple), # generate_btn - disabled in simple until sample created
714
  False, # simple_sample_created - reset to False on mode change
715
- gr.update(open=not is_simple), # optional_params_accordion - hidden in simple mode
716
  )
717
 
718
 
@@ -836,8 +840,8 @@ def handle_create_sample(
836
  result.language, # simple vocal_language
837
  result.timesignature, # time_signature
838
  result.instrumental, # instrumental_checkbox
839
- gr.update(open=True), # caption_accordion - expand
840
- gr.update(open=True), # lyrics_accordion - expand
841
  gr.update(interactive=True), # generate_btn - enable
842
  True, # simple_sample_created - True
843
  True, # think_checkbox - enable thinking
 
70
  """Load generation parameters from a JSON file"""
71
  if file_obj is None:
72
  gr.Warning(t("messages.no_file_selected"))
73
+ return [None] * 36 + [False] # Return None for all fields, False for is_format_caption
74
 
75
  try:
76
  # Read the uploaded file
 
115
  inference_steps = metadata.get('inference_steps', 8)
116
  guidance_scale = metadata.get('guidance_scale', 7.0)
117
  seed = metadata.get('seed', '-1')
118
+ random_seed = False # Always set to False when loading to enable reproducibility with saved seed
119
  use_adg = metadata.get('use_adg', False)
120
  cfg_interval_start = metadata.get('cfg_interval_start', 0.0)
121
  cfg_interval_end = metadata.get('cfg_interval_end', 1.0)
 
137
  complete_track_classes = metadata.get('complete_track_classes', [])
138
  shift = metadata.get('shift', 3.0) # Default 3.0 for base models
139
  infer_method = metadata.get('infer_method', 'ode') # Default 'ode' for diffusion inference
140
+ custom_timesteps = metadata.get('timesteps', '') # Custom timesteps (stored as 'timesteps' in JSON)
141
+ if custom_timesteps is None:
142
+ custom_timesteps = ''
143
  instrumental = metadata.get('instrumental', False) # Added: read instrumental
144
 
145
  gr.Info(t("messages.params_loaded", filename=os.path.basename(filepath)))
 
147
  return (
148
  task_type, captions, lyrics, vocal_language, bpm, key_scale, time_signature,
149
  audio_duration, batch_size, inference_steps, guidance_scale, seed, random_seed,
150
+ use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method,
151
+ custom_timesteps, # Added: custom_timesteps (between infer_method and audio_format)
152
+ audio_format, lm_temperature, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
153
  use_cot_metas, use_cot_caption, use_cot_language, audio_cover_strength,
154
  think, audio_codes, repainting_start, repainting_end,
155
  track_name, complete_track_classes, instrumental,
 
158
 
159
  except json.JSONDecodeError as e:
160
  gr.Warning(t("messages.invalid_json", error=str(e)))
161
+ return [None] * 36 + [False]
162
  except Exception as e:
163
  gr.Warning(t("messages.load_error", error=str(e)))
164
+ return [None] * 36 + [False]
165
 
166
 
167
  def load_random_example(task_type: str):
 
433
 
434
  # Check if model is initialized - if so, collapse the accordion
435
  is_model_initialized = dit_handler.model is not None
436
+ accordion_state = gr.Accordion(open=not is_model_initialized)
437
 
438
  # Get model type settings based on actual loaded model
439
  is_turbo = dit_handler.is_turbo_model()
 
450
  def get_model_type_ui_settings(is_turbo: bool):
451
  """Get UI settings based on whether the model is turbo or base"""
452
  if is_turbo:
453
+ # Turbo model: max 20 steps, default 8, show shift with default 3.0, only show text2music/repaint/cover
454
  return (
455
+ gr.update(value=8, maximum=20, minimum=1), # inference_steps
456
  gr.update(visible=False), # guidance_scale
457
  gr.update(visible=False), # use_adg
458
+ gr.update(value=3.0, visible=True), # shift (show with default 3.0)
459
  gr.update(visible=False), # cfg_interval_start
460
  gr.update(visible=False), # cfg_interval_end
461
  gr.update(choices=TASK_TYPES_TURBO), # task_type
 
607
  def update_audio_uploads_accordion(reference_audio, src_audio):
608
  """Update Audio Uploads accordion open state based on whether audio files are present"""
609
  has_audio = (reference_audio is not None) or (src_audio is not None)
610
+ return gr.Accordion(open=has_audio)
611
 
612
 
613
  def handle_instrumental_checkbox(instrumental_checked, current_lyrics):
 
712
 
713
  return (
714
  gr.update(visible=is_simple), # simple_mode_group
715
+ gr.Accordion(open=not is_simple), # caption_accordion - collapsed in simple, open in custom
716
+ gr.Accordion(open=not is_simple), # lyrics_accordion - collapsed in simple, open in custom
717
  gr.update(interactive=not is_simple), # generate_btn - disabled in simple until sample created
718
  False, # simple_sample_created - reset to False on mode change
719
+ gr.Accordion(open=not is_simple), # optional_params_accordion - hidden in simple mode
720
  )
721
 
722
 
 
840
  result.language, # simple vocal_language
841
  result.timesignature, # time_signature
842
  result.instrumental, # instrumental_checkbox
843
+ gr.Accordion(open=True), # caption_accordion - expand
844
+ gr.Accordion(open=True), # lyrics_accordion - expand
845
  gr.update(interactive=True), # generate_btn - enable
846
  True, # simple_sample_created - True
847
  True, # think_checkbox - enable thinking
acestep/gradio_ui/interfaces/generation.py CHANGED
@@ -402,13 +402,13 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
402
  )
403
 
404
  # Advanced Settings
405
- # Default UI settings use turbo mode (max 8 steps, hide CFG/ADG/shift)
406
  # These will be updated after model initialization based on handler.is_turbo_model()
407
  with gr.Accordion(t("generation.advanced_settings"), open=False):
408
  with gr.Row():
409
  inference_steps = gr.Slider(
410
  minimum=1,
411
- maximum=8,
412
  value=8,
413
  step=1,
414
  label=t("generation.inference_steps_label"),
@@ -455,7 +455,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
455
  step=0.1,
456
  label=t("generation.shift_label"),
457
  info=t("generation.shift_info"),
458
- visible=False
459
  )
460
  infer_method = gr.Dropdown(
461
  choices=["ode", "sde"],
 
402
  )
403
 
404
  # Advanced Settings
405
+ # Default UI settings use turbo mode (max 20 steps, default 8, show shift with default 3)
406
  # These will be updated after model initialization based on handler.is_turbo_model()
407
  with gr.Accordion(t("generation.advanced_settings"), open=False):
408
  with gr.Row():
409
  inference_steps = gr.Slider(
410
  minimum=1,
411
+ maximum=20,
412
  value=8,
413
  step=1,
414
  label=t("generation.inference_steps_label"),
 
455
  step=0.1,
456
  label=t("generation.shift_label"),
457
  info=t("generation.shift_info"),
458
+ visible=True
459
  )
460
  infer_method = gr.Dropdown(
461
  choices=["ode", "sde"],
acestep/llm_inference.py CHANGED
@@ -375,9 +375,9 @@ class LLMHandler:
375
  max_ratio=0.9
376
  )
377
  if low_gpu_memory_mode:
378
- self.max_model_len = 2048
379
- else:
380
  self.max_model_len = 4096
 
 
381
 
382
  logger.info(f"Initializing 5Hz LM with model: {model_path}, enforce_eager: False, tensor_parallel_size: 1, max_model_len: {self.max_model_len}, gpu_memory_utilization: {gpu_memory_utilization}")
383
  start_time = time.time()
@@ -1796,7 +1796,7 @@ class LLMHandler:
1796
  # If no lyrics generated, keep original input
1797
  metadata['lyrics'] = lyrics
1798
 
1799
- logger.info(f"Format completed successfully. Generated {len(metadata)} fields")
1800
  if constrained_decoding_debug:
1801
  logger.debug(f"Generated metadata: {list(metadata.keys())}")
1802
  logger.debug(f"Output text preview: {output_text[:300]}...")
 
375
  max_ratio=0.9
376
  )
377
  if low_gpu_memory_mode:
 
 
378
  self.max_model_len = 4096
379
+ else:
380
+ self.max_model_len = 8192
381
 
382
  logger.info(f"Initializing 5Hz LM with model: {model_path}, enforce_eager: False, tensor_parallel_size: 1, max_model_len: {self.max_model_len}, gpu_memory_utilization: {gpu_memory_utilization}")
383
  start_time = time.time()
 
1796
  # If no lyrics generated, keep original input
1797
  metadata['lyrics'] = lyrics
1798
 
1799
+ logger.info(f"Format completed successfully. Generated {metadata} fields")
1800
  if constrained_decoding_debug:
1801
  logger.debug(f"Generated metadata: {list(metadata.keys())}")
1802
  logger.debug(f"Output text preview: {output_text[:300]}...")
acestep/third_parts/nano-vllm/nanovllm/config.py CHANGED
@@ -8,7 +8,7 @@ class Config:
8
  model: str
9
  max_num_batched_tokens: int = 16384
10
  max_num_seqs: int = 512
11
- max_model_len: int = 4096
12
  gpu_memory_utilization: float = 0.9
13
  tensor_parallel_size: int = 1
14
  enforce_eager: bool = False
 
8
  model: str
9
  max_num_batched_tokens: int = 16384
10
  max_num_seqs: int = 512
11
+ max_model_len: int = 8192
12
  gpu_memory_utilization: float = 0.9
13
  tensor_parallel_size: int = 1
14
  enforce_eager: bool = False