ACE-Step-1.5

Sleeping

App Files Files Community

Gong Junmin committed on Jan 15

Commit

574b7a9

unverified ·

2 Parent(s): 2d3816e fdb6f71

Merge pull request #7 from ace-step/add_rewrite_lyrics

Browse files

Files changed (22) hide show

.gitignore +2 -1
acestep/api_server.py +4 -1
acestep/gradio_ui/events/__init__.py +35 -0
acestep/gradio_ui/events/generation_handlers.py +147 -43
acestep/gradio_ui/events/results_handlers.py +22 -7
acestep/gradio_ui/i18n/en.json +8 -2
acestep/gradio_ui/i18n/ja.json +8 -2
acestep/gradio_ui/i18n/zh.json +8 -2
acestep/gradio_ui/interfaces/generation.py +44 -18
acestep/handler.py +2 -0
acestep/inference.py +181 -19
acestep/llm_inference.py +217 -25
examples/simple_mode/example_01.json +1 -1
examples/simple_mode/example_02.json +1 -1
examples/simple_mode/example_03.json +1 -1
examples/simple_mode/example_04.json +1 -1
examples/simple_mode/example_05.json +1 -1
examples/simple_mode/example_06.json +1 -1
examples/simple_mode/example_07.json +1 -1
examples/simple_mode/example_08.json +1 -1
examples/simple_mode/example_09.json +1 -1
examples/simple_mode/example_10.json +1 -1

.gitignore CHANGED Viewed

@@ -220,4 +220,5 @@ discord_bot/
 feishu_bot/
 tmp*
 torchinductor_root/
-scripts/

 feishu_bot/
 tmp*
 torchinductor_root/
+scripts/
+checkpoints_legacy/

acestep/api_server.py CHANGED Viewed

@@ -94,6 +94,7 @@ class GenerateMusicRequest(BaseModel):
     use_adg: bool = False
     cfg_interval_start: float = 0.0
     cfg_interval_end: float = 1.0
     audio_format: str = "mp3"
     use_tiled_decode: bool = True
@@ -535,10 +536,10 @@ def create_app() -> FastAPI:
                 if sample_mode:
                     print("[api_server] Sample mode: generating random caption/lyrics via LM")
                     sample_metadata, sample_status = llm.understand_audio_from_codes(
                         audio_codes="NO USER INPUT",
                         temperature=req.lm_temperature,
-                        negative_prompt=req.lm_negative_prompt,
                         top_k=lm_top_k if lm_top_k > 0 else None,
                         top_p=lm_top_p if lm_top_p < 1.0 else None,
                         repetition_penalty=req.lm_repetition_penalty,
@@ -584,6 +585,7 @@ def create_app() -> FastAPI:
                     use_adg=req.use_adg,
                     cfg_interval_start=req.cfg_interval_start,
                     cfg_interval_end=req.cfg_interval_end,
                     repainting_start=req.repainting_start,
                     repainting_end=req.repainting_end if req.repainting_end else -1,
                     audio_cover_strength=req.audio_cover_strength,
@@ -854,6 +856,7 @@ def create_app() -> FastAPI:
                 use_adg=_to_bool(get("use_adg"), False),
                 cfg_interval_start=_to_float(get("cfg_interval_start"), 0.0) or 0.0,
                 cfg_interval_end=_to_float(get("cfg_interval_end"), 1.0) or 1.0,
                 audio_format=str(get("audio_format", "mp3") or "mp3"),
                 use_tiled_decode=_to_bool(_get_any("use_tiled_decode", "useTiledDecode"), True),
                 lm_model_path=str(get("lm_model_path") or "").strip() or None,

     use_adg: bool = False
     cfg_interval_start: float = 0.0
     cfg_interval_end: float = 1.0
+    infer_method: str = "ode"  # "ode" or "sde" - diffusion inference method
     audio_format: str = "mp3"
     use_tiled_decode: bool = True
                 if sample_mode:
                     print("[api_server] Sample mode: generating random caption/lyrics via LM")
+                    # Note: understand_audio_from_codes does not support cfg_scale or negative_prompt
                     sample_metadata, sample_status = llm.understand_audio_from_codes(
                         audio_codes="NO USER INPUT",
                         temperature=req.lm_temperature,
                         top_k=lm_top_k if lm_top_k > 0 else None,
                         top_p=lm_top_p if lm_top_p < 1.0 else None,
                         repetition_penalty=req.lm_repetition_penalty,
                     use_adg=req.use_adg,
                     cfg_interval_start=req.cfg_interval_start,
                     cfg_interval_end=req.cfg_interval_end,
+                    infer_method=req.infer_method,
                     repainting_start=req.repainting_start,
                     repainting_end=req.repainting_end if req.repainting_end else -1,
                     audio_cover_strength=req.audio_cover_strength,
                 use_adg=_to_bool(get("use_adg"), False),
                 cfg_interval_start=_to_float(get("cfg_interval_start"), 0.0) or 0.0,
                 cfg_interval_end=_to_float(get("cfg_interval_end"), 1.0) or 1.0,
+                infer_method=str(_get_any("infer_method", "inferMethod", default="ode") or "ode"),
                 audio_format=str(get("audio_format", "mp3") or "mp3"),
                 use_tiled_decode=_to_bool(_get_any("use_tiled_decode", "useTiledDecode"), True),
                 lm_model_path=str(get("lm_model_path") or "").strip() or None,

acestep/gradio_ui/events/__init__.py CHANGED Viewed

@@ -190,6 +190,37 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
         outputs=[generation_section["lyrics"]]
     )
     # ========== Simple/Custom Mode Toggle ==========
     generation_section["generation_mode"].change(
         fn=gen_h.handle_generation_mode_change,
@@ -245,6 +276,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
             generation_section["audio_duration"],
             generation_section["key_scale"],
             generation_section["vocal_language"],
             generation_section["time_signature"],
             generation_section["instrumental_checkbox"],
             generation_section["caption_accordion"],
@@ -279,6 +311,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
             generation_section["cfg_interval_start"],
             generation_section["cfg_interval_end"],
             generation_section["shift"],
             generation_section["audio_format"],
             generation_section["lm_temperature"],
             generation_section["lm_cfg_scale"],
@@ -476,6 +509,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
             generation_section["cfg_interval_start"],
             generation_section["cfg_interval_end"],
             generation_section["shift"],
             generation_section["audio_format"],
             generation_section["lm_temperature"],
             generation_section["think_checkbox"],
@@ -662,6 +696,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
             generation_section["cfg_interval_start"],
             generation_section["cfg_interval_end"],
             generation_section["shift"],
             generation_section["audio_format"],
             generation_section["lm_temperature"],
             generation_section["think_checkbox"],

         outputs=[generation_section["lyrics"]]
     )
+    # ========== Format Button ==========
+    # Note: cfg_scale and negative_prompt are not supported in format mode
+    generation_section["format_btn"].click(
+        fn=lambda caption, lyrics, bpm, duration, key_scale, time_sig, temp, top_k, top_p, debug: gen_h.handle_format_sample(
+            llm_handler, caption, lyrics, bpm, duration, key_scale, time_sig, temp, top_k, top_p, debug
+        ),
+        inputs=[
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["audio_duration"],
+            generation_section["key_scale"],
+            generation_section["time_signature"],
+            generation_section["lm_temperature"],
+            generation_section["lm_top_k"],
+            generation_section["lm_top_p"],
+            generation_section["constrained_decoding_debug"],
+        ],
+        outputs=[
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["audio_duration"],
+            generation_section["key_scale"],
+            generation_section["vocal_language"],
+            generation_section["time_signature"],
+            results_section["is_format_caption_state"],
+            results_section["status_output"],
+        ]
+    )
     # ========== Simple/Custom Mode Toggle ==========
     generation_section["generation_mode"].change(
         fn=gen_h.handle_generation_mode_change,
             generation_section["audio_duration"],
             generation_section["key_scale"],
             generation_section["vocal_language"],
+            generation_section["simple_vocal_language"],
             generation_section["time_signature"],
             generation_section["instrumental_checkbox"],
             generation_section["caption_accordion"],
             generation_section["cfg_interval_start"],
             generation_section["cfg_interval_end"],
             generation_section["shift"],
+            generation_section["infer_method"],
             generation_section["audio_format"],
             generation_section["lm_temperature"],
             generation_section["lm_cfg_scale"],
             generation_section["cfg_interval_start"],
             generation_section["cfg_interval_end"],
             generation_section["shift"],
+            generation_section["infer_method"],
             generation_section["audio_format"],
             generation_section["lm_temperature"],
             generation_section["think_checkbox"],
             generation_section["cfg_interval_start"],
             generation_section["cfg_interval_end"],
             generation_section["shift"],
+            generation_section["infer_method"],
             generation_section["audio_format"],
             generation_section["lm_temperature"],
             generation_section["think_checkbox"],

acestep/gradio_ui/events/generation_handlers.py CHANGED Viewed

@@ -13,7 +13,7 @@ from acestep.constants import (
     TASK_TYPES_BASE,
 )
 from acestep.gradio_ui.i18n import t
-from acestep.inference import understand_music, create_sample
 def load_metadata(file_obj):
@@ -86,6 +86,7 @@ def load_metadata(file_obj):
         track_name = metadata.get('track_name')
         complete_track_classes = metadata.get('complete_track_classes', [])
         shift = metadata.get('shift', 3.0)  # Default 3.0 for base models
         instrumental = metadata.get('instrumental', False)  # Added: read instrumental
         gr.Info(t("messages.params_loaded", filename=os.path.basename(filepath)))
@@ -93,7 +94,7 @@ def load_metadata(file_obj):
         return (
             task_type, captions, lyrics, vocal_language, bpm, key_scale, time_signature,
             audio_duration, batch_size, inference_steps, guidance_scale, seed, random_seed,
-            use_adg, cfg_interval_start, cfg_interval_end, shift, audio_format,
             lm_temperature, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
             use_cot_metas, use_cot_caption, use_cot_language, audio_cover_strength,
             think, audio_codes, repainting_start, repainting_end,
@@ -103,10 +104,10 @@ def load_metadata(file_obj):
     except json.JSONDecodeError as e:
         gr.Warning(t("messages.invalid_json", error=str(e)))
-        return [None] * 34 + [False]
     except Exception as e:
         gr.Warning(t("messages.load_error", error=str(e)))
-        return [None] * 34 + [False]
 def load_random_example(task_type: str):
@@ -256,7 +257,7 @@ def sample_example_smart(llm_handler, task_type: str, constrained_decoding_debug
 def load_random_simple_description():
     """Load a random description from the simple_mode examples directory.
     Returns:
         Tuple of (description, instrumental, vocal_language) for updating UI components
     """
@@ -265,39 +266,39 @@ def load_random_simple_description():
         current_file = os.path.abspath(__file__)
         # This file is in acestep/gradio_ui/events/, need 4 levels up to reach project root
         project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file))))
         # Construct the examples directory path
         examples_dir = os.path.join(project_root, "examples", "simple_mode")
         # Check if directory exists
         if not os.path.exists(examples_dir):
             gr.Warning(t("messages.simple_examples_not_found"))
             return gr.update(), gr.update(), gr.update()
         # Find all JSON files in the directory
         json_files = glob.glob(os.path.join(examples_dir, "*.json"))
         if not json_files:
             gr.Warning(t("messages.simple_examples_empty"))
             return gr.update(), gr.update(), gr.update()
         # Randomly select one file
         selected_file = random.choice(json_files)
         # Read and parse JSON
         try:
             with open(selected_file, 'r', encoding='utf-8') as f:
                 data = json.load(f)
             # Extract fields
             description = data.get('description', '')
             instrumental = data.get('instrumental', False)
-            vocal_language = data.get('vocal_language', ['unknown'])
-            # Ensure vocal_language is a list
-            if isinstance(vocal_language, str):
-                vocal_language = [vocal_language]
             gr.Info(t("messages.simple_example_loaded", filename=os.path.basename(selected_file)))
             return description, instrumental, vocal_language
@@ -564,7 +565,7 @@ def handle_instrumental_checkbox(instrumental_checked, current_lyrics):
 def handle_simple_instrumental_change(is_instrumental: bool):
     """
     Handle simple mode instrumental checkbox changes.
-    When checked: set vocal_language to ["unknown"] and disable editing.
     When unchecked: enable vocal_language editing.
     Args:
@@ -574,7 +575,7 @@ def handle_simple_instrumental_change(is_instrumental: bool):
         gr.update for simple_vocal_language dropdown
     """
     if is_instrumental:
-        return gr.update(value=["unknown"], interactive=False)
     else:
         return gr.update(interactive=True)
@@ -653,7 +654,7 @@ def handle_create_sample(
     llm_handler,
     query: str,
     instrumental: bool,
-    vocal_language: list,
     lm_temperature: float,
     lm_top_k: int,
     lm_top_p: float,
@@ -671,7 +672,7 @@ def handle_create_sample(
         llm_handler: LLM handler instance
         query: User's natural language music description
         instrumental: Whether to generate instrumental music
-        vocal_language: List of preferred vocal languages for constrained decoding
         lm_temperature: LLM temperature for generation
         lm_top_k: LLM top-k sampling
         lm_top_p: LLM top-p sampling
@@ -695,27 +696,6 @@ def handle_create_sample(
         - is_format_caption_state (True)
         - status_output
     """
-    # Validate query
-    if not query or not query.strip():
-        gr.Warning(t("messages.empty_query"))
-        return (
-            gr.update(),  # captions - no change
-            gr.update(),  # lyrics - no change
-            gr.update(),  # bpm - no change
-            gr.update(),  # audio_duration - no change
-            gr.update(),  # key_scale - no change
-            gr.update(),  # vocal_language - no change
-            gr.update(),  # time_signature - no change
-            gr.update(),  # instrumental_checkbox - no change
-            gr.update(),  # caption_accordion - no change
-            gr.update(),  # lyrics_accordion - no change
-            gr.update(interactive=False),  # generate_btn - keep disabled
-            False,  # simple_sample_created - still False
-            gr.update(),  # think_checkbox - no change
-            gr.update(),  # is_format_caption_state - no change
-            t("messages.empty_query"),  # status_output
-        )
     # Check if LLM is initialized
     if not llm_handler.llm_initialized:
         gr.Warning(t("messages.lm_not_initialized"))
@@ -765,6 +745,7 @@ def handle_create_sample(
             gr.update(),  # audio_duration - no change
             gr.update(),  # key_scale - no change
             gr.update(),  # vocal_language - no change
             gr.update(),  # time_signature - no change
             gr.update(),  # instrumental_checkbox - no change
             gr.update(),  # caption_accordion - no change
@@ -786,6 +767,7 @@ def handle_create_sample(
         result.duration if result.duration and result.duration > 0 else -1,  # audio_duration
         result.keyscale,  # key_scale
         result.language,  # vocal_language
         result.timesignature,  # time_signature
         result.instrumental,  # instrumental_checkbox
         gr.update(open=True),  # caption_accordion - expand
@@ -798,3 +780,125 @@ def handle_create_sample(
     )

     TASK_TYPES_BASE,
 )
 from acestep.gradio_ui.i18n import t
+from acestep.inference import understand_music, create_sample, format_sample
 def load_metadata(file_obj):
         track_name = metadata.get('track_name')
         complete_track_classes = metadata.get('complete_track_classes', [])
         shift = metadata.get('shift', 3.0)  # Default 3.0 for base models
+        infer_method = metadata.get('infer_method', 'ode')  # Default 'ode' for diffusion inference
         instrumental = metadata.get('instrumental', False)  # Added: read instrumental
         gr.Info(t("messages.params_loaded", filename=os.path.basename(filepath)))
         return (
             task_type, captions, lyrics, vocal_language, bpm, key_scale, time_signature,
             audio_duration, batch_size, inference_steps, guidance_scale, seed, random_seed,
+            use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method, audio_format,
             lm_temperature, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
             use_cot_metas, use_cot_caption, use_cot_language, audio_cover_strength,
             think, audio_codes, repainting_start, repainting_end,
     except json.JSONDecodeError as e:
         gr.Warning(t("messages.invalid_json", error=str(e)))
+        return [None] * 35 + [False]
     except Exception as e:
         gr.Warning(t("messages.load_error", error=str(e)))
+        return [None] * 35 + [False]
 def load_random_example(task_type: str):
 def load_random_simple_description():
     """Load a random description from the simple_mode examples directory.
     Returns:
         Tuple of (description, instrumental, vocal_language) for updating UI components
     """
         current_file = os.path.abspath(__file__)
         # This file is in acestep/gradio_ui/events/, need 4 levels up to reach project root
         project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file))))
         # Construct the examples directory path
         examples_dir = os.path.join(project_root, "examples", "simple_mode")
         # Check if directory exists
         if not os.path.exists(examples_dir):
             gr.Warning(t("messages.simple_examples_not_found"))
             return gr.update(), gr.update(), gr.update()
         # Find all JSON files in the directory
         json_files = glob.glob(os.path.join(examples_dir, "*.json"))
         if not json_files:
             gr.Warning(t("messages.simple_examples_empty"))
             return gr.update(), gr.update(), gr.update()
         # Randomly select one file
         selected_file = random.choice(json_files)
         # Read and parse JSON
         try:
             with open(selected_file, 'r', encoding='utf-8') as f:
                 data = json.load(f)
             # Extract fields
             description = data.get('description', '')
             instrumental = data.get('instrumental', False)
+            vocal_language = data.get('vocal_language', 'unknown')
+            # Ensure vocal_language is a string
+            if isinstance(vocal_language, list):
+                vocal_language = vocal_language[0] if vocal_language else 'unknown'
             gr.Info(t("messages.simple_example_loaded", filename=os.path.basename(selected_file)))
             return description, instrumental, vocal_language
 def handle_simple_instrumental_change(is_instrumental: bool):
     """
     Handle simple mode instrumental checkbox changes.
+    When checked: set vocal_language to "unknown" and disable editing.
     When unchecked: enable vocal_language editing.
     Args:
         gr.update for simple_vocal_language dropdown
     """
     if is_instrumental:
+        return gr.update(value="unknown", interactive=False)
     else:
         return gr.update(interactive=True)
     llm_handler,
     query: str,
     instrumental: bool,
+    vocal_language: str,
     lm_temperature: float,
     lm_top_k: int,
     lm_top_p: float,
         llm_handler: LLM handler instance
         query: User's natural language music description
         instrumental: Whether to generate instrumental music
+        vocal_language: Preferred vocal language for constrained decoding
         lm_temperature: LLM temperature for generation
         lm_top_k: LLM top-k sampling
         lm_top_p: LLM top-p sampling
         - is_format_caption_state (True)
         - status_output
     """
     # Check if LLM is initialized
     if not llm_handler.llm_initialized:
         gr.Warning(t("messages.lm_not_initialized"))
             gr.update(),  # audio_duration - no change
             gr.update(),  # key_scale - no change
             gr.update(),  # vocal_language - no change
+            gr.update(),  # simple vocal_language - no change
             gr.update(),  # time_signature - no change
             gr.update(),  # instrumental_checkbox - no change
             gr.update(),  # caption_accordion - no change
         result.duration if result.duration and result.duration > 0 else -1,  # audio_duration
         result.keyscale,  # key_scale
         result.language,  # vocal_language
+        result.language,  # simple vocal_language
         result.timesignature,  # time_signature
         result.instrumental,  # instrumental_checkbox
         gr.update(open=True),  # caption_accordion - expand
     )
+def handle_format_sample(
+    llm_handler,
+    caption: str,
+    lyrics: str,
+    bpm,
+    audio_duration,
+    key_scale: str,
+    time_signature: str,
+    lm_temperature: float,
+    lm_top_k: int,
+    lm_top_p: float,
+    constrained_decoding_debug: bool = False,
+):
+    """
+    Handle the Format button click to format caption and lyrics.
+    Takes user-provided caption and lyrics, and uses the LLM to generate
+    structured music metadata and an enhanced description.
+    Note: cfg_scale and negative_prompt are not supported in format mode.
+    Args:
+        llm_handler: LLM handler instance
+        caption: User's caption/description
+        lyrics: User's lyrics
+        bpm: User-provided BPM (optional, for constrained decoding)
+        audio_duration: User-provided duration (optional, for constrained decoding)
+        key_scale: User-provided key scale (optional, for constrained decoding)
+        time_signature: User-provided time signature (optional, for constrained decoding)
+        lm_temperature: LLM temperature for generation
+        lm_top_k: LLM top-k sampling
+        lm_top_p: LLM top-p sampling
+        constrained_decoding_debug: Whether to enable debug logging
+    Returns:
+        Tuple of updates for:
+        - captions
+        - lyrics
+        - bpm
+        - audio_duration
+        - key_scale
+        - vocal_language
+        - time_signature
+        - is_format_caption_state
+        - status_output
+    """
+    # Check if LLM is initialized
+    if not llm_handler.llm_initialized:
+        gr.Warning(t("messages.lm_not_initialized"))
+        return (
+            gr.update(),  # captions - no change
+            gr.update(),  # lyrics - no change
+            gr.update(),  # bpm - no change
+            gr.update(),  # audio_duration - no change
+            gr.update(),  # key_scale - no change
+            gr.update(),  # vocal_language - no change
+            gr.update(),  # time_signature - no change
+            gr.update(),  # is_format_caption_state - no change
+            t("messages.lm_not_initialized"),  # status_output
+        )
+    # Build user_metadata from provided values for constrained decoding
+    user_metadata = {}
+    if bpm is not None and bpm > 0:
+        user_metadata['bpm'] = int(bpm)
+    if audio_duration is not None and audio_duration > 0:
+        user_metadata['duration'] = int(audio_duration)
+    if key_scale and key_scale.strip():
+        user_metadata['keyscale'] = key_scale.strip()
+    if time_signature and time_signature.strip():
+        user_metadata['timesignature'] = time_signature.strip()
+    # Only pass user_metadata if we have at least one field
+    user_metadata_to_pass = user_metadata if user_metadata else None
+    # Convert LM parameters
+    top_k_value = None if not lm_top_k or lm_top_k == 0 else int(lm_top_k)
+    top_p_value = None if not lm_top_p or lm_top_p >= 1.0 else lm_top_p
+    # Call format_sample API
+    result = format_sample(
+        llm_handler=llm_handler,
+        caption=caption,
+        lyrics=lyrics,
+        user_metadata=user_metadata_to_pass,
+        temperature=lm_temperature,
+        top_k=top_k_value,
+        top_p=top_p_value,
+        use_constrained_decoding=True,
+        constrained_decoding_debug=constrained_decoding_debug,
+    )
+    # Handle error
+    if not result.success:
+        gr.Warning(result.status_message or t("messages.format_failed"))
+        return (
+            gr.update(),  # captions - no change
+            gr.update(),  # lyrics - no change
+            gr.update(),  # bpm - no change
+            gr.update(),  # audio_duration - no change
+            gr.update(),  # key_scale - no change
+            gr.update(),  # vocal_language - no change
+            gr.update(),  # time_signature - no change
+            gr.update(),  # is_format_caption_state - no change
+            result.status_message or t("messages.format_failed"),  # status_output
+        )
+    # Success - populate fields
+    gr.Info(t("messages.format_success"))
+    return (
+        result.caption,  # captions
+        result.lyrics,  # lyrics
+        result.bpm,  # bpm
+        result.duration if result.duration and result.duration > 0 else -1,  # audio_duration
+        result.keyscale,  # key_scale
+        result.language,  # vocal_language
+        result.timesignature,  # time_signature
+        True,  # is_format_caption_state - True (LM-formatted)
+        result.status_message,  # status_output
+    )

acestep/gradio_ui/events/results_handlers.py CHANGED Viewed

@@ -452,7 +452,7 @@ def generate_with_progress(
     reference_audio, audio_duration, batch_size_input, src_audio,
     text2music_audio_code_string, repainting_start, repainting_end,
     instruction_display_gen, audio_cover_strength, task_type,
-    use_adg, cfg_interval_start, cfg_interval_end, shift, audio_format, lm_temperature,
     think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
     use_cot_metas, use_cot_caption, use_cot_language, is_format_caption,
     constrained_decoding_debug,
@@ -465,6 +465,14 @@ def generate_with_progress(
 ):
     """Generate audio with progress tracking"""
     # step 1: prepare inputs
     # generate_music, GenerationParams, GenerationConfig
     gen_params = GenerationParams(
@@ -487,6 +495,7 @@ def generate_with_progress(
         cfg_interval_start=cfg_interval_start,
         cfg_interval_end=cfg_interval_end,
         shift=shift,
         repainting_start=repainting_start,
         repainting_end=repainting_end,
         audio_cover_strength=audio_cover_strength,
@@ -496,7 +505,7 @@ def generate_with_progress(
         lm_top_k=lm_top_k,
         lm_top_p=lm_top_p,
         lm_negative_prompt=lm_negative_prompt,
-        use_cot_metas=use_cot_metas,
         use_cot_caption=use_cot_caption,
         use_cot_language=use_cot_language,
         use_constrained_decoding=True,
@@ -587,7 +596,7 @@ def generate_with_progress(
     # Clear lrc_display with empty string - this triggers .change() to clear subtitles
     clear_lrcs = [gr.update(value="", visible=True) for _ in range(8)]
     clear_accordions = [gr.skip() for _ in range(8)]  # Don't change accordion visibility
-    dump_audio = [None for _ in range(8)]
     yield (
         # Audio outputs - just skip, value will be updated in loop
         # Subtitles will be cleared via lrc_display.change()
@@ -1302,7 +1311,7 @@ def capture_current_params(
     reference_audio, audio_duration, batch_size_input, src_audio,
     text2music_audio_code_string, repainting_start, repainting_end,
     instruction_display_gen, audio_cover_strength, task_type,
-    use_adg, cfg_interval_start, cfg_interval_end, shift, audio_format, lm_temperature,
     think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
     use_cot_metas, use_cot_caption, use_cot_language,
     constrained_decoding_debug, allow_lm_batch, auto_score, auto_lrc, score_scale, lm_batch_chunk_size,
@@ -1339,6 +1348,7 @@ def capture_current_params(
         "cfg_interval_start": cfg_interval_start,
         "cfg_interval_end": cfg_interval_end,
         "shift": shift,
         "audio_format": audio_format,
         "lm_temperature": lm_temperature,
         "think_checkbox": think_checkbox,
@@ -1367,7 +1377,7 @@ def generate_with_batch_management(
     reference_audio, audio_duration, batch_size_input, src_audio,
     text2music_audio_code_string, repainting_start, repainting_end,
     instruction_display_gen, audio_cover_strength, task_type,
-    use_adg, cfg_interval_start, cfg_interval_end, shift, audio_format, lm_temperature,
     think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
     use_cot_metas, use_cot_caption, use_cot_language, is_format_caption,
     constrained_decoding_debug,
@@ -1396,7 +1406,7 @@ def generate_with_batch_management(
         reference_audio, audio_duration, batch_size_input, src_audio,
         text2music_audio_code_string, repainting_start, repainting_end,
         instruction_display_gen, audio_cover_strength, task_type,
-        use_adg, cfg_interval_start, cfg_interval_end, shift, audio_format, lm_temperature,
         think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
         use_cot_metas, use_cot_caption, use_cot_language, is_format_caption,
         constrained_decoding_debug,
@@ -1476,6 +1486,7 @@ def generate_with_batch_management(
         "cfg_interval_start": cfg_interval_start,
         "cfg_interval_end": cfg_interval_end,
         "shift": shift,
         "audio_format": audio_format,
         "lm_temperature": lm_temperature,
         "think_checkbox": think_checkbox,
@@ -1661,6 +1672,7 @@ def generate_next_batch_background(
         params.setdefault("cfg_interval_start", 0.0)
         params.setdefault("cfg_interval_end", 1.0)
         params.setdefault("shift", 1.0)
         params.setdefault("audio_format", "mp3")
         params.setdefault("lm_temperature", 0.85)
         params.setdefault("think_checkbox", True)
@@ -1682,6 +1694,8 @@ def generate_next_batch_background(
         # Call generate_with_progress with the saved parameters
         # Note: generate_with_progress is a generator, need to iterate through it
         generator = generate_with_progress(
             dit_handler,
             llm_handler,
@@ -1709,6 +1723,7 @@ def generate_next_batch_background(
             cfg_interval_start=params.get("cfg_interval_start"),
             cfg_interval_end=params.get("cfg_interval_end"),
             shift=params.get("shift"),
             audio_format=params.get("audio_format"),
             lm_temperature=params.get("lm_temperature"),
             think_checkbox=params.get("think_checkbox"),
@@ -1719,7 +1734,7 @@ def generate_next_batch_background(
             use_cot_metas=params.get("use_cot_metas"),
             use_cot_caption=params.get("use_cot_caption"),
             use_cot_language=params.get("use_cot_language"),
-            is_format_caption=is_format_caption,
             constrained_decoding_debug=params.get("constrained_decoding_debug"),
             allow_lm_batch=params.get("allow_lm_batch"),
             auto_score=params.get("auto_score"),

     reference_audio, audio_duration, batch_size_input, src_audio,
     text2music_audio_code_string, repainting_start, repainting_end,
     instruction_display_gen, audio_cover_strength, task_type,
+    use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method, audio_format, lm_temperature,
     think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
     use_cot_metas, use_cot_caption, use_cot_language, is_format_caption,
     constrained_decoding_debug,
 ):
     """Generate audio with progress tracking"""
+    # Skip Phase 1 metas COT if sample is already formatted (from LLM/file/random)
+    # This avoids redundant LLM calls since metas (bpm, keyscale, etc.) are already generated
+    actual_use_cot_metas = use_cot_metas
+    if is_format_caption and use_cot_metas:
+        actual_use_cot_metas = False
+        logger.info("[generate_with_progress] Skipping Phase 1 metas COT: sample is already formatted (is_format_caption=True)")
+        gr.Info(t("messages.skipping_metas_cot"))
     # step 1: prepare inputs
     # generate_music, GenerationParams, GenerationConfig
     gen_params = GenerationParams(
         cfg_interval_start=cfg_interval_start,
         cfg_interval_end=cfg_interval_end,
         shift=shift,
+        infer_method=infer_method,
         repainting_start=repainting_start,
         repainting_end=repainting_end,
         audio_cover_strength=audio_cover_strength,
         lm_top_k=lm_top_k,
         lm_top_p=lm_top_p,
         lm_negative_prompt=lm_negative_prompt,
+        use_cot_metas=actual_use_cot_metas,
         use_cot_caption=use_cot_caption,
         use_cot_language=use_cot_language,
         use_constrained_decoding=True,
     # Clear lrc_display with empty string - this triggers .change() to clear subtitles
     clear_lrcs = [gr.update(value="", visible=True) for _ in range(8)]
     clear_accordions = [gr.skip() for _ in range(8)]  # Don't change accordion visibility
+    dump_audio = [gr.update(value=None, subtitles=None) for _ in range(8)]
     yield (
         # Audio outputs - just skip, value will be updated in loop
         # Subtitles will be cleared via lrc_display.change()
     reference_audio, audio_duration, batch_size_input, src_audio,
     text2music_audio_code_string, repainting_start, repainting_end,
     instruction_display_gen, audio_cover_strength, task_type,
+    use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method, audio_format, lm_temperature,
     think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
     use_cot_metas, use_cot_caption, use_cot_language,
     constrained_decoding_debug, allow_lm_batch, auto_score, auto_lrc, score_scale, lm_batch_chunk_size,
         "cfg_interval_start": cfg_interval_start,
         "cfg_interval_end": cfg_interval_end,
         "shift": shift,
+        "infer_method": infer_method,
         "audio_format": audio_format,
         "lm_temperature": lm_temperature,
         "think_checkbox": think_checkbox,
     reference_audio, audio_duration, batch_size_input, src_audio,
     text2music_audio_code_string, repainting_start, repainting_end,
     instruction_display_gen, audio_cover_strength, task_type,
+    use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method, audio_format, lm_temperature,
     think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
     use_cot_metas, use_cot_caption, use_cot_language, is_format_caption,
     constrained_decoding_debug,
         reference_audio, audio_duration, batch_size_input, src_audio,
         text2music_audio_code_string, repainting_start, repainting_end,
         instruction_display_gen, audio_cover_strength, task_type,
+        use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method, audio_format, lm_temperature,
         think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
         use_cot_metas, use_cot_caption, use_cot_language, is_format_caption,
         constrained_decoding_debug,
         "cfg_interval_start": cfg_interval_start,
         "cfg_interval_end": cfg_interval_end,
         "shift": shift,
+        "infer_method": infer_method,
         "audio_format": audio_format,
         "lm_temperature": lm_temperature,
         "think_checkbox": think_checkbox,
         params.setdefault("cfg_interval_start", 0.0)
         params.setdefault("cfg_interval_end", 1.0)
         params.setdefault("shift", 1.0)
+        params.setdefault("infer_method", "ode")
         params.setdefault("audio_format", "mp3")
         params.setdefault("lm_temperature", 0.85)
         params.setdefault("think_checkbox", True)
         # Call generate_with_progress with the saved parameters
         # Note: generate_with_progress is a generator, need to iterate through it
+        # For AutoGen background batches we want NEW audio codes with new seeds rather than
+        # regenerated metas; metas COT is skipped when is_format_caption is True (passed through below)
         generator = generate_with_progress(
             dit_handler,
             llm_handler,
             cfg_interval_start=params.get("cfg_interval_start"),
             cfg_interval_end=params.get("cfg_interval_end"),
             shift=params.get("shift"),
+            infer_method=params.get("infer_method"),
             audio_format=params.get("audio_format"),
             lm_temperature=params.get("lm_temperature"),
             think_checkbox=params.get("think_checkbox"),
             use_cot_metas=params.get("use_cot_metas"),
             use_cot_caption=params.get("use_cot_caption"),
             use_cot_language=params.get("use_cot_language"),
+            is_format_caption=is_format_caption,  # Pass through - will skip metas COT if True
             constrained_decoding_debug=params.get("constrained_decoding_debug"),
             allow_lm_batch=params.get("allow_lm_batch"),
             auto_score=params.get("auto_score"),

acestep/gradio_ui/i18n/en.json CHANGED Viewed

@@ -84,7 +84,7 @@
     "mode_simple": "Simple",
     "mode_custom": "Custom",
     "simple_query_label": "Song Description",
-    "simple_query_placeholder": "Describe the music you want to create, e.g., 'a soft Bengali love song for a quiet evening'",
     "simple_query_info": "Enter a natural language description of the music you want to generate",
     "simple_vocal_language_label": "Vocal Language (optional)",
     "simple_vocal_language_info": "Select preferred language(s) for lyrics. Use 'unknown' for any language.",
@@ -98,6 +98,7 @@
     "lyrics_placeholder": "[Verse 1]\\nUnder the starry night\\nI feel so alive...",
     "lyrics_info": "Song lyrics with structure",
     "instrumental_label": "Instrumental",
     "optional_params": "⚙️ Optional Parameters",
     "vocal_language_label": "Vocal Language (optional)",
     "vocal_language_info": "use `unknown` for inst",
@@ -127,6 +128,8 @@
     "use_adg_info": "Enable Angle Domain Guidance",
     "shift_label": "Shift",
     "shift_info": "Timestep shift factor for base models (range 1.0~5.0, default 3.0). Not effective for turbo models.",
     "cfg_interval_start": "CFG Interval Start",
     "cfg_interval_end": "CFG Interval End",
     "lm_params_title": "🤖 LM Generation Parameters",
@@ -227,6 +230,9 @@
     "sample_created": "✅ Sample created! Review the caption and lyrics, then click Generate Music.",
     "simple_examples_not_found": "⚠️ Simple mode examples directory not found.",
     "simple_examples_empty": "⚠️ No example files found in simple mode examples.",
-    "simple_example_loaded": "🎲 Loaded random example from {filename}"
   }
 }

     "mode_simple": "Simple",
     "mode_custom": "Custom",
     "simple_query_label": "Song Description",
+    "simple_query_placeholder": "Describe the music you want to create, e.g., 'a soft Bengali love song for a quiet evening'. Leave empty for a random sample.",
     "simple_query_info": "Enter a natural language description of the music you want to generate",
     "simple_vocal_language_label": "Vocal Language (optional)",
     "simple_vocal_language_info": "Select preferred language(s) for lyrics. Use 'unknown' for any language.",
     "lyrics_placeholder": "[Verse 1]\\nUnder the starry night\\nI feel so alive...",
     "lyrics_info": "Song lyrics with structure",
     "instrumental_label": "Instrumental",
+    "format_btn": "Format",
     "optional_params": "⚙️ Optional Parameters",
     "vocal_language_label": "Vocal Language (optional)",
     "vocal_language_info": "use `unknown` for inst",
     "use_adg_info": "Enable Angle Domain Guidance",
     "shift_label": "Shift",
     "shift_info": "Timestep shift factor for base models (range 1.0~5.0, default 3.0). Not effective for turbo models.",
+    "infer_method_label": "Inference Method",
+    "infer_method_info": "Diffusion inference method. ODE (Euler) is faster, SDE (stochastic) may produce different results.",
     "cfg_interval_start": "CFG Interval Start",
     "cfg_interval_end": "CFG Interval End",
     "lm_params_title": "🤖 LM Generation Parameters",
     "sample_created": "✅ Sample created! Review the caption and lyrics, then click Generate Music.",
     "simple_examples_not_found": "⚠️ Simple mode examples directory not found.",
     "simple_examples_empty": "⚠️ No example files found in simple mode examples.",
+    "simple_example_loaded": "🎲 Loaded random example from {filename}",
+    "format_success": "✅ Caption and lyrics formatted successfully",
+    "format_failed": "❌ Format failed: {error}",
+    "skipping_metas_cot": "⚡ Skipping Phase 1 metas COT (sample already formatted)"
   }
 }

acestep/gradio_ui/i18n/ja.json CHANGED Viewed

@@ -84,7 +84,7 @@
     "mode_simple": "シンプル",
     "mode_custom": "カスタム",
     "simple_query_label": "曲の説明",
-    "simple_query_placeholder": "作成したい音楽を説明してください。例：'静かな夜のための優しいベンガルのラブソング'",
     "simple_query_info": "生成したい音楽の自然言語の説明を入力",
     "simple_vocal_language_label": "ボーカル言語(オプション)",
     "simple_vocal_language_info": "歌詞の希望言語を選択。任意の言語の場合は'unknown'を使用。",
@@ -98,6 +98,7 @@
     "lyrics_placeholder": "[バース1]\\n星空の下で\\nとても生きていると感じる...",
     "lyrics_info": "構造を持つ曲の歌詞",
     "instrumental_label": "インストゥルメンタル",
     "optional_params": "⚙️ オプションパラメータ",
     "vocal_language_label": "ボーカル言語(オプション)",
     "vocal_language_info": "インストには`unknown`を使用",
@@ -127,6 +128,8 @@
     "use_adg_info": "角度ドメインガイダンスを有効化",
     "shift_label": "シフト",
     "shift_info": "baseモデル用タイムステップシフト係数 (範囲 1.0~5.0、デフォルト 3.0)。turboモデルには無効。",
     "cfg_interval_start": "CFG 間隔開始",
     "cfg_interval_end": "CFG 間隔終了",
     "lm_params_title": "🤖 LM 生成パラメータ",
@@ -227,6 +230,9 @@
     "sample_created": "✅ サンプルが作成されました！キャプションと歌詞を確認して、音楽を生成をクリックしてください。",
     "simple_examples_not_found": "⚠️ シンプルモードサンプルディレクトリが見つかりません。",
     "simple_examples_empty": "⚠️ シンプルモードサンプルにファイルがありません。",
-    "simple_example_loaded": "🎲 {filename} からランダムサンプルを読み込みました"
   }
 }

     "mode_simple": "シンプル",
     "mode_custom": "カスタム",
     "simple_query_label": "曲の説明",
+    "simple_query_placeholder": "作成したい音楽を説明してください。例：'静かな夜のための優しいベンガルのラブソング'。空欄の場合はランダムなサンプルが生成されます。",
     "simple_query_info": "生成したい音楽の自然言語の説明を入力",
     "simple_vocal_language_label": "ボーカル言語(オプション)",
     "simple_vocal_language_info": "歌詞の希望言語を選択。任意の言語の場合は'unknown'を使用。",
     "lyrics_placeholder": "[バース1]\\n星空の下で\\nとても生きていると感じる...",
     "lyrics_info": "構造を持つ曲の歌詞",
     "instrumental_label": "インストゥルメンタル",
+    "format_btn": "フォーマット",
     "optional_params": "⚙️ オプションパラメータ",
     "vocal_language_label": "ボーカル言語(オプション)",
     "vocal_language_info": "インストには`unknown`を使用",
     "use_adg_info": "角度ドメインガイダンスを有効化",
     "shift_label": "シフト",
     "shift_info": "baseモデル用タイムステップシフト係数 (範囲 1.0~5.0、デフォルト 3.0)。turboモデルには無効。",
+    "infer_method_label": "推論方法",
+    "infer_method_info": "拡散推論方法。ODE (オイラー) は高速、SDE (確率的) は異なる結果を生成する可能性があります。",
     "cfg_interval_start": "CFG 間隔開始",
     "cfg_interval_end": "CFG 間隔終了",
     "lm_params_title": "🤖 LM 生成パラメータ",
     "sample_created": "✅ サンプルが作成されました！キャプションと歌詞を確認して、音楽を生成をクリックしてください。",
     "simple_examples_not_found": "⚠️ シンプルモードサンプルディレクトリが見つかりません。",
     "simple_examples_empty": "⚠️ シンプルモードサンプルにファイルがありません。",
+    "simple_example_loaded": "🎲 {filename} からランダムサンプルを読み込みました",
+    "format_success": "✅ キャプションと歌詞のフォーマットに成功しました",
+    "format_failed": "❌ フォーマットに失敗しました: {error}",
+    "skipping_metas_cot": "⚡ Phase 1 メタデータ COT をスキップ（サンプルは既にフォーマット済み）"
   }
 }

acestep/gradio_ui/i18n/zh.json CHANGED Viewed

@@ -84,7 +84,7 @@
     "mode_simple": "简单",
     "mode_custom": "自定义",
     "simple_query_label": "歌曲描述",
-    "simple_query_placeholder": "描述你想创作的音乐，例如：'给我生成一首暗黑的戏剧古风，歌词要华丽'",
     "simple_query_info": "输入你想生成的音乐的自然语言描述",
     "simple_vocal_language_label": "人声语言(可选)",
     "simple_vocal_language_info": "选择歌词的首选语言。使用 'unknown' 表示任意语言。",
@@ -98,6 +98,7 @@
     "lyrics_placeholder": "[第一段]\\n在星空下\\n我感到如此活跃...",
     "lyrics_info": "带有结构的歌曲歌词",
     "instrumental_label": "纯音乐",
     "optional_params": "⚙️ 可选参数",
     "vocal_language_label": "人声语言(可选)",
     "vocal_language_info": "纯音乐使用 `unknown`",
@@ -127,6 +128,8 @@
     "use_adg_info": "启用角域引导",
     "shift_label": "Shift",
     "shift_info": "时间步偏移因子，仅对 base 模型生效 (范围 1.0~5.0，默认 3.0)。对 turbo 模型无效。",
     "cfg_interval_start": "CFG 间隔开始",
     "cfg_interval_end": "CFG 间隔结束",
     "lm_params_title": "🤖 LM 生成参数",
@@ -227,6 +230,9 @@
     "sample_created": "✅ 样本已创建！检查描述和歌词，然后点击生成音乐。",
     "simple_examples_not_found": "⚠️ 未找到简单模式示例目录。",
     "simple_examples_empty": "⚠️ 简单模式示例中没有示例文件。",
-    "simple_example_loaded": "🎲 已从 {filename} 加载随机示例"
   }
 }

     "mode_simple": "简单",
     "mode_custom": "自定义",
     "simple_query_label": "歌曲描述",
+    "simple_query_placeholder": "描述你想创作的音乐，例如：'给我生成一首暗黑的戏剧古风，歌词要华丽'。留空则随机生成样本。",
     "simple_query_info": "输入你想生成的音乐的自然语言描述",
     "simple_vocal_language_label": "人声语言(可选)",
     "simple_vocal_language_info": "选择歌词的首选语言。使用 'unknown' 表示任意语言。",
     "lyrics_placeholder": "[第一段]\\n在星空下\\n我感到如此活跃...",
     "lyrics_info": "带有结构的歌曲歌词",
     "instrumental_label": "纯音乐",
+    "format_btn": "格式化",
     "optional_params": "⚙️ 可选参数",
     "vocal_language_label": "人声语言(可选)",
     "vocal_language_info": "纯音乐使用 `unknown`",
     "use_adg_info": "启用角域引导",
     "shift_label": "Shift",
     "shift_info": "时间步偏移因子，仅对 base 模型生效 (范围 1.0~5.0，默认 3.0)。对 turbo 模型无效。",
+    "infer_method_label": "推理方法",
+    "infer_method_info": "扩散推理方法。ODE (欧拉) 更快，SDE (随机) 可能产生不同结果。",
     "cfg_interval_start": "CFG 间隔开始",
     "cfg_interval_end": "CFG 间隔结束",
     "lm_params_title": "🤖 LM 生成参数",
     "sample_created": "✅ 样本已创建！检查描述和歌词，然后点击生成音乐。",
     "simple_examples_not_found": "⚠️ 未找到简单模式示例目录。",
     "simple_examples_empty": "⚠️ 简单模式示例中没有示例文件。",
+    "simple_example_loaded": "🎲 已从 {filename} 加载随机示例",
+    "format_success": "✅ 描述和歌词格式化成功",
+    "format_failed": "❌ 格式化失败: {error}",
+    "skipping_metas_cot": "⚡ 跳过 Phase 1 元数据 COT（样本已格式化）"
   }
 }

acestep/gradio_ui/interfaces/generation.py CHANGED Viewed

@@ -314,15 +314,15 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                             placeholder=t("generation.caption_placeholder"),
                             lines=3,
                             info=t("generation.caption_info"),
-                            scale=9,
                         )
-                        sample_btn = gr.Button(
-                            "🎲",
-                            variant="secondary",
-                            size="sm",
-                            scale=1,
-                        )
                 # Lyrics - wrapped in accordion that can be collapsed in Simple mode
                 with gr.Accordion(t("generation.lyrics_title"), open=False) as lyrics_accordion:
                     lyrics = gr.Textbox(
@@ -331,22 +331,40 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                         lines=8,
                         info=t("generation.lyrics_info")
                     )
-                    instrumental_checkbox = gr.Checkbox(
-                        label=t("generation.instrumental_label"),
-                        value=False,
-                        scale=1,
-                    )
-                # Optional Parameters
-                with gr.Accordion(t("generation.optional_params"), open=False) as optional_params_accordion:
-                    with gr.Row():
                         vocal_language = gr.Dropdown(
                             choices=VALID_LANGUAGES,
                             value="unknown",
                             label=t("generation.vocal_language_label"),
                             allow_custom_value=True,
-                            info=t("generation.vocal_language_info")
                         )
                         bpm = gr.Number(
                             label=t("generation.bpm_label"),
                             value=None,
@@ -437,6 +455,12 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                     info=t("generation.shift_info"),
                     visible=False
                 )
             with gr.Row():
                 cfg_interval_start = gr.Slider(
@@ -673,12 +697,14 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
         "cfg_interval_start": cfg_interval_start,
         "cfg_interval_end": cfg_interval_end,
         "shift": shift,
         "audio_format": audio_format,
         "output_alignment_preference": output_alignment_preference,
         "think_checkbox": think_checkbox,
         "autogen_checkbox": autogen_checkbox,
         "generate_btn": generate_btn,
         "instrumental_checkbox": instrumental_checkbox,
         "constrained_decoding_debug": constrained_decoding_debug,
         "score_scale": score_scale,
         "allow_lm_batch": allow_lm_batch,

                             placeholder=t("generation.caption_placeholder"),
                             lines=3,
                             info=t("generation.caption_info"),
+                            scale=12,
                         )
+                        with gr.Column(scale=1, min_width=100):
+                            sample_btn = gr.Button(
+                                "🎲",
+                                variant="secondary",
+                                size="sm",
+                                scale=2,
+                            )
                 # Lyrics - wrapped in accordion that can be collapsed in Simple mode
                 with gr.Accordion(t("generation.lyrics_title"), open=False) as lyrics_accordion:
                     lyrics = gr.Textbox(
                         lines=8,
                         info=t("generation.lyrics_info")
                     )
+                    with gr.Row(variant="compact", equal_height=True):
+                        instrumental_checkbox = gr.Checkbox(
+                            label=t("generation.instrumental_label"),
+                            value=False,
+                            scale=1,
+                            min_width=120,
+                            container=True,
+                        )
+                        # Middle: vocal language selector (Dropdown)
+                        # Removed the gr.HTML hack; use the label parameter directly and let Gradio handle alignment
                         vocal_language = gr.Dropdown(
                             choices=VALID_LANGUAGES,
                             value="unknown",
                             label=t("generation.vocal_language_label"),
+                            show_label=False,
+                            container=True,
                             allow_custom_value=True,
+                            scale=3,
                         )
+                        # Right: format button (Button)
+                        # Placed at the far right of the same row so it is easier to reach
+                        format_btn = gr.Button(
+                            t("generation.format_btn"),
+                            variant="secondary",
+                            scale=1,
+                            min_width=80,
+                        )
+                # Optional Parameters
+                with gr.Accordion(t("generation.optional_params"), open=False) as optional_params_accordion:
+                    with gr.Row():
                         bpm = gr.Number(
                             label=t("generation.bpm_label"),
                             value=None,
                     info=t("generation.shift_info"),
                     visible=False
                 )
+                infer_method = gr.Dropdown(
+                    choices=["ode", "sde"],
+                    value="ode",
+                    label=t("generation.infer_method_label"),
+                    info=t("generation.infer_method_info"),
+                )
             with gr.Row():
                 cfg_interval_start = gr.Slider(
         "cfg_interval_start": cfg_interval_start,
         "cfg_interval_end": cfg_interval_end,
         "shift": shift,
+        "infer_method": infer_method,
         "audio_format": audio_format,
         "output_alignment_preference": output_alignment_preference,
         "think_checkbox": think_checkbox,
         "autogen_checkbox": autogen_checkbox,
         "generate_btn": generate_btn,
         "instrumental_checkbox": instrumental_checkbox,
+        "format_btn": format_btn,
         "constrained_decoding_debug": constrained_decoding_debug,
         "score_scale": score_scale,
         "allow_lm_batch": allow_lm_batch,

acestep/handler.py CHANGED Viewed

@@ -2079,6 +2079,7 @@ class AceStepHandler:
         cfg_interval_start: float = 0.0,
         cfg_interval_end: float = 1.0,
         shift: float = 1.0,
         use_tiled_decode: bool = True,
         progress=None
     ) -> Dict[str, Any]:
@@ -2227,6 +2228,7 @@ class AceStepHandler:
                 cfg_interval_start=cfg_interval_start,  # Pass CFG interval start
                 cfg_interval_end=cfg_interval_end,  # Pass CFG interval end
                 shift=shift,  # Pass shift parameter
                 audio_code_hints=audio_code_hints_batch,  # Pass audio code hints as list
                 return_intermediate=should_return_intermediate
             )

         cfg_interval_start: float = 0.0,
         cfg_interval_end: float = 1.0,
         shift: float = 1.0,
+        infer_method: str = "ode",
         use_tiled_decode: bool = True,
         progress=None
     ) -> Dict[str, Any]:
                 cfg_interval_start=cfg_interval_start,  # Pass CFG interval start
                 cfg_interval_end=cfg_interval_end,  # Pass CFG interval end
                 shift=shift,  # Pass shift parameter
+                infer_method=infer_method,  # Pass infer method (ode or sde)
                 audio_code_hints=audio_code_hints_batch,  # Pass audio code hints as list
                 return_intermediate=should_return_intermediate
             )

acestep/inference.py CHANGED Viewed

@@ -96,6 +96,7 @@ class GenerationParams:
     cfg_interval_start: float = 0.0
     cfg_interval_end: float = 1.0
     shift: float = 1.0
     repainting_start: float = 0.0
     repainting_end: float = -1
@@ -532,6 +533,7 @@ def generate_music(
             cfg_interval_start=params.cfg_interval_start,
             cfg_interval_end=params.cfg_interval_end,
             shift=params.shift,
             progress=progress,
         )
@@ -671,8 +673,6 @@ def understand_music(
     llm_handler,
     audio_codes: str,
     temperature: float = 0.85,
-    cfg_scale: float = 1.0,
-    negative_prompt: str = "NO USER INPUT",
     top_k: Optional[int] = None,
     top_p: Optional[float] = None,
     repetition_penalty: float = 1.0,
@@ -687,13 +687,13 @@ def understand_music(
     If audio_codes is empty or "NO USER INPUT", the LM will generate a sample example
     instead of analyzing existing codes.
     Args:
         llm_handler: Initialized LLM handler (LLMHandler instance)
         audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
                      Use empty string or "NO USER INPUT" to generate a sample example.
         temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
-        cfg_scale: Classifier-Free Guidance scale (1.0 = no CFG, >1.0 = use CFG)
-        negative_prompt: Negative prompt for CFG guidance
         top_k: Top-K sampling (None or 0 = disabled)
         top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
         repetition_penalty: Repetition penalty (1.0 = no penalty)
@@ -727,8 +727,6 @@ def understand_music(
         metadata, status = llm_handler.understand_audio_from_codes(
             audio_codes=audio_codes,
             temperature=temperature,
-            cfg_scale=cfg_scale,
-            negative_prompt=negative_prompt,
             top_k=top_k,
             top_p=top_p,
             repetition_penalty=repetition_penalty,
@@ -847,7 +845,7 @@ def create_sample(
     llm_handler,
     query: str,
     instrumental: bool = False,
-    vocal_language: Optional[List[str]] = None,
     temperature: float = 0.85,
     top_k: Optional[int] = None,
     top_p: Optional[float] = None,
@@ -869,9 +867,9 @@ def create_sample(
         llm_handler: Initialized LLM handler (LLMHandler instance)
         query: User's natural language music description (e.g., "a soft Bengali love song")
         instrumental: Whether to generate instrumental music (no vocals)
-        vocal_language: List of allowed vocal languages for constrained decoding (e.g., ["en", "zh"]).
-                       If provided, the model will be constrained to generate lyrics in these languages.
-                       If None or ["unknown"], no language constraint is applied.
         temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
         top_k: Top-K sampling (None or 0 = disabled)
         top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
@@ -883,7 +881,7 @@ def create_sample(
         CreateSampleResult with generated sample fields and status
     Example:
-        >>> result = create_sample(llm_handler, "a soft Bengali love song for a quiet evening", vocal_language=["bn"])
         >>> if result.success:
         ...     print(f"Caption: {result.caption}")
         ...     print(f"Lyrics: {result.lyrics}")
@@ -897,14 +895,6 @@ def create_sample(
             error="LLM not initialized",
         )
-    # Validate query
-    if not query or not query.strip():
-        return CreateSampleResult(
-            status_message="No query provided. Please enter a music description.",
-            success=False,
-            error="Empty query",
-        )
     try:
         # Call LLM to create sample
         metadata, status = llm_handler.create_sample_from_query(
@@ -982,3 +972,175 @@ def create_sample(
             success=False,
             error=str(e),
         )

     cfg_interval_start: float = 0.0
     cfg_interval_end: float = 1.0
     shift: float = 1.0
+    infer_method: str = "ode"  # "ode" or "sde" - diffusion inference method
     repainting_start: float = 0.0
     repainting_end: float = -1
             cfg_interval_start=params.cfg_interval_start,
             cfg_interval_end=params.cfg_interval_end,
             shift=params.shift,
+            infer_method=params.infer_method,
             progress=progress,
         )
     llm_handler,
     audio_codes: str,
     temperature: float = 0.85,
     top_k: Optional[int] = None,
     top_p: Optional[float] = None,
     repetition_penalty: float = 1.0,
     If audio_codes is empty or "NO USER INPUT", the LM will generate a sample example
     instead of analyzing existing codes.
+    Note: cfg_scale and negative_prompt are not supported in understand mode.
     Args:
         llm_handler: Initialized LLM handler (LLMHandler instance)
         audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
                      Use empty string or "NO USER INPUT" to generate a sample example.
         temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
         top_k: Top-K sampling (None or 0 = disabled)
         top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
         repetition_penalty: Repetition penalty (1.0 = no penalty)
         metadata, status = llm_handler.understand_audio_from_codes(
             audio_codes=audio_codes,
             temperature=temperature,
             top_k=top_k,
             top_p=top_p,
             repetition_penalty=repetition_penalty,
     llm_handler,
     query: str,
     instrumental: bool = False,
+    vocal_language: Optional[str] = None,
     temperature: float = 0.85,
     top_k: Optional[int] = None,
     top_p: Optional[float] = None,
         llm_handler: Initialized LLM handler (LLMHandler instance)
         query: User's natural language music description (e.g., "a soft Bengali love song")
         instrumental: Whether to generate instrumental music (no vocals)
+        vocal_language: Allowed vocal language for constrained decoding (e.g., "en", "zh").
+                       If provided, the model will be constrained to generate lyrics in this language.
+                       If None or "unknown", no language constraint is applied.
         temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
         top_k: Top-K sampling (None or 0 = disabled)
         top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
         CreateSampleResult with generated sample fields and status
     Example:
+        >>> result = create_sample(llm_handler, "a soft Bengali love song for a quiet evening", vocal_language="bn")
         >>> if result.success:
         ...     print(f"Caption: {result.caption}")
         ...     print(f"Lyrics: {result.lyrics}")
             error="LLM not initialized",
         )
     try:
         # Call LLM to create sample
         metadata, status = llm_handler.create_sample_from_query(
             success=False,
             error=str(e),
         )
+@dataclass
+class FormatSampleResult:
+    """Outcome of the "Format" feature, which turns a user's caption and lyrics
+    into structured music metadata plus an enhanced description.
+    Every field has a default, so a failure result can be built from the
+    status fields alone.
+    Attributes:
+        # Metadata Fields
+        caption: Enhanced/formatted music description/caption
+        lyrics: Formatted lyrics (may be same as input or reformatted)
+        bpm: Beats per minute (None if not available)
+        duration: Duration in seconds (None if not available)
+        keyscale: Musical key (e.g., "C Major"); empty string if not available
+        language: Vocal language code (e.g., "en", "zh"); empty string if not available
+        timesignature: Time signature (e.g., "4"); empty string if not available
+        # Status
+        status_message: Status message from formatting
+        success: Whether formatting completed successfully (defaults to True)
+        error: Error message if formatting failed, otherwise None
+    """
+    # Metadata Fields
+    caption: str = ""
+    lyrics: str = ""
+    bpm: Optional[int] = None
+    duration: Optional[float] = None
+    keyscale: str = ""
+    language: str = ""
+    timesignature: str = ""
+    # Status
+    status_message: str = ""
+    success: bool = True
+    error: Optional[str] = None
+    def to_dict(self) -> Dict[str, Any]:
+        """Return all fields as a plain dictionary (via asdict), e.g. for JSON serialization."""
+        return asdict(self)
+def _parse_metadata_number(value, cast, field_name):
+    """Convert a raw LM metadata value to a number.
+    Args:
+        value: Raw value from the metadata dict (number, numeric string, 'N/A', '' or None).
+        cast: Target numeric type, either int or float.
+        field_name: Metadata key, used only in the debug log message.
+    Returns:
+        The converted number, or None when the value is missing, is a placeholder,
+        or cannot be interpreted as a number.
+    """
+    if value is None or value == 'N/A' or value == '':
+        return None
+    try:
+        return cast(value)
+    except (ValueError, TypeError):
+        # Fall through to the float-based retry below.
+        pass
+    try:
+        # int() rejects numeric strings such as "120.0"; retry through float.
+        return cast(float(value))
+    except (ValueError, TypeError, OverflowError):
+        logger.debug(f"Ignoring unparsable {field_name} value from LM: {value!r}")
+        return None
+def _clean_metadata_text(value):
+    """Return a metadata text value, mapping None and the 'N/A' placeholder to ''."""
+    if value is None or value == 'N/A':
+        return ''
+    return value
+def format_sample(
+    llm_handler,
+    caption: str,
+    lyrics: str,
+    user_metadata: Optional[Dict[str, Any]] = None,
+    temperature: float = 0.85,
+    top_k: Optional[int] = None,
+    top_p: Optional[float] = None,
+    repetition_penalty: float = 1.0,
+    use_constrained_decoding: bool = True,
+    constrained_decoding_debug: bool = False,
+) -> FormatSampleResult:
+    """Format user-provided caption and lyrics using the 5Hz Language Model.
+    This function passes the user's caption and lyrics to
+    llm_handler.format_sample_from_input and converts the returned metadata
+    dict (enhanced caption, BPM, duration, key, language, time signature and
+    lyrics) into a typed FormatSampleResult.
+    If user_metadata is provided, it is forwarded to the handler so that those
+    values constrain the decoding.
+    Note: cfg_scale and negative_prompt are not supported in format mode.
+    Args:
+        llm_handler: Initialized LLM handler (LLMHandler instance)
+        caption: User's caption/description (e.g., "Latin pop, reggaeton")
+        lyrics: User's lyrics with structure tags
+        user_metadata: Optional dict with user-provided metadata to constrain decoding.
+                      Supported keys: bpm, duration, keyscale, timesignature, language
+        temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
+        top_k: Top-K sampling (None or 0 = disabled)
+        top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
+        repetition_penalty: Repetition penalty (1.0 = no penalty)
+        use_constrained_decoding: Whether to use FSM-based constrained decoding for metadata
+        constrained_decoding_debug: Whether to enable debug logging for constrained decoding
+    Returns:
+        FormatSampleResult with formatted metadata fields and status. Failures
+        (LM not initialized, empty metadata, or any exception) are reported via
+        success=False and the error field rather than raised.
+    Example:
+        >>> result = format_sample(llm_handler, "Latin pop, reggaeton", "[Verse 1]\\nHola mundo...")
+        >>> if result.success:
+        ...     print(f"Caption: {result.caption}")
+        ...     print(f"BPM: {result.bpm}")
+        ...     print(f"Lyrics: {result.lyrics}")
+    """
+    # Check if LLM is initialized
+    if not llm_handler.llm_initialized:
+        return FormatSampleResult(
+            status_message="5Hz LM not initialized. Please initialize it first.",
+            success=False,
+            error="LLM not initialized",
+        )
+    try:
+        # Call LLM formatting
+        metadata, status = llm_handler.format_sample_from_input(
+            caption=caption,
+            lyrics=lyrics,
+            user_metadata=user_metadata,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            repetition_penalty=repetition_penalty,
+            use_constrained_decoding=use_constrained_decoding,
+            constrained_decoding_debug=constrained_decoding_debug,
+        )
+        # Check if LLM returned empty metadata (error case)
+        if not metadata:
+            return FormatSampleResult(
+                status_message=status or "Failed to format input",
+                success=False,
+                error=status or "Empty metadata returned",
+            )
+        # Fall back to the 'vocal_language' key when 'language' is absent
+        language = metadata.get('language', metadata.get('vocal_language', ''))
+        return FormatSampleResult(
+            caption=metadata.get('caption', ''),
+            lyrics=metadata.get('lyrics', lyrics),  # Fall back to input lyrics
+            bpm=_parse_metadata_number(metadata.get('bpm'), int, 'bpm'),
+            duration=_parse_metadata_number(metadata.get('duration'), float, 'duration'),
+            keyscale=_clean_metadata_text(metadata.get('keyscale', '')),
+            language=_clean_metadata_text(language),
+            timesignature=_clean_metadata_text(metadata.get('timesignature', '')),
+            status_message=status,
+            success=True,
+            error=None,
+        )
+    except Exception as e:
+        # Boundary for UI/API callers: report the failure instead of raising
+        logger.exception("Format sample failed")
+        return FormatSampleResult(
+            status_message=f"Error: {str(e)}",
+            success=False,
+            error=str(e),
+        )

acestep/llm_inference.py CHANGED Viewed

@@ -19,7 +19,7 @@ from transformers.generation.logits_process import (
     RepetitionPenaltyLogitsProcessor,
 )
 from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor
-from acestep.constants import DEFAULT_LM_INSTRUCTION, DEFAULT_LM_UNDERSTAND_INSTRUCTION, DEFAULT_LM_INSPIRED_INSTRUCTION
 class LLMHandler:
@@ -1296,8 +1296,6 @@ class LLMHandler:
         self,
         audio_codes: str,
         temperature: float = 0.3,
-        cfg_scale: float = 1.0,
-        negative_prompt: str = "NO USER INPUT",
         top_k: Optional[int] = None,
         top_p: Optional[float] = None,
         repetition_penalty: float = 1.0,
@@ -1306,16 +1304,16 @@ class LLMHandler:
     ) -> Tuple[Dict[str, Any], str]:
         """
         Understand audio codes and generate metadata + lyrics.
         This is the reverse of the normal generation flow:
         - Input: Audio codes
         - Output: Metadata (bpm, caption, duration, etc.) + Lyrics
         Args:
             audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
             temperature: Sampling temperature for generation
-            cfg_scale: Classifier-Free Guidance scale (1.0 = no CFG, >1.0 = use CFG)
-            negative_prompt: Negative prompt for CFG
             top_k: Top-K sampling (None = disabled)
             top_p: Top-P (nucleus) sampling (None = disabled)
             repetition_penalty: Repetition penalty (1.0 = no penalty)
@@ -1352,12 +1350,11 @@ class LLMHandler:
         print(f"formatted_prompt: {formatted_prompt}")
         # Generate using constrained decoding (understand phase)
         # We want to generate metadata first (CoT), then lyrics (natural text)
         output_text, status = self.generate_from_formatted_prompt(
             formatted_prompt=formatted_prompt,
             cfg={
                 "temperature": temperature,
-                "cfg_scale": cfg_scale,
-                "negative_prompt": negative_prompt,
                 "top_k": top_k,
                 "top_p": top_p,
                 "repetition_penalty": repetition_penalty,
@@ -1491,7 +1488,7 @@ class LLMHandler:
         self,
         query: str,
         instrumental: bool = False,
-        vocal_language: Optional[List[str]] = None,
         temperature: float = 0.85,
         top_k: Optional[int] = None,
         top_p: Optional[float] = None,
@@ -1509,8 +1506,8 @@ class LLMHandler:
         Args:
             query: User's natural language music description
             instrumental: Whether to generate instrumental music (no vocals)
-            vocal_language: List of allowed vocal languages for constrained decoding (e.g., ["en", "zh"]).
-                           If provided and not ["unknown"], the first language will be used.
             temperature: Sampling temperature for generation (0.0-2.0)
             top_k: Top-K sampling (None = disabled)
             top_p: Top-P (nucleus) sampling (None = disabled)
@@ -1532,7 +1529,7 @@ class LLMHandler:
         Example:
             query = "a soft Bengali love song for a quiet evening"
-            metadata, status = handler.create_sample_from_query(query, instrumental=False, vocal_language=["bn"])
             print(metadata['caption'])  # "A gentle romantic acoustic pop ballad..."
             print(metadata['lyrics'])   # "[Intro: ...]\\n..."
         """
@@ -1540,7 +1537,7 @@ class LLMHandler:
             return {}, "❌ 5Hz LM not initialized. Please initialize it first."
         if not query or not query.strip():
-            return {}, "❌ No query provided. Please enter a music description."
         logger.info(f"Creating sample from query: {query[:100]}... (instrumental={instrumental}, vocal_language={vocal_language})")
@@ -1554,14 +1551,11 @@ class LLMHandler:
         # Build user_metadata if vocal_language is specified and is not "unknown"
         user_metadata = None
         skip_language = False
-        if vocal_language and len(vocal_language) > 0:
-            # Filter out "unknown" from the list
-            valid_languages = [lang for lang in vocal_language if lang and lang.lower() != "unknown"]
-            if valid_languages:
-                # Use the first valid language for constrained decoding
-                user_metadata = {"language": valid_languages[0]}
-                skip_language = True  # Skip language generation since we're injecting it
-                logger.info(f"Using user-specified language: {valid_languages[0]}")
         # Generate using constrained decoding (inspiration phase)
         # Similar to understand mode - generate metadata first (CoT), then lyrics
@@ -1576,7 +1570,7 @@ class LLMHandler:
                 "target_duration": None,  # No duration constraint
                 "user_metadata": user_metadata,  # Inject language if specified
                 "skip_caption": False,  # Generate caption
-                "skip_language": skip_language,  # Skip if we're injecting language
                 "skip_genres": False,  # Generate genres
                 "generation_phase": "understand",  # Use understand phase for metadata + free-form lyrics
                 "caption": "",
@@ -1604,12 +1598,210 @@ class LLMHandler:
         # Echo back the instrumental flag
         metadata['instrumental'] = instrumental
-        logger.info(f"Sample created successfully. Generated {len(metadata)} fields")
         if constrained_decoding_debug:
             logger.debug(f"Generated metadata: {list(metadata.keys())}")
             logger.debug(f"Output text preview: {output_text[:300]}...")
-        status_msg = f"✅ Sample created successfully\nGenerated fields: {', '.join(metadata.keys())}"
         return metadata, status_msg
     def generate_from_formatted_prompt(

     RepetitionPenaltyLogitsProcessor,
 )
 from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor
+from acestep.constants import DEFAULT_LM_INSTRUCTION, DEFAULT_LM_UNDERSTAND_INSTRUCTION, DEFAULT_LM_INSPIRED_INSTRUCTION, DEFAULT_LM_REWRITE_INSTRUCTION
 class LLMHandler:
         self,
         audio_codes: str,
         temperature: float = 0.3,
         top_k: Optional[int] = None,
         top_p: Optional[float] = None,
         repetition_penalty: float = 1.0,
     ) -> Tuple[Dict[str, Any], str]:
         """
         Understand audio codes and generate metadata + lyrics.
         This is the reverse of the normal generation flow:
         - Input: Audio codes
         - Output: Metadata (bpm, caption, duration, etc.) + Lyrics
+        Note: cfg_scale and negative_prompt are not supported in understand mode.
         Args:
             audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
             temperature: Sampling temperature for generation
             top_k: Top-K sampling (None = disabled)
             top_p: Top-P (nucleus) sampling (None = disabled)
             repetition_penalty: Repetition penalty (1.0 = no penalty)
         print(f"formatted_prompt: {formatted_prompt}")
         # Generate using constrained decoding (understand phase)
         # We want to generate metadata first (CoT), then lyrics (natural text)
+        # Note: cfg_scale and negative_prompt are not used in understand mode
         output_text, status = self.generate_from_formatted_prompt(
             formatted_prompt=formatted_prompt,
             cfg={
                 "temperature": temperature,
                 "top_k": top_k,
                 "top_p": top_p,
                 "repetition_penalty": repetition_penalty,
         self,
         query: str,
         instrumental: bool = False,
+        vocal_language: Optional[str] = None,
         temperature: float = 0.85,
         top_k: Optional[int] = None,
         top_p: Optional[float] = None,
         Args:
             query: User's natural language music description
             instrumental: Whether to generate instrumental music (no vocals)
+            vocal_language: Allowed vocal language for constrained decoding (e.g., "en", "zh").
+                           If provided and not "unknown", it will be used.
             temperature: Sampling temperature for generation (0.0-2.0)
             top_k: Top-K sampling (None = disabled)
             top_p: Top-P (nucleus) sampling (None = disabled)
         Example:
             query = "a soft Bengali love song for a quiet evening"
+            metadata, status = handler.create_sample_from_query(query, instrumental=False, vocal_language="bn")
             print(metadata['caption'])  # "A gentle romantic acoustic pop ballad..."
             print(metadata['lyrics'])   # "[Intro: ...]\\n..."
         """
             return {}, "❌ 5Hz LM not initialized. Please initialize it first."
         if not query or not query.strip():
+            query = "NO USER INPUT"
         logger.info(f"Creating sample from query: {query[:100]}... (instrumental={instrumental}, vocal_language={vocal_language})")
         # Build user_metadata if vocal_language is specified and is not "unknown"
         user_metadata = None
         skip_language = False
+        if vocal_language and vocal_language.strip() and vocal_language.strip().lower() != "unknown":
+            # Use the specified language for constrained decoding
+            user_metadata = {"language": vocal_language.strip()}
+            # skip_language = True  # Skip language generation since we're injecting it
+            logger.info(f"Using user-specified language: {vocal_language.strip()}")
         # Generate using constrained decoding (inspiration phase)
         # Similar to understand mode - generate metadata first (CoT), then lyrics
                 "target_duration": None,  # No duration constraint
                 "user_metadata": user_metadata,  # Inject language if specified
                 "skip_caption": False,  # Generate caption
+                "skip_language": False,
                 "skip_genres": False,  # Generate genres
                 "generation_phase": "understand",  # Use understand phase for metadata + free-form lyrics
                 "caption": "",
         # Echo back the instrumental flag
         metadata['instrumental'] = instrumental
+        logger.info(f"Sample created successfully. Generated {metadata} fields")
+        if constrained_decoding_debug:
+            logger.debug(f"Generated metadata: {list(metadata.keys())}")
+            logger.debug(f"Output text preview: {output_text[:300]}...")
+        status_msg = f"✅ Sample created successfully\nGenerated fields: {metadata}"
+        return metadata, status_msg
+    def build_formatted_prompt_for_format(
+        self,
+        caption: str,
+        lyrics: str,
+        is_negative_prompt: bool = False,
+        negative_prompt: str = "NO USER INPUT"
+    ) -> str:
+        """
+        Render the chat-template prompt used by format/rewrite mode.
+        The system message carries the rewrite instruction; the user message
+        carries either the caption and lyrics (normal prompt) or the negative
+        prompt (unconditional prompt for CFG).
+        Args:
+            caption: User's caption/description of the music
+            lyrics: User's lyrics
+            is_negative_prompt: If True, builds the unconditional prompt for CFG
+            negative_prompt: User content for the unconditional prompt; an empty or
+                            whitespace-only value yields empty user content
+        Returns:
+            Prompt string produced by the tokenizer's chat template (not tokenized,
+            with the generation prompt appended)
+        Raises:
+            ValueError: If the LLM tokenizer has not been initialized
+        Example:
+            caption = "Latin pop, reggaeton, flamenco-pop"
+            lyrics = "[Verse 1]\\nTengo un nudo..."
+            prompt = handler.build_formatted_prompt_for_format(caption, lyrics)
+        """
+        tokenizer = self.llm_tokenizer
+        if tokenizer is None:
+            raise ValueError("LLM tokenizer is not initialized. Call initialize() first.")
+        if not is_negative_prompt:
+            # Normal prompt: caption and lyrics under their own headings
+            user_text = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}"
+        elif negative_prompt and negative_prompt.strip():
+            # Unconditional prompt for CFG: the negative prompt as given
+            user_text = negative_prompt
+        else:
+            user_text = ""
+        system_message = {
+            "role": "system",
+            "content": f"# Instruction\n{DEFAULT_LM_REWRITE_INSTRUCTION}\n\n",
+        }
+        user_message = {"role": "user", "content": user_text}
+        return tokenizer.apply_chat_template(
+            [system_message, user_message],
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+    def format_sample_from_input(
+        self,
+        caption: str,
+        lyrics: str,
+        user_metadata: Optional[Dict[str, Any]] = None,
+        temperature: float = 0.85,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+        repetition_penalty: float = 1.0,
+        use_constrained_decoding: bool = True,
+        constrained_decoding_debug: bool = False,
+    ) -> Tuple[Dict[str, Any], str]:
+        """
+        Format user-provided caption and lyrics into structured music metadata.
+        This is the "Format" feature that takes user input and generates:
+        - Enhanced caption with detailed music description
+        - Metadata (bpm, duration, keyscale, language, timesignature)
+        - Lyrics taken from the model output, or the input lyrics when the
+          output contains none
+        An empty caption is replaced with "NO USER INPUT" and empty lyrics with
+        "[Instrumental]" before the prompt is built.
+        Note: cfg_scale and negative_prompt are not supported in format mode.
+        Args:
+            caption: User's caption/description (e.g., "Latin pop, reggaeton")
+            lyrics: User's lyrics with structure tags
+            user_metadata: Optional dict with user-provided metadata to constrain decoding.
+                          Supported keys: bpm, duration, keyscale, timesignature, language.
+                          bpm and duration are converted with int() and kept only when
+                          positive; values that cannot be converted are ignored.
+            temperature: Sampling temperature for generation (0.0-2.0)
+            top_k: Top-K sampling (None = disabled)
+            top_p: Top-P (nucleus) sampling (None = disabled)
+            repetition_penalty: Repetition penalty (1.0 = no penalty)
+            use_constrained_decoding: Whether to use FSM-based constrained decoding
+            constrained_decoding_debug: Whether to enable debug logging
+        Returns:
+            Tuple of (metadata_dict, status_message). The dict is empty when the LM
+            is not initialized or generation produced no output text.
+            metadata_dict contains:
+                - bpm: int or str
+                - caption: str (enhanced)
+                - duration: int or str
+                - keyscale: str
+                - language: str
+                - timesignature: str
+                - lyrics: str (from the model output, or the input as fallback)
+        Example:
+            caption = "Latin pop, reggaeton, flamenco-pop"
+            lyrics = "[Verse 1]\\nTengo un nudo en la garganta..."
+            metadata, status = handler.format_sample_from_input(caption, lyrics)
+            print(metadata['caption'])  # "A dramatic and powerful Latin pop track..."
+            print(metadata['bpm'])      # 100
+        """
+        if not getattr(self, "llm_initialized", False):
+            return {}, "❌ 5Hz LM not initialized. Please initialize it first."
+        # Substitute placeholders so the prompt always has a caption and a lyric section
+        if not caption or not caption.strip():
+            caption = "NO USER INPUT"
+        if not lyrics or not lyrics.strip():
+            lyrics = "[Instrumental]"
+        logger.info(f"Formatting sample from input: caption={caption[:50]}..., lyrics length={len(lyrics)}")
+        # Build formatted prompt for format task
+        formatted_prompt = self.build_formatted_prompt_for_format(
+            caption=caption,
+            lyrics=lyrics,
+        )
+        logger.debug(f"Formatted prompt for format: {formatted_prompt}")
+        # Build constrained decoding metadata from user_metadata
+        constrained_metadata = None
+        if user_metadata:
+            constrained_metadata = {}
+            # bpm/duration: keep only positive integers; unparsable values are dropped
+            if user_metadata.get('bpm') is not None:
+                try:
+                    bpm_val = int(user_metadata['bpm'])
+                    if bpm_val > 0:
+                        constrained_metadata['bpm'] = bpm_val
+                except (ValueError, TypeError):
+                    pass
+            if user_metadata.get('duration') is not None:
+                try:
+                    dur_val = int(user_metadata['duration'])
+                    if dur_val > 0:
+                        constrained_metadata['duration'] = dur_val
+                except (ValueError, TypeError):
+                    pass
+            # Text fields are passed through unchanged when truthy
+            if user_metadata.get('keyscale'):
+                constrained_metadata['keyscale'] = user_metadata['keyscale']
+            if user_metadata.get('timesignature'):
+                constrained_metadata['timesignature'] = user_metadata['timesignature']
+            if user_metadata.get('language'):
+                constrained_metadata['language'] = user_metadata['language']
+            # Only use if we have at least one field
+            if not constrained_metadata:
+                constrained_metadata = None
+            else:
+                logger.info(f"Using user-provided metadata constraints: {constrained_metadata}")
+        # Generate using constrained decoding (format phase)
+        # Similar to understand/inspiration mode - generate metadata first (CoT), then formatted lyrics
+        # Note: cfg_scale and negative_prompt are not used in format mode
+        output_text, status = self.generate_from_formatted_prompt(
+            formatted_prompt=formatted_prompt,
+            cfg={
+                "temperature": temperature,
+                "top_k": top_k,
+                "top_p": top_p,
+                "repetition_penalty": repetition_penalty,
+                "target_duration": None,  # No duration constraint for generation length
+                "user_metadata": constrained_metadata,  # Inject user-provided metadata
+                "skip_caption": False,  # Generate caption
+                # Skip language generation only when the user supplied a language
+                "skip_language": constrained_metadata.get('language') is not None if constrained_metadata else False,
+                "skip_genres": False,  # Generate genres
+                "generation_phase": "understand",  # Use understand phase for metadata + free-form lyrics
+                "caption": "",
+                "lyrics": "",
+            },
+            use_constrained_decoding=use_constrained_decoding,
+            constrained_decoding_debug=constrained_decoding_debug,
+            stop_at_reasoning=False,  # Continue after </think> to get formatted lyrics
+        )
+        # No output text: return the generator's status as the error message
+        if not output_text:
+            return {}, status
+        # Parse metadata and extract lyrics
+        metadata, _ = self.parse_lm_output(output_text)
+        # Extract formatted lyrics section (everything after </think>)
+        formatted_lyrics = self._extract_lyrics_from_output(output_text)
+        if formatted_lyrics:
+            metadata['lyrics'] = formatted_lyrics
+        else:
+            # If no lyrics generated, keep original input
+            metadata['lyrics'] = lyrics
+        logger.info(f"Format completed successfully. Generated {len(metadata)} fields")
         if constrained_decoding_debug:
             logger.debug(f"Generated metadata: {list(metadata.keys())}")
             logger.debug(f"Output text preview: {output_text[:300]}...")
+        status_msg = f"✅ Format completed successfully\nGenerated fields: {', '.join(metadata.keys())}"
         return metadata, status_msg
     def generate_from_formatted_prompt(

examples/simple_mode/example_01.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
     "description": "a soft Bengali love song for a quiet evening",
     "instrumental": false,
-    "vocal_language": ["bn"]
 }

 {
     "description": "a soft Bengali love song for a quiet evening",
     "instrumental": false,
+    "vocal_language": "bn"
 }

examples/simple_mode/example_02.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
     "description": "an upbeat summer pop song with catchy hooks",
     "instrumental": false,
-    "vocal_language": ["en"]
 }

 {
     "description": "an upbeat summer pop song with catchy hooks",
     "instrumental": false,
+    "vocal_language": "en"
 }

examples/simple_mode/example_03.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
     "description": "epic orchestral cinematic music for a movie trailer",
     "instrumental": true,
-    "vocal_language": ["unknown"]
 }

 {
     "description": "epic orchestral cinematic music for a movie trailer",
     "instrumental": true,
+    "vocal_language": "unknown"
 }

examples/simple_mode/example_04.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
     "description": "一首深情的中文抒情歌曲，适合夜晚独自聆听",
     "instrumental": false,
-    "vocal_language": ["zh"]
 }

 {
     "description": "一首深情的中文抒情歌曲，适合夜晚独自聆听",
     "instrumental": false,
+    "vocal_language": "zh"
 }

examples/simple_mode/example_05.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
     "description": "Japanese city pop with nostalgic 80s vibes",
     "instrumental": false,
-    "vocal_language": ["ja"]
 }

 {
     "description": "Japanese city pop with nostalgic 80s vibes",
     "instrumental": false,
+    "vocal_language": "ja"
 }

examples/simple_mode/example_06.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
     "description": "lo-fi hip hop beats for studying and relaxing",
     "instrumental": true,
-    "vocal_language": ["unknown"]
 }

 {
     "description": "lo-fi hip hop beats for studying and relaxing",
     "instrumental": true,
+    "vocal_language": "unknown"
 }

examples/simple_mode/example_07.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
     "description": "energetic K-pop dance track with powerful vocals",
     "instrumental": false,
-    "vocal_language": ["ko"]
 }

 {
     "description": "energetic K-pop dance track with powerful vocals",
     "instrumental": false,
+    "vocal_language": "ko"
 }

examples/simple_mode/example_08.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
     "description": "romantic Spanish guitar ballad with heartfelt lyrics",
     "instrumental": false,
-    "vocal_language": ["es"]
 }

 {
     "description": "romantic Spanish guitar ballad with heartfelt lyrics",
     "instrumental": false,
+    "vocal_language": "es"
 }

examples/simple_mode/example_09.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
     "description": "中国风电子舞曲，融合古典乐器与现代节拍",
     "instrumental": false,
-    "vocal_language": ["zh"]
 }

 {
     "description": "中国风电子舞曲，融合古典乐器与现代节拍",
     "instrumental": false,
+    "vocal_language": "zh"
 }

examples/simple_mode/example_10.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
     "description": "peaceful piano melody for meditation and relaxation",
     "instrumental": true,
-    "vocal_language": ["unknown"]
 }

 {
     "description": "peaceful piano melody for meditation and relaxation",
     "instrumental": true,
+    "vocal_language": "unknown"
 }