Spaces:

ACE-Step
/

Ace-Step-v1.5

Running on Zero

App Files Files Community

ChuxiJ committed on Jan 8

Commit

1da0418

1 Parent(s): 134e9c0

Refactor UI and add i18n

Browse files

Files changed (17) hide show

acestep/acestep_v15_pipeline.py +51 -7
acestep/constants.py +9 -0
acestep/gradio_ui/__init__.py +1 -0
acestep/{gradio_ui.py → gradio_ui/event.py} +0 -0
acestep/gradio_ui/events/__init__.py +622 -0
acestep/gradio_ui/events/generation_handlers.py +619 -0
acestep/gradio_ui/events/results_handlers.py +1381 -0
acestep/gradio_ui/i18n.py +152 -0
acestep/gradio_ui/i18n/en.json +209 -0
acestep/gradio_ui/i18n/ja.json +209 -0
acestep/gradio_ui/i18n/zh.json +209 -0
acestep/gradio_ui/interfaces/__init__.py +82 -0
acestep/gradio_ui/interfaces/dataset.py +101 -0
acestep/gradio_ui/interfaces/generation.py +683 -0
acestep/gradio_ui/interfaces/result.py +341 -0
acestep/handler.py +7 -12
acestep/test_time_scaling.py +96 -5

acestep/acestep_v15_pipeline.py CHANGED Viewed

@@ -26,7 +26,7 @@ except ImportError:
     from acestep.gradio_ui import create_gradio_interface
-def create_demo(init_params=None):
     """
     Create Gradio demo interface
@@ -36,7 +36,9 @@ def create_demo(init_params=None):
                     Keys: 'pre_initialized' (bool), 'checkpoint', 'config_path', 'device',
                           'init_llm', 'lm_model_path', 'backend', 'use_flash_attention',
                           'offload_to_cpu', 'offload_dit_to_cpu', 'init_status',
-                          'dit_handler', 'llm_handler' (initialized handlers if pre-initialized)
     Returns:
         Gradio Blocks instance
@@ -52,20 +54,52 @@ def create_demo(init_params=None):
     dataset_handler = DatasetHandler()  # Dataset handler
     # Create Gradio interface with all handlers and initialization parameters
-    demo = create_gradio_interface(dit_handler, llm_handler, dataset_handler, init_params=init_params)
     return demo
 def main():
     """Main entry function"""
     import argparse
     parser = argparse.ArgumentParser(description="Gradio Demo for ACE-Step V1.5")
     parser.add_argument("--port", type=int, default=7860, help="Port to run the gradio server on")
     parser.add_argument("--share", action="store_true", help="Create a public link")
     parser.add_argument("--debug", action="store_true", help="Enable debug mode")
     parser.add_argument("--server-name", type=str, default="127.0.0.1", help="Server name (default: 127.0.0.1, use 0.0.0.0 for all interfaces)")
     # Service initialization arguments
     parser.add_argument("--init_service", type=lambda x: x.lower() in ['true', '1', 'yes'], default=False, help="Initialize service on startup (default: False)")
@@ -76,7 +110,7 @@ def main():
     parser.add_argument("--lm_model_path", type=str, default=None, help="5Hz LM model path (e.g., 'acestep-5Hz-lm-0.6B')")
     parser.add_argument("--backend", type=str, default="vllm", choices=["vllm", "pt"], help="5Hz LM backend (default: vllm)")
     parser.add_argument("--use_flash_attention", type=lambda x: x.lower() in ['true', '1', 'yes'], default=None, help="Use flash attention (default: auto-detect)")
-    parser.add_argument("--offload_to_cpu", type=lambda x: x.lower() in ['true', '1', 'yes'], default=False, help="Offload models to CPU (default: False)")
     parser.add_argument("--offload_dit_to_cpu", type=lambda x: x.lower() in ['true', '1', 'yes'], default=False, help="Offload DiT to CPU (default: False)")
     args = parser.parse_args()
@@ -176,14 +210,24 @@ def main():
                 'init_status': init_status,
                 'enable_generate': enable_generate,
                 'dit_handler': dit_handler,
-                'llm_handler': llm_handler
             }
             print("Service initialization completed successfully!")
         # Create and launch demo
-        print("Creating Gradio interface...")
-        demo = create_demo(init_params=init_params)
         print(f"Launching server on {args.server_name}:{args.port}...")
         demo.launch(
             server_name=args.server_name,

     from acestep.gradio_ui import create_gradio_interface
+def create_demo(init_params=None, language='en'):
     """
     Create Gradio demo interface
                     Keys: 'pre_initialized' (bool), 'checkpoint', 'config_path', 'device',
                           'init_llm', 'lm_model_path', 'backend', 'use_flash_attention',
                           'offload_to_cpu', 'offload_dit_to_cpu', 'init_status',
+                          'dit_handler', 'llm_handler' (initialized handlers if pre-initialized),
+                          'language' (UI language code)
+        language: UI language code ('en', 'zh', 'ja', default: 'en')
     Returns:
         Gradio Blocks instance
     dataset_handler = DatasetHandler()  # Dataset handler
     # Create Gradio interface with all handlers and initialization parameters
+    demo = create_gradio_interface(dit_handler, llm_handler, dataset_handler, init_params=init_params, language=language)
     return demo
+def get_gpu_memory_gb():
+    """
+    Return the total memory of the first CUDA device in GB.
+
+    torch is imported inside the function, so a missing or broken torch
+    installation is reported as a warning rather than failing at import time.
+
+    Returns:
+        float: total memory of CUDA device 0 (bytes / 1024**3), or 0.0 when
+        CUDA is unavailable or detection fails for any reason.
+    """
+    # Imported locally so the warning path below never depends on a
+    # module-level `import sys` being present.
+    import sys
+    try:
+        import torch
+        if not torch.cuda.is_available():
+            return 0.0
+        # Only device 0 is inspected; additional GPUs are not considered.
+        total_memory = torch.cuda.get_device_properties(0).total_memory
+        return total_memory / (1024**3)  # Convert bytes to GB
+    except Exception as e:
+        # Best-effort probe: any failure is treated as "no GPU detected".
+        print(f"Warning: Failed to detect GPU memory: {e}", file=sys.stderr)
+        return 0.0
 def main():
     """Main entry function"""
     import argparse
+    # Detect GPU memory to auto-configure offload settings
+    gpu_memory_gb = get_gpu_memory_gb()
+    auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < 16
+    if auto_offload:
+        print(f"Detected GPU memory: {gpu_memory_gb:.2f} GB (< 16GB)")
+        print("Auto-enabling CPU offload to reduce GPU memory usage")
+    elif gpu_memory_gb > 0:
+        print(f"Detected GPU memory: {gpu_memory_gb:.2f} GB (>= 16GB)")
+        print("CPU offload disabled by default")
+    else:
+        print("No GPU detected, running on CPU")
     parser = argparse.ArgumentParser(description="Gradio Demo for ACE-Step V1.5")
     parser.add_argument("--port", type=int, default=7860, help="Port to run the gradio server on")
     parser.add_argument("--share", action="store_true", help="Create a public link")
     parser.add_argument("--debug", action="store_true", help="Enable debug mode")
     parser.add_argument("--server-name", type=str, default="127.0.0.1", help="Server name (default: 127.0.0.1, use 0.0.0.0 for all interfaces)")
+    parser.add_argument("--language", type=str, default="en", choices=["en", "zh", "ja"], help="UI language: en (English), zh (中文), ja (日本語)")
     # Service initialization arguments
     parser.add_argument("--init_service", type=lambda x: x.lower() in ['true', '1', 'yes'], default=False, help="Initialize service on startup (default: False)")
     parser.add_argument("--lm_model_path", type=str, default=None, help="5Hz LM model path (e.g., 'acestep-5Hz-lm-0.6B')")
     parser.add_argument("--backend", type=str, default="vllm", choices=["vllm", "pt"], help="5Hz LM backend (default: vllm)")
     parser.add_argument("--use_flash_attention", type=lambda x: x.lower() in ['true', '1', 'yes'], default=None, help="Use flash attention (default: auto-detect)")
+    parser.add_argument("--offload_to_cpu", type=lambda x: x.lower() in ['true', '1', 'yes'], default=auto_offload, help=f"Offload models to CPU (default: {'True' if auto_offload else 'False'}, auto-detected based on GPU VRAM)")
     parser.add_argument("--offload_dit_to_cpu", type=lambda x: x.lower() in ['true', '1', 'yes'], default=False, help="Offload DiT to CPU (default: False)")
     args = parser.parse_args()
                 'init_status': init_status,
                 'enable_generate': enable_generate,
                 'dit_handler': dit_handler,
+                'llm_handler': llm_handler,
+                'language': args.language
             }
             print("Service initialization completed successfully!")
         # Create and launch demo
+        print(f"Creating Gradio interface with language: {args.language}...")
+        demo = create_demo(init_params=init_params, language=args.language)
+        # Enable queue for multi-user support
+        # This ensures proper request queuing and prevents concurrent generation conflicts
+        print("Enabling queue for multi-user support...")
+        demo.queue(
+            max_size=20,  # Maximum queue size (adjust based on your needs)
+            status_update_rate="auto",  # Update rate for queue status
+        )
         print(f"Launching server on {args.server_name}:{args.port}...")
         demo.launch(
             server_name=args.server_name,

acestep/constants.py CHANGED Viewed

@@ -96,3 +96,12 @@ TRACK_NAMES = [
     "keyboard", "guitar", "bass", "drums", "backing_vocals", "vocals"
 ]

     "keyboard", "guitar", "bass", "drums", "backing_vocals", "vocals"
 ]
+# Prompt template with three positional `{}` placeholders, one under each of
+# the "# Instruction", "# Caption" and "# Metas" headings. The text ends with
+# the <|endoftext|> marker followed by a newline.
+SFT_GEN_PROMPT = """# Instruction
+{}
+# Caption
+{}
+# Metas
+{}<|endoftext|>
+"""

acestep/gradio_ui/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from acestep.gradio_ui.interfaces import create_gradio_interface

acestep/{gradio_ui.py → gradio_ui/event.py} RENAMED Viewed

The diff for this file is too large to render. See raw diff

acestep/gradio_ui/events/__init__.py ADDED Viewed

	@@ -0,0 +1,622 @@

+"""
+Gradio UI Event Handlers Module
+Main entry point for setting up all event handlers
+"""
+import gradio as gr
+from typing import Optional
+# Import handler modules
+from . import generation_handlers as gen_h
+from . import results_handlers as res_h
+from acestep.gradio_ui.i18n import t
+def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, dataset_section, generation_section, results_section):
+    """Setup event handlers connecting UI components and business logic"""
+    # ========== Dataset Handlers ==========
+    dataset_section["import_dataset_btn"].click(
+        fn=dataset_handler.import_dataset,
+        inputs=[dataset_section["dataset_type"]],
+        outputs=[dataset_section["data_status"]]
+    )
+    # ========== Service Initialization ==========
+    generation_section["refresh_btn"].click(
+        fn=lambda: gen_h.refresh_checkpoints(dit_handler),
+        outputs=[generation_section["checkpoint_dropdown"]]
+    )
+    generation_section["config_path"].change(
+        fn=gen_h.update_model_type_settings,
+        inputs=[generation_section["config_path"]],
+        outputs=[
+            generation_section["inference_steps"],
+            generation_section["guidance_scale"],
+            generation_section["use_adg"],
+            generation_section["cfg_interval_start"],
+            generation_section["cfg_interval_end"],
+            generation_section["task_type"],
+        ]
+    )
+    generation_section["init_btn"].click(
+        fn=lambda *args: gen_h.init_service_wrapper(dit_handler, llm_handler, *args),
+        inputs=[
+            generation_section["checkpoint_dropdown"],
+            generation_section["config_path"],
+            generation_section["device"],
+            generation_section["init_llm_checkbox"],
+            generation_section["lm_model_path"],
+            generation_section["backend_dropdown"],
+            generation_section["use_flash_attention_checkbox"],
+            generation_section["offload_to_cpu_checkbox"],
+            generation_section["offload_dit_to_cpu_checkbox"],
+        ],
+        outputs=[generation_section["init_status"], generation_section["generate_btn"], generation_section["service_config_accordion"]]
+    )
+    # ========== UI Visibility Updates ==========
+    generation_section["init_llm_checkbox"].change(
+        fn=gen_h.update_negative_prompt_visibility,
+        inputs=[generation_section["init_llm_checkbox"]],
+        outputs=[generation_section["lm_negative_prompt"]]
+    )
+    generation_section["init_llm_checkbox"].change(
+        fn=gen_h.update_audio_cover_strength_visibility,
+        inputs=[generation_section["task_type"], generation_section["init_llm_checkbox"]],
+        outputs=[generation_section["audio_cover_strength"]]
+    )
+    generation_section["task_type"].change(
+        fn=gen_h.update_audio_cover_strength_visibility,
+        inputs=[generation_section["task_type"], generation_section["init_llm_checkbox"]],
+        outputs=[generation_section["audio_cover_strength"]]
+    )
+    generation_section["batch_size_input"].change(
+        fn=gen_h.update_audio_components_visibility,
+        inputs=[generation_section["batch_size_input"]],
+        outputs=[
+            results_section["audio_col_1"],
+            results_section["audio_col_2"],
+            results_section["audio_col_3"],
+            results_section["audio_col_4"],
+            results_section["audio_row_5_8"],
+            results_section["audio_col_5"],
+            results_section["audio_col_6"],
+            results_section["audio_col_7"],
+            results_section["audio_col_8"],
+        ]
+    )
+    # Update codes hints visibility
+    for trigger in [generation_section["src_audio"], generation_section["allow_lm_batch"], generation_section["batch_size_input"]]:
+        trigger.change(
+            fn=gen_h.update_codes_hints_visibility,
+            inputs=[
+                generation_section["src_audio"],
+                generation_section["allow_lm_batch"],
+                generation_section["batch_size_input"]
+            ],
+            outputs=[
+                generation_section["codes_single_row"],
+                generation_section["codes_batch_row"],
+                generation_section["codes_batch_row_2"],
+                generation_section["codes_col_1"],
+                generation_section["codes_col_2"],
+                generation_section["codes_col_3"],
+                generation_section["codes_col_4"],
+                generation_section["codes_col_5"],
+                generation_section["codes_col_6"],
+                generation_section["codes_col_7"],
+                generation_section["codes_col_8"],
+                generation_section["transcribe_btn"],
+            ]
+        )
+    # ========== Audio Conversion ==========
+    generation_section["convert_src_to_codes_btn"].click(
+        fn=lambda src: gen_h.convert_src_audio_to_codes_wrapper(dit_handler, src),
+        inputs=[generation_section["src_audio"]],
+        outputs=[generation_section["text2music_audio_code_string"]]
+    )
+    # ========== Instruction UI Updates ==========
+    for trigger in [generation_section["task_type"], generation_section["track_name"], generation_section["complete_track_classes"]]:
+        trigger.change(
+            fn=lambda *args: gen_h.update_instruction_ui(dit_handler, *args),
+            inputs=[
+                generation_section["task_type"],
+                generation_section["track_name"],
+                generation_section["complete_track_classes"],
+                generation_section["text2music_audio_code_string"],
+                generation_section["init_llm_checkbox"]
+            ],
+            outputs=[
+                generation_section["instruction_display_gen"],
+                generation_section["track_name"],
+                generation_section["complete_track_classes"],
+                generation_section["audio_cover_strength"],
+                generation_section["repainting_group"],
+                generation_section["text2music_audio_codes_group"],
+            ]
+        )
+    # ========== Sample/Transcribe Handlers ==========
+    generation_section["sample_btn"].click(
+        fn=lambda task, debug: gen_h.sample_example_smart(llm_handler, task, debug) + (True,),
+        inputs=[
+            generation_section["task_type"],
+            generation_section["constrained_decoding_debug"]
+        ],
+        outputs=[
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["think_checkbox"],
+            generation_section["bpm"],
+            generation_section["audio_duration"],
+            generation_section["key_scale"],
+            generation_section["vocal_language"],
+            generation_section["time_signature"],
+            results_section["is_format_caption_state"]
+        ]
+    )
+    generation_section["text2music_audio_code_string"].change(
+        fn=gen_h.update_transcribe_button_text,
+        inputs=[generation_section["text2music_audio_code_string"]],
+        outputs=[generation_section["transcribe_btn"]]
+    )
+    generation_section["transcribe_btn"].click(
+        fn=lambda codes, debug: gen_h.transcribe_audio_codes(llm_handler, codes, debug),
+        inputs=[
+            generation_section["text2music_audio_code_string"],
+            generation_section["constrained_decoding_debug"]
+        ],
+        outputs=[
+            results_section["status_output"],
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["audio_duration"],
+            generation_section["key_scale"],
+            generation_section["vocal_language"],
+            generation_section["time_signature"],
+            results_section["is_format_caption_state"]
+        ]
+    )
+    # ========== Reset Format Caption Flag ==========
+    for trigger in [generation_section["captions"], generation_section["lyrics"], generation_section["bpm"],
+                    generation_section["key_scale"], generation_section["time_signature"],
+                    generation_section["vocal_language"], generation_section["audio_duration"]]:
+        trigger.change(
+            fn=gen_h.reset_format_caption_flag,
+            inputs=[],
+            outputs=[results_section["is_format_caption_state"]]
+        )
+    # ========== Audio Uploads Accordion ==========
+    for trigger in [generation_section["reference_audio"], generation_section["src_audio"]]:
+        trigger.change(
+            fn=gen_h.update_audio_uploads_accordion,
+            inputs=[generation_section["reference_audio"], generation_section["src_audio"]],
+            outputs=[generation_section["audio_uploads_accordion"]]
+        )
+    # ========== Instrumental Checkbox ==========
+    generation_section["instrumental_checkbox"].change(
+        fn=gen_h.handle_instrumental_checkbox,
+        inputs=[generation_section["instrumental_checkbox"], generation_section["lyrics"]],
+        outputs=[generation_section["lyrics"]]
+    )
+    # ========== Load/Save Metadata ==========
+    generation_section["load_file"].upload(
+        fn=gen_h.load_metadata,
+        inputs=[generation_section["load_file"]],
+        outputs=[
+            generation_section["task_type"],
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["vocal_language"],
+            generation_section["bpm"],
+            generation_section["key_scale"],
+            generation_section["time_signature"],
+            generation_section["audio_duration"],
+            generation_section["batch_size_input"],
+            generation_section["inference_steps"],
+            generation_section["guidance_scale"],
+            generation_section["seed"],
+            generation_section["random_seed_checkbox"],
+            generation_section["use_adg"],
+            generation_section["cfg_interval_start"],
+            generation_section["cfg_interval_end"],
+            generation_section["audio_format"],
+            generation_section["lm_temperature"],
+            generation_section["lm_cfg_scale"],
+            generation_section["lm_top_k"],
+            generation_section["lm_top_p"],
+            generation_section["lm_negative_prompt"],
+            generation_section["use_cot_caption"],
+            generation_section["use_cot_language"],
+            generation_section["audio_cover_strength"],
+            generation_section["think_checkbox"],
+            generation_section["text2music_audio_code_string"],
+            generation_section["repainting_start"],
+            generation_section["repainting_end"],
+            generation_section["track_name"],
+            generation_section["complete_track_classes"],
+            results_section["is_format_caption_state"]
+        ]
+    )
+    # Save buttons for audio 1 and 2
+    for btn_idx, btn_key in [(1, "save_btn_1"), (2, "save_btn_2")]:
+        results_section[btn_key].click(
+            fn=res_h.save_audio_and_metadata,
+            inputs=[
+                results_section[f"generated_audio_{btn_idx}"],
+                generation_section["task_type"],
+                generation_section["captions"],
+                generation_section["lyrics"],
+                generation_section["vocal_language"],
+                generation_section["bpm"],
+                generation_section["key_scale"],
+                generation_section["time_signature"],
+                generation_section["audio_duration"],
+                generation_section["batch_size_input"],
+                generation_section["inference_steps"],
+                generation_section["guidance_scale"],
+                generation_section["seed"],
+                generation_section["random_seed_checkbox"],
+                generation_section["use_adg"],
+                generation_section["cfg_interval_start"],
+                generation_section["cfg_interval_end"],
+                generation_section["audio_format"],
+                generation_section["lm_temperature"],
+                generation_section["lm_cfg_scale"],
+                generation_section["lm_top_k"],
+                generation_section["lm_top_p"],
+                generation_section["lm_negative_prompt"],
+                generation_section["use_cot_caption"],
+                generation_section["use_cot_language"],
+                generation_section["audio_cover_strength"],
+                generation_section["think_checkbox"],
+                generation_section["text2music_audio_code_string"],
+                generation_section["repainting_start"],
+                generation_section["repainting_end"],
+                generation_section["track_name"],
+                generation_section["complete_track_classes"],
+                results_section["lm_metadata_state"],
+            ],
+            outputs=[gr.File(label="Download Package", visible=False)]
+        )
+    # ========== Send to SRC Handlers ==========
+    for btn_idx in range(1, 9):
+        results_section[f"send_to_src_btn_{btn_idx}"].click(
+            fn=res_h.send_audio_to_src_with_metadata,
+            inputs=[
+                results_section[f"generated_audio_{btn_idx}"],
+                results_section["lm_metadata_state"]
+            ],
+            outputs=[
+                generation_section["src_audio"],
+                generation_section["bpm"],
+                generation_section["captions"],
+                generation_section["lyrics"],
+                generation_section["audio_duration"],
+                generation_section["key_scale"],
+                generation_section["vocal_language"],
+                generation_section["time_signature"],
+                results_section["is_format_caption_state"]
+            ]
+        )
+    # ========== Score Calculation Handlers ==========
+    for btn_idx in range(1, 9):
+        results_section[f"score_btn_{btn_idx}"].click(
+            fn=lambda sample_idx, scale, batch_idx, queue: res_h.calculate_score_handler_with_selection(
+                llm_handler, sample_idx, scale, batch_idx, queue
+            ),
+            inputs=[
+                gr.State(value=btn_idx),
+                generation_section["score_scale"],
+                results_section["current_batch_index"],
+                results_section["batch_queue"],
+            ],
+            outputs=[results_section[f"score_display_{btn_idx}"], results_section["batch_queue"]]
+        )
+    # ========== Generation Handler ==========
+    generation_section["generate_btn"].click(
+        fn=lambda *args: res_h.generate_with_batch_management(dit_handler, llm_handler, *args),
+        inputs=[
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["key_scale"],
+            generation_section["time_signature"],
+            generation_section["vocal_language"],
+            generation_section["inference_steps"],
+            generation_section["guidance_scale"],
+            generation_section["random_seed_checkbox"],
+            generation_section["seed"],
+            generation_section["reference_audio"],
+            generation_section["audio_duration"],
+            generation_section["batch_size_input"],
+            generation_section["src_audio"],
+            generation_section["text2music_audio_code_string"],
+            generation_section["repainting_start"],
+            generation_section["repainting_end"],
+            generation_section["instruction_display_gen"],
+            generation_section["audio_cover_strength"],
+            generation_section["task_type"],
+            generation_section["use_adg"],
+            generation_section["cfg_interval_start"],
+            generation_section["cfg_interval_end"],
+            generation_section["audio_format"],
+            generation_section["lm_temperature"],
+            generation_section["think_checkbox"],
+            generation_section["lm_cfg_scale"],
+            generation_section["lm_top_k"],
+            generation_section["lm_top_p"],
+            generation_section["lm_negative_prompt"],
+            generation_section["use_cot_metas"],
+            generation_section["use_cot_caption"],
+            generation_section["use_cot_language"],
+            results_section["is_format_caption_state"],
+            generation_section["constrained_decoding_debug"],
+            generation_section["allow_lm_batch"],
+            generation_section["auto_score"],
+            generation_section["score_scale"],
+            generation_section["lm_batch_chunk_size"],
+            generation_section["track_name"],
+            generation_section["complete_track_classes"],
+            generation_section["autogen_checkbox"],
+            results_section["current_batch_index"],
+            results_section["total_batches"],
+            results_section["batch_queue"],
+            results_section["generation_params_state"],
+        ],
+        outputs=[
+            results_section["generated_audio_1"],
+            results_section["generated_audio_2"],
+            results_section["generated_audio_3"],
+            results_section["generated_audio_4"],
+            results_section["generated_audio_5"],
+            results_section["generated_audio_6"],
+            results_section["generated_audio_7"],
+            results_section["generated_audio_8"],
+            results_section["generated_audio_batch"],
+            results_section["generation_info"],
+            results_section["status_output"],
+            generation_section["seed"],
+            results_section["align_score_1"],
+            results_section["align_text_1"],
+            results_section["align_plot_1"],
+            results_section["align_score_2"],
+            results_section["align_text_2"],
+            results_section["align_plot_2"],
+            results_section["score_display_1"],
+            results_section["score_display_2"],
+            results_section["score_display_3"],
+            results_section["score_display_4"],
+            results_section["score_display_5"],
+            results_section["score_display_6"],
+            results_section["score_display_7"],
+            results_section["score_display_8"],
+            generation_section["text2music_audio_code_string"],
+            generation_section["text2music_audio_code_string_1"],
+            generation_section["text2music_audio_code_string_2"],
+            generation_section["text2music_audio_code_string_3"],
+            generation_section["text2music_audio_code_string_4"],
+            generation_section["text2music_audio_code_string_5"],
+            generation_section["text2music_audio_code_string_6"],
+            generation_section["text2music_audio_code_string_7"],
+            generation_section["text2music_audio_code_string_8"],
+            results_section["lm_metadata_state"],
+            results_section["is_format_caption_state"],
+            results_section["current_batch_index"],
+            results_section["total_batches"],
+            results_section["batch_queue"],
+            results_section["generation_params_state"],
+            results_section["batch_indicator"],
+            results_section["prev_batch_btn"],
+            results_section["next_batch_btn"],
+            results_section["next_batch_status"],
+            results_section["restore_params_btn"],
+        ]
+    ).then(
+        fn=lambda *args: res_h.generate_next_batch_background(dit_handler, llm_handler, *args),
+        inputs=[
+            generation_section["autogen_checkbox"],
+            results_section["generation_params_state"],
+            results_section["current_batch_index"],
+            results_section["total_batches"],
+            results_section["batch_queue"],
+            results_section["is_format_caption_state"],
+        ],
+        outputs=[
+            results_section["batch_queue"],
+            results_section["total_batches"],
+            results_section["next_batch_status"],
+            results_section["next_batch_btn"],
+        ]
+    )
+    # ========== Batch Navigation Handlers ==========
+    results_section["prev_batch_btn"].click(
+        fn=res_h.navigate_to_previous_batch,
+        inputs=[
+            results_section["current_batch_index"],
+            results_section["batch_queue"],
+        ],
+        outputs=[
+            results_section["generated_audio_1"],
+            results_section["generated_audio_2"],
+            results_section["generated_audio_3"],
+            results_section["generated_audio_4"],
+            results_section["generated_audio_5"],
+            results_section["generated_audio_6"],
+            results_section["generated_audio_7"],
+            results_section["generated_audio_8"],
+            results_section["generated_audio_batch"],
+            results_section["generation_info"],
+            results_section["current_batch_index"],
+            results_section["batch_indicator"],
+            results_section["prev_batch_btn"],
+            results_section["next_batch_btn"],
+            results_section["status_output"],
+            results_section["score_display_1"],
+            results_section["score_display_2"],
+            results_section["score_display_3"],
+            results_section["score_display_4"],
+            results_section["score_display_5"],
+            results_section["score_display_6"],
+            results_section["score_display_7"],
+            results_section["score_display_8"],
+            results_section["restore_params_btn"],
+        ]
+    )
+    results_section["next_batch_btn"].click(
+        fn=res_h.capture_current_params,
+        inputs=[
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["key_scale"],
+            generation_section["time_signature"],
+            generation_section["vocal_language"],
+            generation_section["inference_steps"],
+            generation_section["guidance_scale"],
+            generation_section["random_seed_checkbox"],
+            generation_section["seed"],
+            generation_section["reference_audio"],
+            generation_section["audio_duration"],
+            generation_section["batch_size_input"],
+            generation_section["src_audio"],
+            generation_section["text2music_audio_code_string"],
+            generation_section["repainting_start"],
+            generation_section["repainting_end"],
+            generation_section["instruction_display_gen"],
+            generation_section["audio_cover_strength"],
+            generation_section["task_type"],
+            generation_section["use_adg"],
+            generation_section["cfg_interval_start"],
+            generation_section["cfg_interval_end"],
+            generation_section["audio_format"],
+            generation_section["lm_temperature"],
+            generation_section["think_checkbox"],
+            generation_section["lm_cfg_scale"],
+            generation_section["lm_top_k"],
+            generation_section["lm_top_p"],
+            generation_section["lm_negative_prompt"],
+            generation_section["use_cot_metas"],
+            generation_section["use_cot_caption"],
+            generation_section["use_cot_language"],
+            generation_section["constrained_decoding_debug"],
+            generation_section["allow_lm_batch"],
+            generation_section["auto_score"],
+            generation_section["score_scale"],
+            generation_section["lm_batch_chunk_size"],
+            generation_section["track_name"],
+            generation_section["complete_track_classes"],
+        ],
+        outputs=[results_section["generation_params_state"]]
+    ).then(
+        fn=res_h.navigate_to_next_batch,
+        inputs=[
+            generation_section["autogen_checkbox"],
+            results_section["current_batch_index"],
+            results_section["total_batches"],
+            results_section["batch_queue"],
+        ],
+        outputs=[
+            results_section["generated_audio_1"],
+            results_section["generated_audio_2"],
+            results_section["generated_audio_3"],
+            results_section["generated_audio_4"],
+            results_section["generated_audio_5"],
+            results_section["generated_audio_6"],
+            results_section["generated_audio_7"],
+            results_section["generated_audio_8"],
+            results_section["generated_audio_batch"],
+            results_section["generation_info"],
+            results_section["current_batch_index"],
+            results_section["batch_indicator"],
+            results_section["prev_batch_btn"],
+            results_section["next_batch_btn"],
+            results_section["status_output"],
+            results_section["next_batch_status"],
+            results_section["score_display_1"],
+            results_section["score_display_2"],
+            results_section["score_display_3"],
+            results_section["score_display_4"],
+            results_section["score_display_5"],
+            results_section["score_display_6"],
+            results_section["score_display_7"],
+            results_section["score_display_8"],
+            results_section["restore_params_btn"],
+        ]
+    ).then(
+        fn=lambda *args: res_h.generate_next_batch_background(dit_handler, llm_handler, *args),
+        inputs=[
+            generation_section["autogen_checkbox"],
+            results_section["generation_params_state"],
+            results_section["current_batch_index"],
+            results_section["total_batches"],
+            results_section["batch_queue"],
+            results_section["is_format_caption_state"],
+        ],
+        outputs=[
+            results_section["batch_queue"],
+            results_section["total_batches"],
+            results_section["next_batch_status"],
+            results_section["next_batch_btn"],
+        ]
+    )
+    # ========== Restore Parameters Handler ==========
+    results_section["restore_params_btn"].click(
+        fn=res_h.restore_batch_parameters,
+        inputs=[
+            results_section["current_batch_index"],
+            results_section["batch_queue"]
+        ],
+        outputs=[
+            generation_section["text2music_audio_code_string"],
+            generation_section["text2music_audio_code_string_1"],
+            generation_section["text2music_audio_code_string_2"],
+            generation_section["text2music_audio_code_string_3"],
+            generation_section["text2music_audio_code_string_4"],
+            generation_section["text2music_audio_code_string_5"],
+            generation_section["text2music_audio_code_string_6"],
+            generation_section["text2music_audio_code_string_7"],
+            generation_section["text2music_audio_code_string_8"],
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["key_scale"],
+            generation_section["time_signature"],
+            generation_section["vocal_language"],
+            generation_section["audio_duration"],
+            generation_section["batch_size_input"],
+            generation_section["inference_steps"],
+            generation_section["lm_temperature"],
+            generation_section["lm_cfg_scale"],
+            generation_section["lm_top_k"],
+            generation_section["lm_top_p"],
+            generation_section["think_checkbox"],
+            generation_section["use_cot_caption"],
+            generation_section["use_cot_language"],
+            generation_section["allow_lm_batch"],
+            generation_section["track_name"],
+            generation_section["complete_track_classes"],
+        ]
+    )

acestep/gradio_ui/events/generation_handlers.py ADDED Viewed

	@@ -0,0 +1,619 @@

+"""
+Generation Input Handlers Module
+Contains event handlers and helper functions related to generation inputs
+"""
+import os
+import json
+import random
+import glob
+import gradio as gr
+from typing import Optional
+from acestep.constants import (
+    TASK_TYPES_TURBO,
+    TASK_TYPES_BASE,
+)
+from acestep.gradio_ui.i18n import t
def load_metadata(file_obj):
    """Load generation parameters from a JSON file.

    Args:
        file_obj: Uploaded file. Either an object exposing the path through a
            ``name`` attribute, a path usable by ``open``, or None when nothing
            was selected.

    Returns:
        32 values: the 31 generation parameters followed by the
        ``is_format_caption`` flag, in this order: task_type, captions, lyrics,
        vocal_language, bpm, key_scale, time_signature, audio_duration,
        batch_size, inference_steps, guidance_scale, seed, random_seed, use_adg,
        cfg_interval_start, cfg_interval_end, audio_format, lm_temperature,
        lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt, use_cot_caption,
        use_cot_language, audio_cover_strength, think, audio_codes,
        repainting_start, repainting_end, track_name, complete_track_classes,
        is_format_caption.
        On success this is a tuple whose flag is True. When no file is given or
        loading fails, a warning is shown and a list of 31 ``None`` values plus
        ``False`` is returned.
    """
    # 31 parameter slots plus the is_format_caption flag; shared by every failure path
    empty_result = [None] * 31 + [False]
    if file_obj is None:
        gr.Warning(t("messages.no_file_selected"))
        return empty_result
    try:
        # Read the uploaded file (object with .name, or a plain path)
        filepath = file_obj.name if hasattr(file_obj, 'name') else file_obj
        with open(filepath, 'r', encoding='utf-8') as f:
            metadata = json.load(f)
        # Extract all fields
        task_type = metadata.get('task_type', 'text2music')
        captions = metadata.get('caption', '')
        lyrics = metadata.get('lyrics', '')
        vocal_language = metadata.get('vocal_language', 'unknown')
        # Convert bpm: missing, falsy, "N/A" or unparsable values all become None
        bpm = None
        bpm_value = metadata.get('bpm')
        if bpm_value is not None and bpm_value != "N/A" and bpm_value:
            try:
                bpm = int(bpm_value)
            except (ValueError, TypeError, OverflowError):
                # Only conversion failures are tolerated; anything else propagates
                bpm = None
        key_scale = metadata.get('keyscale', '')
        time_signature = metadata.get('timesignature', '')
        # Convert duration: None, "N/A" or unparsable values fall back to -1
        audio_duration = -1
        duration_value = metadata.get('duration', -1)
        if duration_value is not None and duration_value != "N/A":
            try:
                audio_duration = float(duration_value)
            except (ValueError, TypeError, OverflowError):
                audio_duration = -1
        batch_size = metadata.get('batch_size', 2)
        inference_steps = metadata.get('inference_steps', 8)
        guidance_scale = metadata.get('guidance_scale', 7.0)
        seed = metadata.get('seed', '-1')
        random_seed = metadata.get('random_seed', True)
        use_adg = metadata.get('use_adg', False)
        cfg_interval_start = metadata.get('cfg_interval_start', 0.0)
        cfg_interval_end = metadata.get('cfg_interval_end', 1.0)
        audio_format = metadata.get('audio_format', 'mp3')
        lm_temperature = metadata.get('lm_temperature', 0.85)
        lm_cfg_scale = metadata.get('lm_cfg_scale', 2.0)
        lm_top_k = metadata.get('lm_top_k', 0)
        lm_top_p = metadata.get('lm_top_p', 0.9)
        lm_negative_prompt = metadata.get('lm_negative_prompt', 'NO USER INPUT')
        use_cot_caption = metadata.get('use_cot_caption', True)
        use_cot_language = metadata.get('use_cot_language', True)
        audio_cover_strength = metadata.get('audio_cover_strength', 1.0)
        think = metadata.get('think', True)
        audio_codes = metadata.get('audio_codes', '')
        repainting_start = metadata.get('repainting_start', 0.0)
        repainting_end = metadata.get('repainting_end', -1)
        track_name = metadata.get('track_name')
        complete_track_classes = metadata.get('complete_track_classes', [])
        gr.Info(t("messages.params_loaded", filename=os.path.basename(filepath)))
        return (
            task_type, captions, lyrics, vocal_language, bpm, key_scale, time_signature,
            audio_duration, batch_size, inference_steps, guidance_scale, seed, random_seed,
            use_adg, cfg_interval_start, cfg_interval_end, audio_format,
            lm_temperature, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
            use_cot_caption, use_cot_language, audio_cover_strength,
            think, audio_codes, repainting_start, repainting_end,
            track_name, complete_track_classes,
            True  # Set is_format_caption to True when loading from file
        )
    except json.JSONDecodeError as e:
        gr.Warning(t("messages.invalid_json", error=str(e)))
        return empty_result
    except Exception as e:
        # UI boundary: report any other failure (unreadable file, non-object JSON, ...)
        gr.Warning(t("messages.load_error", error=str(e)))
        return empty_result
def load_random_example(task_type: str):
    """Pick a random JSON example for the given task and unpack it for the UI.

    Args:
        task_type: The task type (e.g., "text2music"); used as the sub-directory
            name under the project's ``examples`` folder.
    Returns:
        Tuple of (caption, lyrics, think, bpm, duration, keyscale, language, timesignature).
        When nothing can be loaded a warning is shown and the blank tuple
        ("", "", True, None, None, "", "", "") is returned.
    """
    blank = ("", "", True, None, None, "", "", "")
    try:
        # This file is in acestep/gradio_ui/events/, so climb 4 levels to the project root
        project_root = os.path.abspath(__file__)
        for _ in range(4):
            project_root = os.path.dirname(project_root)
        examples_dir = os.path.join(project_root, "examples", task_type)
        if not os.path.exists(examples_dir):
            gr.Warning(f"Examples directory not found: examples/{task_type}/")
            return blank
        candidates = glob.glob(os.path.join(examples_dir, "*.json"))
        if not candidates:
            gr.Warning(f"No JSON files found in examples/{task_type}/")
            return blank
        chosen = random.choice(candidates)
        try:
            with open(chosen, 'r', encoding='utf-8') as handle:
                example = json.load(handle)
            # Caption: prefer 'caption', fall back to 'prompt'; force to text
            caption = example.get('caption', example.get('prompt', ''))
            if not isinstance(caption, str):
                caption = str(caption) if caption else ''
            # Lyrics: force to text
            lyrics = example.get('lyrics', '')
            if not isinstance(lyrics, str):
                lyrics = str(lyrics) if lyrics else ''
            # Think: anything that is not a real bool counts as True
            think = example.get('think', True)
            if not isinstance(think, bool):
                think = True
            # Numeric metadata stays None when absent, placeholder, or unparsable
            bpm = None
            if 'bpm' in example and example['bpm'] not in [None, "N/A", ""]:
                try:
                    bpm = int(example['bpm'])
                except (ValueError, TypeError):
                    pass
            duration = None
            if 'duration' in example and example['duration'] not in [None, "N/A", ""]:
                try:
                    duration = float(example['duration'])
                except (ValueError, TypeError):
                    pass
            # Text metadata: None / "N/A" placeholders collapse to an empty string
            text_values = []
            for field in ('keyscale', 'language', 'timesignature'):
                raw = example.get(field, '')
                text_values.append('' if raw in [None, "N/A"] else raw)
            keyscale, language, timesignature = text_values
            gr.Info(t("messages.example_loaded", filename=os.path.basename(chosen)))
            return caption, lyrics, think, bpm, duration, keyscale, language, timesignature
        except json.JSONDecodeError as e:
            gr.Warning(t("messages.example_failed", filename=os.path.basename(chosen), error=str(e)))
            return blank
        except Exception as e:
            gr.Warning(t("messages.example_error", error=str(e)))
            return blank
    except Exception as e:
        gr.Warning(t("messages.example_error", error=str(e)))
        return blank
def sample_example_smart(llm_handler, task_type: str, constrained_decoding_debug: bool = False):
    """Produce an example via the LM when it is initialized, else from the examples directory.

    Args:
        llm_handler: LLM handler instance
        task_type: The task type (e.g., "text2music")
        constrained_decoding_debug: Whether to enable debug logging for constrained decoding
    Returns:
        Tuple of (caption, lyrics, think, bpm, duration, keyscale, language, timesignature).
        If the LM call raises or yields no metadata, a warning is shown and the
        result of load_random_example(task_type) is returned instead.
    """
    # LM not initialized: go straight to the examples directory
    if not llm_handler.llm_initialized:
        return load_random_example(task_type)
    try:
        # "NO USER INPUT" asks the LM to invent an example from scratch
        metadata, _status = llm_handler.understand_audio_from_codes(
            audio_codes="NO USER INPUT",
            use_constrained_decoding=True,
            temperature=0.85,
            constrained_decoding_debug=constrained_decoding_debug,
        )
        if not metadata:
            gr.Warning(t("messages.lm_fallback"))
            return load_random_example(task_type)
        caption = metadata.get('caption', '')
        lyrics = metadata.get('lyrics', '')
        # Numeric metadata stays None when absent, placeholder, or unparsable
        bpm = None
        if 'bpm' in metadata and metadata['bpm'] not in [None, "N/A", ""]:
            try:
                bpm = int(metadata['bpm'])
            except (ValueError, TypeError):
                pass
        duration = None
        if 'duration' in metadata and metadata['duration'] not in [None, "N/A", ""]:
            try:
                duration = float(metadata['duration'])
            except (ValueError, TypeError):
                pass
        # Text metadata: None / "N/A" placeholders collapse to an empty string
        text_values = []
        for field in ('keyscale', 'language', 'timesignature'):
            raw = metadata.get(field, '')
            text_values.append('' if raw in [None, "N/A"] else raw)
        keyscale, language, timesignature = text_values
        gr.Info(t("messages.lm_generated"))
        # think is always enabled for LM-generated examples
        return caption, lyrics, True, bpm, duration, keyscale, language, timesignature
    except Exception:
        gr.Warning(t("messages.lm_fallback"))
        return load_random_example(task_type)
def refresh_checkpoints(dit_handler):
    """Return a component update whose choices are the handler's available checkpoints."""
    return gr.update(choices=dit_handler.get_available_checkpoints())
def update_model_type_settings(config_path):
    """Build UI updates that depend on the model type named in config_path.

    A path containing "turbo" (case-insensitive) selects the turbo profile; otherwise a
    path containing "base" selects the base profile; anything else, including
    None, falls back to the turbo profile.

    Returns:
        Tuple of six updates: inference_steps, guidance_scale, use_adg,
        cfg_interval_start, cfg_interval_end, task_type.
    """
    lowered = ("" if config_path is None else config_path).lower()
    # "turbo" is checked first, so the base profile needs "base" without "turbo"
    base_profile = "turbo" not in lowered and "base" in lowered
    if base_profile:
        # Base model: max 100 steps, show CFG/ADG, show all task types
        steps_update = gr.update(value=32, maximum=100, minimum=1)
        task_choices = TASK_TYPES_BASE
    else:
        # Turbo model (and default): max 8 steps, hide CFG/ADG, restricted task types
        steps_update = gr.update(value=8, maximum=8, minimum=1)
        task_choices = TASK_TYPES_TURBO
    return (
        steps_update,  # inference_steps
        gr.update(visible=base_profile),  # guidance_scale
        gr.update(visible=base_profile),  # use_adg
        gr.update(visible=base_profile),  # cfg_interval_start
        gr.update(visible=base_profile),  # cfg_interval_end
        gr.update(choices=task_choices),  # task_type
    )
def init_service_wrapper(dit_handler, llm_handler, checkpoint, config_path, device, init_llm, lm_model_path, backend, use_flash_attention, offload_to_cpu, offload_dit_to_cpu):
    """Initialize the DiT service and, if requested, the LM.

    Returns:
        Tuple of (status text, update setting ``interactive`` from the DiT
        initialization result, accordion update that is open only while
        ``dit_handler.model`` is None).
    """
    status, enable = dit_handler.initialize_service(
        checkpoint,
        config_path,
        device,
        use_flash_attention=use_flash_attention,
        compile_model=False,
        offload_to_cpu=offload_to_cpu,
        offload_dit_to_cpu=offload_dit_to_cpu,
    )
    if init_llm:
        # This file is in acestep/gradio_ui/events/, so climb 4 levels to the project root
        project_root = os.path.abspath(__file__)
        for _ in range(4):
            project_root = os.path.dirname(project_root)
        lm_status, _lm_success = llm_handler.initialize(
            checkpoint_dir=os.path.join(project_root, "checkpoints"),
            lm_model_path=lm_model_path,
            backend=backend,
            device=device,
            offload_to_cpu=offload_to_cpu,
            dtype=dit_handler.dtype,
        )
        # The LM status is appended whether or not LM init succeeded; an LM failure
        # does not change `enable`, which reflects the DiT result only
        status += f"\n{lm_status}"
    # Collapse the accordion once a model is loaded
    model_loaded = dit_handler.model is not None
    return status, gr.update(interactive=enable), gr.update(open=not model_loaded)
def update_negative_prompt_visibility(init_llm_checked):
    """Return an update showing the negative prompt only while the LM checkbox is checked."""
    visibility_update = gr.update(visible=init_llm_checked)
    return visibility_update
def update_audio_cover_strength_visibility(task_type_value, init_llm_checked):
    """Return an update setting the visibility, label and info of audio_cover_strength.

    The control is visible for the "cover" task or when the LM checkbox is
    checked. It is labelled for LM codes only when the LM checkbox is checked
    and the task is not "cover".
    """
    is_cover_task = task_type_value == "cover"
    if is_cover_task or not init_llm_checked:
        label = "Audio Cover Strength"
        info = "Control how many denoising steps use cover mode"
    else:
        label = "LM codes strength"
        info = "Control how many denoising steps use LM-generated codes"
    return gr.update(visible=is_cover_task or init_llm_checked, label=label, info=info)
def convert_src_audio_to_codes_wrapper(dit_handler, src_audio):
    """Delegate to dit_handler.convert_src_audio_to_codes and return its result unchanged."""
    return dit_handler.convert_src_audio_to_codes(src_audio)
def update_instruction_ui(
    dit_handler,
    task_type_value: str,
    track_name_value: Optional[str],
    complete_track_classes_value: list,
    audio_codes_content: str = "",
    init_llm_checked: bool = False
) -> tuple:
    """Update instruction and UI visibility based on task type.

    Args:
        dit_handler: Handler providing ``generate_instruction``.
        task_type_value: Currently selected task type.
        track_name_value: Selected track name, forwarded to the instruction generator.
        complete_track_classes_value: Selected track classes, forwarded likewise.
        audio_codes_content: Current content of the audio-codes input.
        init_llm_checked: Whether the LM checkbox is checked.

    Returns:
        Tuple of (instruction text, track_name update, complete_track_classes
        update, audio_cover_strength update, repainting group update,
        text2music audio-codes group update).
    """
    instruction = dit_handler.generate_instruction(
        task_type=task_type_value,
        track_name=track_name_value,
        complete_track_classes=complete_track_classes_value
    )
    is_cover_task = task_type_value == "cover"
    # Show track_name for lego and extract
    track_name_visible = task_type_value in ["lego", "extract"]
    # Show complete_track_classes for complete
    complete_visible = task_type_value == "complete"
    # Show audio_cover_strength for cover OR when LM is initialized.
    # Coerced to a real bool so a None/odd checkbox value never reaches `visible`.
    audio_cover_strength_visible = bool(is_cover_task or init_llm_checked)
    # Determine label and info based on context
    if init_llm_checked and not is_cover_task:
        audio_cover_strength_label = "LM codes strength"
        audio_cover_strength_info = "Control how many denoising steps use LM-generated codes"
    else:
        audio_cover_strength_label = "Audio Cover Strength"
        audio_cover_strength_info = "Control how many denoising steps use cover mode"
    # Show repainting controls for repaint and lego
    repainting_visible = task_type_value in ["repaint", "lego"]
    # Show text2music_audio_codes if task is text2music OR if it has content.
    # This allows it to stay visible even if user switches task type but has codes.
    # bool() matters: the bare `and`/`or` chain would otherwise yield the stripped
    # codes string, "" or None instead of True/False as the `visible` value.
    has_audio_codes = bool(audio_codes_content and str(audio_codes_content).strip())
    text2music_audio_codes_visible = task_type_value == "text2music" or has_audio_codes
    return (
        instruction,  # instruction_display_gen
        gr.update(visible=track_name_visible),  # track_name
        gr.update(visible=complete_visible),  # complete_track_classes
        gr.update(visible=audio_cover_strength_visible, label=audio_cover_strength_label, info=audio_cover_strength_info),  # audio_cover_strength
        gr.update(visible=repainting_visible),  # repainting_group
        gr.update(visible=text2music_audio_codes_visible),  # text2music_audio_codes_group
    )
def transcribe_audio_codes(llm_handler, audio_code_string, constrained_decoding_debug):
    """
    Transcribe audio codes to metadata using LLM understanding.
    If audio_code_string is empty, generate a sample example instead.
    Args:
        llm_handler: LLM handler instance
        audio_code_string: String containing audio codes (or empty for example generation)
        constrained_decoding_debug: Whether to enable debug logging for constrained decoding
    Returns:
        Tuple of (status_message, caption, lyrics, bpm, duration, keyscale, language,
        timesignature, is_format_caption). Every path returns all nine items.
        is_format_caption is True only when the LM returned metadata; it is False
        when the LM is not initialized or returned nothing.
    """
    if not llm_handler.llm_initialized:
        # Same arity as the success path so the caller always gets nine values
        return t("messages.lm_not_initialized"), "", "", None, None, "", "", "", False
    # If codes are empty, this becomes a "generate example" task
    # Use "NO USER INPUT" as the input to generate a sample
    if not audio_code_string or not audio_code_string.strip():
        audio_code_string = "NO USER INPUT"
    # Call LLM understanding
    metadata, status = llm_handler.understand_audio_from_codes(
        audio_codes=audio_code_string,
        use_constrained_decoding=True,
        constrained_decoding_debug=constrained_decoding_debug,
    )
    if not metadata:
        # Nothing came back (None or empty): surface the status instead of
        # failing on metadata.get, and do not mark the caption as formatted
        return status, "", "", None, None, "", "", "", False
    # Extract fields for UI update
    caption = metadata.get('caption', '')
    lyrics = metadata.get('lyrics', '')
    bpm = metadata.get('bpm')
    duration = metadata.get('duration')
    keyscale = metadata.get('keyscale', '')
    language = metadata.get('language', '')
    timesignature = metadata.get('timesignature', '')
    # Convert to appropriate types; only conversion failures are tolerated
    try:
        bpm = int(bpm) if bpm and bpm != 'N/A' else None
    except (ValueError, TypeError, OverflowError):
        bpm = None
    try:
        duration = float(duration) if duration and duration != 'N/A' else None
    except (ValueError, TypeError, OverflowError):
        duration = None
    return (
        status,
        caption,
        lyrics,
        bpm,
        duration,
        keyscale,
        language,
        timesignature,
        True  # Set is_format_caption to True (from Transcribe/LM understanding)
    )
def update_transcribe_button_text(audio_code_string):
    """
    Return an update for the transcribe button's text.
    Empty or whitespace-only input: "Generate Example"
    Any other content: "Transcribe"
    """
    has_codes = bool(audio_code_string and audio_code_string.strip())
    return gr.update(value="Transcribe" if has_codes else "Generate Example")
def reset_format_caption_flag():
    """Return False, the value is_format_caption takes after a manual caption/metadata edit."""
    is_format_caption = False
    return is_format_caption
def update_audio_uploads_accordion(reference_audio, src_audio):
    """Return an update opening the Audio Uploads accordion when either audio input is set."""
    nothing_uploaded = reference_audio is None and src_audio is None
    return gr.update(open=not nothing_uploaded)
def handle_instrumental_checkbox(instrumental_checked, current_lyrics):
    """
    Compute the lyrics value after the instrumental checkbox changes.
    Checked: blank lyrics become "[Instrumental]"; existing lyrics are kept.
    Unchecked: lyrics that are exactly "[Instrumental]" (ignoring surrounding
    whitespace) are cleared; anything else is returned unchanged.
    """
    marker = "[Instrumental]"
    stripped = current_lyrics.strip() if current_lyrics else ""
    if instrumental_checked:
        # Only fill in the marker when there is nothing meaningful to preserve
        return current_lyrics if stripped else marker
    # Remove the marker but leave real lyrics (or an empty/None value) untouched
    return "" if stripped == marker else current_lyrics
def update_audio_components_visibility(batch_size):
    """Build visibility updates for the eight audio columns and the second-row container.

    batch_size is converted to int and clamped to 1-8.
    Returns a 9-tuple: columns 1-4, the row 5-8 container, then columns 5-8.
    Column N is visible when the clamped batch size is at least N; the
    container is visible from batch size 5 upwards.
    """
    count = min(max(int(batch_size), 1), 8)
    # Row 1: columns 1-4 (column 1 is always visible because count >= 1)
    first_row = tuple(gr.update(visible=count >= slot) for slot in range(1, 5))
    # Row 2: container first, then columns 5-8
    second_row = (gr.update(visible=count >= 5),) + tuple(
        gr.update(visible=count >= slot) for slot in range(5, 9)
    )
    return first_row + second_row
def update_codes_hints_visibility(src_audio, allow_lm_batch, batch_size):
    """Choose between the single and batch codes inputs and toggle the transcribe button.

    With src_audio present: single codes row shown, batch rows and columns
    hidden, transcribe button shown.
    Without src_audio: transcribe button hidden; batch rows are shown when
    allow_lm_batch is truthy and the clamped batch size is at least 2,
    otherwise the single row is shown.

    Returns a 12-tuple: codes_single_row, codes_batch_row, codes_batch_row_2,
    codes_col_1..codes_col_8, transcribe_btn.
    """
    count = min(max(int(batch_size), 1), 8)
    has_src_audio = src_audio is not None
    use_batch_layout = (not has_src_audio) and allow_lm_batch and count >= 2
    if not use_batch_layout:
        # Single mode; the transcribe button follows the presence of src_audio
        return (
            gr.update(visible=True),   # codes_single_row
            gr.update(visible=False),  # codes_batch_row
            gr.update(visible=False),  # codes_batch_row_2
            *[gr.update(visible=False)] * 8,  # Hide all batch columns
            gr.update(visible=has_src_audio),  # transcribe_btn
        )
    # Batch mode: hide single, show batch codes with one column per batch item
    return (
        gr.update(visible=False),  # codes_single_row
        gr.update(visible=True),   # codes_batch_row (row 1)
        gr.update(visible=count >= 5),  # codes_batch_row_2 (row 2)
        gr.update(visible=True),   # codes_col_1: always visible in batch mode
        *[gr.update(visible=count >= slot) for slot in range(2, 9)],  # codes_col_2..8
        gr.update(visible=False),  # transcribe_btn: hide when no src_audio
    )

acestep/gradio_ui/events/results_handlers.py ADDED Viewed

	@@ -0,0 +1,1381 @@

+"""
+Results Handlers Module
+Contains event handlers and helper functions related to result display, scoring, and batch management
+"""
+import os
+import json
+import datetime
+import tempfile
+import shutil
+import zipfile
+import time as time_module
+import gradio as gr
+from loguru import logger
+from acestep.gradio_ui.i18n import t
def store_batch_in_queue(
    batch_queue,
    batch_index,
    audio_paths,
    generation_info,
    seeds,
    codes=None,
    scores=None,
    allow_lm_batch=False,
    batch_size=2,
    generation_params=None,
    lm_generated_metadata=None,
    status="completed"
):
    """Record one batch's results and settings under batch_index in batch_queue.

    The queue is updated in place and also returned.
    Args:
        batch_queue: Mapping from batch index to stored batch entry
        batch_index: Key under which this batch is stored
        audio_paths: Audio paths produced for this batch
        generation_info: Generation info stored with the batch
        seeds: Seeds stored with the batch
        codes: Audio codes used for generation (list for batch mode, string for single mode)
        scores: List of score displays for each audio; a falsy value stores eight empty strings
        allow_lm_batch: Whether batch LM mode was used for this batch
        batch_size: Batch size used for this batch
        generation_params: Dictionary of generation parameters; a falsy value stores {}
        lm_generated_metadata: LM-generated metadata for scoring (optional)
        status: Status label stored with the entry
    """
    entry = {
        "status": status,
        "audio_paths": audio_paths,
        "generation_info": generation_info,
        "seeds": seeds,
        "codes": codes,
        "scores": scores or [""] * 8,
        "allow_lm_batch": allow_lm_batch,
        "batch_size": batch_size,
        "generation_params": generation_params or {},
        "lm_generated_metadata": lm_generated_metadata,
        # Local time in ISO format, taken when the entry is stored
        "timestamp": datetime.datetime.now().isoformat(),
    }
    batch_queue[batch_index] = entry
    return batch_queue
+def update_batch_indicator(current_batch, total_batches):
+    """Update batch indicator text"""
+    return t("results.batch_indicator", current=current_batch + 1, total=total_batches)
+def update_navigation_buttons(current_batch, total_batches):
+    """Determine navigation button states"""
+    can_go_previous = current_batch > 0
+    can_go_next = current_batch < total_batches - 1
+    return can_go_previous, can_go_next
+def save_audio_and_metadata(
+    audio_path, task_type, captions, lyrics, vocal_language, bpm, key_scale, time_signature, audio_duration,
+    batch_size_input, inference_steps, guidance_scale, seed, random_seed_checkbox,
+    use_adg, cfg_interval_start, cfg_interval_end, audio_format,
+    lm_temperature, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
+    use_cot_caption, use_cot_language, audio_cover_strength,
+    think_checkbox, text2music_audio_code_string, repainting_start, repainting_end,
+    track_name, complete_track_classes, lm_metadata
+):
+    """Save audio file and its metadata as a zip package"""
+    if audio_path is None:
+        gr.Warning(t("messages.no_audio_to_save"))
+        return None
+    try:
+        # Create metadata dictionary
+        metadata = {
+            "saved_at": datetime.datetime.now().isoformat(),
+            "task_type": task_type,
+            "caption": captions or "",
+            "lyrics": lyrics or "",
+            "vocal_language": vocal_language,
+            "bpm": bpm if bpm is not None else None,
+            "keyscale": key_scale or "",
+            "timesignature": time_signature or "",
+            "duration": audio_duration if audio_duration is not None else -1,
+            "batch_size": batch_size_input,
+            "inference_steps": inference_steps,
+            "guidance_scale": guidance_scale,
+            "seed": seed,
+            "random_seed": False,  # Disable random seed for reproducibility
+            "use_adg": use_adg,
+            "cfg_interval_start": cfg_interval_start,
+            "cfg_interval_end": cfg_interval_end,
+            "audio_format": audio_format,
+            "lm_temperature": lm_temperature,
+            "lm_cfg_scale": lm_cfg_scale,
+            "lm_top_k": lm_top_k,
+            "lm_top_p": lm_top_p,
+            "lm_negative_prompt": lm_negative_prompt,
+            "use_cot_caption": use_cot_caption,
+            "use_cot_language": use_cot_language,
+            "audio_cover_strength": audio_cover_strength,
+            "think": think_checkbox,
+            "audio_codes": text2music_audio_code_string or "",
+            "repainting_start": repainting_start,
+            "repainting_end": repainting_end,
+            "track_name": track_name,
+            "complete_track_classes": complete_track_classes or [],
+        }
+        # Add LM-generated metadata if available
+        if lm_metadata:
+            metadata["lm_generated_metadata"] = lm_metadata
+        # Generate timestamp and base name
+        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        # Extract audio filename extension
+        audio_ext = os.path.splitext(audio_path)[1]
+        # Create temporary directory for packaging
+        temp_dir = tempfile.mkdtemp()
+        # Save JSON metadata
+        json_path = os.path.join(temp_dir, f"metadata_{timestamp}.json")
+        with open(json_path, 'w', encoding='utf-8') as f:
+            json.dump(metadata, f, indent=2, ensure_ascii=False)
+        # Copy audio file
+        audio_copy_path = os.path.join(temp_dir, f"audio_{timestamp}{audio_ext}")
+        shutil.copy2(audio_path, audio_copy_path)
+        # Create zip file
+        zip_path = os.path.join(tempfile.gettempdir(), f"music_package_{timestamp}.zip")
+        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+            zipf.write(audio_copy_path, os.path.basename(audio_copy_path))
+            zipf.write(json_path, os.path.basename(json_path))
+        # Clean up temp directory
+        shutil.rmtree(temp_dir)
+        gr.Info(t("messages.save_success", filename=os.path.basename(zip_path)))
+        return zip_path
+    except Exception as e:
+        gr.Warning(t("messages.save_failed", error=str(e)))
+        import traceback
+        traceback.print_exc()
+        return None
+def send_audio_to_src_with_metadata(audio_file, lm_metadata):
+    """Send generated audio file to src_audio input and populate metadata fields
+    Args:
+        audio_file: Audio file path
+        lm_metadata: Dictionary containing LM-generated metadata
+    Returns:
+        Tuple of (audio_file, bpm, caption, lyrics, duration, key_scale, language, time_signature, is_format_caption)
+    """
+    if audio_file is None:
+        return None, None, None, None, None, None, None, None, True  # Keep is_format_caption as True
+    # Extract metadata fields if available
+    bpm_value = None
+    caption_value = None
+    lyrics_value = None
+    duration_value = None
+    key_scale_value = None
+    language_value = None
+    time_signature_value = None
+    if lm_metadata:
+        # BPM
+        if lm_metadata.get('bpm'):
+            bpm_str = lm_metadata.get('bpm')
+            if bpm_str and bpm_str != "N/A":
+                try:
+                    bpm_value = int(bpm_str)
+                except (ValueError, TypeError):
+                    pass
+        # Caption (Rewritten Caption)
+        if lm_metadata.get('caption'):
+            caption_value = lm_metadata.get('caption')
+        # Lyrics
+        if lm_metadata.get('lyrics'):
+            lyrics_value = lm_metadata.get('lyrics')
+        # Duration
+        if lm_metadata.get('duration'):
+            duration_str = lm_metadata.get('duration')
+            if duration_str and duration_str != "N/A":
+                try:
+                    duration_value = float(duration_str)
+                except (ValueError, TypeError):
+                    pass
+        # KeyScale
+        if lm_metadata.get('keyscale'):
+            key_scale_str = lm_metadata.get('keyscale')
+            if key_scale_str and key_scale_str != "N/A":
+                key_scale_value = key_scale_str
+        # Language
+        if lm_metadata.get('language'):
+            language_str = lm_metadata.get('language')
+            if language_str and language_str != "N/A":
+                language_value = language_str
+        # Time Signature
+        if lm_metadata.get('timesignature'):
+            time_sig_str = lm_metadata.get('timesignature')
+            if time_sig_str and time_sig_str != "N/A":
+                time_signature_value = time_sig_str
+    return (
+        audio_file,
+        bpm_value,
+        caption_value,
+        lyrics_value,
+        duration_value,
+        key_scale_value,
+        language_value,
+        time_signature_value,
+        True  # Set is_format_caption to True (from LM-generated metadata)
+    )
+def generate_with_progress(
+    dit_handler, llm_handler,
+    captions, lyrics, bpm, key_scale, time_signature, vocal_language,
+    inference_steps, guidance_scale, random_seed_checkbox, seed,
+    reference_audio, audio_duration, batch_size_input, src_audio,
+    text2music_audio_code_string, repainting_start, repainting_end,
+    instruction_display_gen, audio_cover_strength, task_type,
+    use_adg, cfg_interval_start, cfg_interval_end, audio_format, lm_temperature,
+    think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
+    use_cot_metas, use_cot_caption, use_cot_language, is_format_caption,
+    constrained_decoding_debug,
+    allow_lm_batch,
+    auto_score,
+    score_scale,
+    lm_batch_chunk_size,
+    progress=gr.Progress(track_tqdm=True)
+):
+    """Generate audio with progress tracking"""
+    # If think is enabled (llm_dit mode) and use_cot_metas is True, generate audio codes using LM first
+    audio_code_string_to_use = text2music_audio_code_string
+    lm_generated_metadata = None  # Store LM-generated metadata for display
+    lm_generated_audio_codes = None  # Store LM-generated audio codes for display
+    lm_generated_audio_codes_list = []  # Store list of audio codes for batch processing
+    # Determine if we should use batch LM generation
+    should_use_lm_batch = (
+        think_checkbox and
+        llm_handler.llm_initialized and
+        use_cot_metas and
+        allow_lm_batch and
+        batch_size_input >= 2
+    )
+    if think_checkbox and llm_handler.llm_initialized and use_cot_metas:
+        # Convert top_k: 0 means None (disabled)
+        top_k_value = None if lm_top_k == 0 else int(lm_top_k)
+        # Convert top_p: 1.0 means None (disabled)
+        top_p_value = None if lm_top_p >= 1.0 else lm_top_p
+        # Build user_metadata from user-provided values (only include non-empty values)
+        user_metadata = {}
+        # Handle bpm: gr.Number can be None, int, float, or string
+        if bpm is not None:
+            try:
+                bpm_value = float(bpm)
+                if bpm_value > 0:
+                    user_metadata['bpm'] = str(int(bpm_value))
+            except (ValueError, TypeError):
+                # If bpm is not a valid number, skip it
+                pass
+        if key_scale and key_scale.strip():
+            key_scale_clean = key_scale.strip()
+            if key_scale_clean.lower() not in ["n/a", ""]:
+                user_metadata['keyscale'] = key_scale_clean
+        if time_signature and time_signature.strip():
+            time_sig_clean = time_signature.strip()
+            if time_sig_clean.lower() not in ["n/a", ""]:
+                user_metadata['timesignature'] = time_sig_clean
+        if audio_duration is not None:
+            try:
+                duration_value = float(audio_duration)
+                if duration_value > 0:
+                    user_metadata['duration'] = str(int(duration_value))
+            except (ValueError, TypeError):
+                # If audio_duration is not a valid number, skip it
+                pass
+        # Only pass user_metadata if user provided any values, otherwise let LM generate
+        user_metadata_to_pass = user_metadata if user_metadata else None
+        if should_use_lm_batch:
+            # BATCH LM GENERATION
+            import math
+            from acestep.handler import AceStepHandler
+            logger.info(f"Using LM batch generation for {batch_size_input} items...")
+            # Prepare seeds for batch items
+            temp_handler = AceStepHandler()
+            actual_seed_list, _ = temp_handler.prepare_seeds(batch_size_input, seed, random_seed_checkbox)
+            # Split batch into chunks (GPU memory constraint)
+            max_inference_batch_size = int(lm_batch_chunk_size)
+            num_chunks = math.ceil(batch_size_input / max_inference_batch_size)
+            all_metadata_list = []
+            all_audio_codes_list = []
+            for chunk_idx in range(num_chunks):
+                chunk_start = chunk_idx * max_inference_batch_size
+                chunk_end = min(chunk_start + max_inference_batch_size, batch_size_input)
+                chunk_size = chunk_end - chunk_start
+                chunk_seeds = actual_seed_list[chunk_start:chunk_end]
+                logger.info(f"Generating LM batch chunk {chunk_idx+1}/{num_chunks} (size: {chunk_size}, seeds: {chunk_seeds})...")
+                # Generate batch
+                metadata_list, audio_codes_list, status = llm_handler.generate_with_stop_condition_batch(
+                    caption=captions or "",
+                    lyrics=lyrics or "",
+                    batch_size=chunk_size,
+                    infer_type="llm_dit",
+                    temperature=lm_temperature,
+                    cfg_scale=lm_cfg_scale,
+                    negative_prompt=lm_negative_prompt,
+                    top_k=top_k_value,
+                    top_p=top_p_value,
+                    user_metadata=user_metadata_to_pass,
+                    use_cot_caption=use_cot_caption,
+                    use_cot_language=use_cot_language,
+                    is_format_caption=is_format_caption,
+                    constrained_decoding_debug=constrained_decoding_debug,
+                    seeds=chunk_seeds,
+                )
+                all_metadata_list.extend(metadata_list)
+                all_audio_codes_list.extend(audio_codes_list)
+            # Use first metadata as representative (all are same)
+            lm_generated_metadata = all_metadata_list[0] if all_metadata_list else None
+            # Store audio codes list for later use
+            lm_generated_audio_codes_list = all_audio_codes_list
+            # Prepare audio codes for DiT (list of codes, one per batch item)
+            audio_code_string_to_use = all_audio_codes_list
+            # Update metadata fields from LM if not provided by user
+            if lm_generated_metadata:
+                if bpm is None and lm_generated_metadata.get('bpm'):
+                    bpm_value = lm_generated_metadata.get('bpm')
+                    if bpm_value != "N/A" and bpm_value != "":
+                        try:
+                            bpm = int(bpm_value)
+                        except:
+                            pass
+                if not key_scale and lm_generated_metadata.get('keyscale'):
+                    key_scale_value = lm_generated_metadata.get('keyscale', lm_generated_metadata.get('key_scale', ""))
+                    if key_scale_value != "N/A":
+                        key_scale = key_scale_value
+                if not time_signature and lm_generated_metadata.get('timesignature'):
+                    time_signature_value = lm_generated_metadata.get('timesignature', lm_generated_metadata.get('time_signature', ""))
+                    if time_signature_value != "N/A":
+                        time_signature = time_signature_value
+                if audio_duration is None or audio_duration <= 0:
+                    audio_duration_value = lm_generated_metadata.get('duration', -1)
+                    if audio_duration_value != "N/A" and audio_duration_value != "":
+                        try:
+                            audio_duration = float(audio_duration_value)
+                        except:
+                            pass
+        else:
+            # SEQUENTIAL LM GENERATION (current behavior, when allow_lm_batch is False)
+            # Phase 1: Generate CoT metadata
+            phase1_start = time_module.time()
+            metadata, _, status = llm_handler.generate_with_stop_condition(
+                caption=captions or "",
+                lyrics=lyrics or "",
+                infer_type="dit",  # Only generate metadata in Phase 1
+                temperature=lm_temperature,
+                cfg_scale=lm_cfg_scale,
+                negative_prompt=lm_negative_prompt,
+                top_k=top_k_value,
+                top_p=top_p_value,
+                user_metadata=user_metadata_to_pass,
+                use_cot_caption=use_cot_caption,
+                use_cot_language=use_cot_language,
+                is_format_caption=is_format_caption,
+                constrained_decoding_debug=constrained_decoding_debug,
+            )
+            lm_phase1_time = time_module.time() - phase1_start
+            logger.info(f"LM Phase 1 (CoT) completed in {lm_phase1_time:.2f}s")
+            # Phase 2: Generate audio codes
+            phase2_start = time_module.time()
+            metadata, audio_codes, status = llm_handler.generate_with_stop_condition(
+                caption=captions or "",
+                lyrics=lyrics or "",
+                infer_type="llm_dit",  # Generate both metadata and codes
+                temperature=lm_temperature,
+                cfg_scale=lm_cfg_scale,
+                negative_prompt=lm_negative_prompt,
+                top_k=top_k_value,
+                top_p=top_p_value,
+                user_metadata=user_metadata_to_pass,
+                use_cot_caption=use_cot_caption,
+                use_cot_language=use_cot_language,
+                is_format_caption=is_format_caption,
+                constrained_decoding_debug=constrained_decoding_debug,
+            )
+            lm_phase2_time = time_module.time() - phase2_start
+            logger.info(f"LM Phase 2 (Codes) completed in {lm_phase2_time:.2f}s")
+            # Store LM-generated metadata and audio codes for display
+            lm_generated_metadata = metadata
+            if audio_codes:
+                audio_code_string_to_use = audio_codes
+                lm_generated_audio_codes = audio_codes
+                # Update metadata fields only if they are empty/None (user didn't provide them)
+                if bpm is None and metadata.get('bpm'):
+                    bpm_value = metadata.get('bpm')
+                    if bpm_value != "N/A" and bpm_value != "":
+                        try:
+                            bpm = int(bpm_value)
+                        except:
+                            pass
+                if not key_scale and metadata.get('keyscale'):
+                    key_scale_value = metadata.get('keyscale', metadata.get('key_scale', ""))
+                    if key_scale_value != "N/A":
+                        key_scale = key_scale_value
+                if not time_signature and metadata.get('timesignature'):
+                    time_signature_value = metadata.get('timesignature', metadata.get('time_signature', ""))
+                    if time_signature_value != "N/A":
+                        time_signature = time_signature_value
+                if audio_duration is None or audio_duration <= 0:
+                    audio_duration_value = metadata.get('duration', -1)
+                    if audio_duration_value != "N/A" and audio_duration_value != "":
+                        try:
+                            audio_duration = float(audio_duration_value)
+                        except:
+                            pass
+    # Call generate_music and get results
+    result = dit_handler.generate_music(
+        captions=captions, lyrics=lyrics, bpm=bpm, key_scale=key_scale,
+        time_signature=time_signature, vocal_language=vocal_language,
+        inference_steps=inference_steps, guidance_scale=guidance_scale,
+        use_random_seed=random_seed_checkbox, seed=seed,
+        reference_audio=reference_audio, audio_duration=audio_duration,
+        batch_size=batch_size_input, src_audio=src_audio,
+        audio_code_string=audio_code_string_to_use,
+        repainting_start=repainting_start, repainting_end=repainting_end,
+        instruction=instruction_display_gen, audio_cover_strength=audio_cover_strength,
+        task_type=task_type, use_adg=use_adg,
+        cfg_interval_start=cfg_interval_start, cfg_interval_end=cfg_interval_end,
+        audio_format=audio_format, lm_temperature=lm_temperature,
+        progress=progress
+    )
+    # Extract results
+    first_audio, second_audio, all_audio_paths, generation_info, status_message, seed_value_for_ui, \
+        align_score_1, align_text_1, align_plot_1, align_score_2, align_text_2, align_plot_2 = result
+    # Extract LM timing from status if available and prepend to generation_info
+    if status:
+        import re
+        # Try to extract timing info from status using regex
+        # Expected format: "Phase1: X.XXs" and "Phase2: X.XXs"
+        phase1_match = re.search(r'Phase1:\s*([\d.]+)s', status)
+        phase2_match = re.search(r'Phase2:\s*([\d.]+)s', status)
+        if phase1_match or phase2_match:
+            lm_timing_section = "\n\n**🤖 LM Timing:**\n"
+            lm_total = 0.0
+            if phase1_match:
+                phase1_time = float(phase1_match.group(1))
+                lm_timing_section += f"  - Phase 1 (CoT Metadata): {phase1_time:.2f}s\n"
+                lm_total += phase1_time
+            if phase2_match:
+                phase2_time = float(phase2_match.group(1))
+                lm_timing_section += f"  - Phase 2 (Audio Codes): {phase2_time:.2f}s\n"
+                lm_total += phase2_time
+            if lm_total > 0:
+                lm_timing_section += f"  - Total LM Time: {lm_total:.2f}s\n"
+            generation_info = lm_timing_section + "\n" + generation_info
+    # Append LM-generated metadata to generation_info if available
+    if lm_generated_metadata:
+        metadata_lines = []
+        if lm_generated_metadata.get('bpm'):
+            metadata_lines.append(f"- **BPM:** {lm_generated_metadata['bpm']}")
+        if lm_generated_metadata.get('caption'):
+            metadata_lines.append(f"- **User Query Rewritten Caption:** {lm_generated_metadata['caption']}")
+        if lm_generated_metadata.get('duration'):
+            metadata_lines.append(f"- **Duration:** {lm_generated_metadata['duration']} seconds")
+        if lm_generated_metadata.get('keyscale'):
+            metadata_lines.append(f"- **KeyScale:** {lm_generated_metadata['keyscale']}")
+        if lm_generated_metadata.get('language'):
+            metadata_lines.append(f"- **Language:** {lm_generated_metadata['language']}")
+        if lm_generated_metadata.get('timesignature'):
+            metadata_lines.append(f"- **Time Signature:** {lm_generated_metadata['timesignature']}")
+        if metadata_lines:
+            metadata_section = "\n\n**🤖 LM-Generated Metadata:**\n" + "\n\n".join(metadata_lines)
+            generation_info = metadata_section + "\n\n" + generation_info
+    # Update audio codes in UI if LM generated them
+    codes_outputs = [""] * 8  # Codes for 8 components
+    if should_use_lm_batch and lm_generated_audio_codes_list:
+        # Batch mode: update individual codes inputs
+        for idx in range(min(len(lm_generated_audio_codes_list), 8)):
+            codes_outputs[idx] = lm_generated_audio_codes_list[idx]
+        # For single codes input, show first one
+        updated_audio_codes = lm_generated_audio_codes_list[0] if lm_generated_audio_codes_list else text2music_audio_code_string
+    else:
+        # Single mode: update main codes input
+        updated_audio_codes = lm_generated_audio_codes if lm_generated_audio_codes else text2music_audio_code_string
+    # AUTO-SCORING
+    score_displays = [""] * 8  # Scores for 8 components
+    if auto_score and all_audio_paths:
+        logger.info(f"Auto-scoring enabled, calculating quality scores for {batch_size_input} generated audios...")
+        # Determine which audio codes to use for scoring
+        if should_use_lm_batch and lm_generated_audio_codes_list:
+            codes_list = lm_generated_audio_codes_list
+        elif audio_code_string_to_use and isinstance(audio_code_string_to_use, list):
+            codes_list = audio_code_string_to_use
+        else:
+            # Single code string, replicate for all audios
+            codes_list = [audio_code_string_to_use] * len(all_audio_paths)
+        # Calculate scores only for actually generated audios (up to batch_size_input)
+        # Don't score beyond the actual batch size to avoid duplicates
+        actual_audios_to_score = min(len(all_audio_paths), int(batch_size_input))
+        for idx in range(actual_audios_to_score):
+            if idx < len(codes_list) and codes_list[idx]:
+                try:
+                    score_display = calculate_score_handler(
+                        llm_handler,
+                        codes_list[idx],
+                        captions,
+                        lyrics,
+                        lm_generated_metadata,
+                        bpm, key_scale, time_signature, audio_duration, vocal_language,
+                        score_scale
+                    )
+                    score_displays[idx] = score_display
+                    logger.info(f"Auto-scored audio {idx+1}")
+                except Exception as e:
+                    logger.error(f"Auto-scoring failed for audio {idx+1}: {e}")
+                    score_displays[idx] = f"❌ Auto-scoring failed: {str(e)}"
+    # Prepare audio outputs (up to 8)
+    audio_outputs = [None] * 8
+    for idx in range(min(len(all_audio_paths), 8)):
+        audio_outputs[idx] = all_audio_paths[idx]
+    return (
+        audio_outputs[0],  # generated_audio_1
+        audio_outputs[1],  # generated_audio_2
+        audio_outputs[2],  # generated_audio_3
+        audio_outputs[3],  # generated_audio_4
+        audio_outputs[4],  # generated_audio_5
+        audio_outputs[5],  # generated_audio_6
+        audio_outputs[6],  # generated_audio_7
+        audio_outputs[7],  # generated_audio_8
+        all_audio_paths,   # generated_audio_batch
+        generation_info,
+        status_message,
+        seed_value_for_ui,
+        align_score_1,
+        align_text_1,
+        align_plot_1,
+        align_score_2,
+        align_text_2,
+        align_plot_2,
+        score_displays[0],  # score_display_1
+        score_displays[1],  # score_display_2
+        score_displays[2],  # score_display_3
+        score_displays[3],  # score_display_4
+        score_displays[4],  # score_display_5
+        score_displays[5],  # score_display_6
+        score_displays[6],  # score_display_7
+        score_displays[7],  # score_display_8
+        updated_audio_codes,  # Update main audio codes in UI
+        codes_outputs[0],  # text2music_audio_code_string_1
+        codes_outputs[1],  # text2music_audio_code_string_2
+        codes_outputs[2],  # text2music_audio_code_string_3
+        codes_outputs[3],  # text2music_audio_code_string_4
+        codes_outputs[4],  # text2music_audio_code_string_5
+        codes_outputs[5],  # text2music_audio_code_string_6
+        codes_outputs[6],  # text2music_audio_code_string_7
+        codes_outputs[7],  # text2music_audio_code_string_8
+        lm_generated_metadata,  # Store metadata for "Send to src audio" buttons
+        is_format_caption,  # Keep is_format_caption unchanged
+    )
+def calculate_score_handler(llm_handler, audio_codes_str, caption, lyrics, lm_metadata, bpm, key_scale, time_signature, audio_duration, vocal_language, score_scale):
+    """
+    Calculate PMI-based quality score for generated audio.
+    PMI (Pointwise Mutual Information) removes condition bias:
+    score = log P(condition|codes) - log P(condition)
+    Args:
+        llm_handler: LLM handler instance
+        audio_codes_str: Generated audio codes string
+        caption: Caption text used for generation
+        lyrics: Lyrics text used for generation
+        lm_metadata: LM-generated metadata dictionary (from CoT generation)
+        bpm: BPM value
+        key_scale: Key scale value
+        time_signature: Time signature value
+        audio_duration: Audio duration value
+        vocal_language: Vocal language value
+        score_scale: Sensitivity scale parameter
+    Returns:
+        Score display string
+    """
+    from acestep.test_time_scaling import calculate_pmi_score_per_condition
+    if not llm_handler.llm_initialized:
+        return t("messages.lm_not_initialized")
+    if not audio_codes_str or not audio_codes_str.strip():
+        return t("messages.no_codes")
+    try:
+        # Build metadata dictionary from both LM metadata and user inputs
+        metadata = {}
+        # Priority 1: Use LM-generated metadata if available
+        if lm_metadata and isinstance(lm_metadata, dict):
+            metadata.update(lm_metadata)
+        # Priority 2: Add user-provided metadata (if not already in LM metadata)
+        if bpm is not None and 'bpm' not in metadata:
+            try:
+                metadata['bpm'] = int(bpm)
+            except:
+                pass
+        if caption and 'caption' not in metadata:
+            metadata['caption'] = caption
+        if audio_duration is not None and audio_duration > 0 and 'duration' not in metadata:
+            try:
+                metadata['duration'] = int(audio_duration)
+            except:
+                pass
+        if key_scale and key_scale.strip() and 'keyscale' not in metadata:
+            metadata['keyscale'] = key_scale.strip()
+        if vocal_language and vocal_language.strip() and 'language' not in metadata:
+            metadata['language'] = vocal_language.strip()
+        if time_signature and time_signature.strip() and 'timesignature' not in metadata:
+            metadata['timesignature'] = time_signature.strip()
+        # Calculate per-condition scores with appropriate metrics
+        # - Metadata fields (bpm, duration, etc.): Top-k recall
+        # - Caption and lyrics: PMI (normalized)
+        scores_per_condition, global_score, status = calculate_pmi_score_per_condition(
+            llm_handler=llm_handler,
+            audio_codes=audio_codes_str,
+            caption=caption or "",
+            lyrics=lyrics or "",
+            metadata=metadata if metadata else None,
+            temperature=1.0,
+            topk=10,
+            score_scale=score_scale
+        )
+        # Format display string with per-condition breakdown
+        if global_score == 0.0 and not scores_per_condition:
+            return t("messages.score_failed", error=status)
+        else:
+            # Build per-condition scores display
+            condition_lines = []
+            for condition_name, score_value in sorted(scores_per_condition.items()):
+                condition_lines.append(
+                    f"  • {condition_name}: {score_value:.4f}"
+                )
+            conditions_display = "\n".join(condition_lines) if condition_lines else "  (no conditions)"
+            return (
+                f"✅ Global Quality Score: {global_score:.4f} (0-1, higher=better)\n\n"
+                f"📊 Per-Condition Scores (0-1):\n{conditions_display}\n\n"
+                f"Note: Metadata uses Top-k Recall, Caption/Lyrics use PMI\n"
+            )
+    except Exception as e:
+        import traceback
+        error_msg = t("messages.score_error", error=str(e)) + f"\n{traceback.format_exc()}"
+        return error_msg
+def calculate_score_handler_with_selection(llm_handler, sample_idx, score_scale, current_batch_index, batch_queue):
+    """
+    Score one sample of a stored batch using the parameters saved with that batch.
+    Every scoring input except score_scale is read from batch_queue rather than
+    from the live UI, so the score reflects what was actually generated.
+    Args:
+        llm_handler: LLM handler instance, forwarded to calculate_score_handler
+        sample_idx: 1-based number of the sample to score
+        score_scale: Sensitivity scale, forwarded to calculate_score_handler
+        current_batch_index: Key of the batch inside batch_queue
+        batch_queue: Mapping of batch index to stored batch data
+    Returns:
+        Tuple of (score display text, batch_queue). The text is also cached in the
+        batch's "scores" list at position sample_idx - 1.
+    """
+    if current_batch_index not in batch_queue:
+        return t("messages.scoring_failed"), batch_queue
+    entry = batch_queue[current_batch_index]
+    saved = entry.get("generation_params", {})
+    slot = sample_idx - 1
+    # Choose the audio codes that belong to the requested sample
+    codes = entry.get("codes", "")
+    if entry.get("allow_lm_batch", False) and isinstance(codes, list):
+        # Batch mode: one code string per sample; an out-of-range slot scores without codes
+        sample_codes = codes[slot] if 0 <= slot < len(codes) else ""
+    elif isinstance(codes, str):
+        # Single mode: every sample shares the same code string
+        sample_codes = codes
+    else:
+        sample_codes = ""
+    # Score with the historical parameters of this batch
+    score_display = calculate_score_handler(
+        llm_handler,
+        sample_codes,
+        saved.get("captions", ""),
+        saved.get("lyrics", ""),
+        entry.get("lm_generated_metadata", None),
+        saved.get("bpm"),
+        saved.get("key_scale", ""),
+        saved.get("time_signature", ""),
+        saved.get("audio_duration", -1),
+        saved.get("vocal_language", ""),
+        score_scale
+    )
+    # Cache the result next to the batch so navigation can restore it later
+    entry.setdefault("scores", [""] * 8)[slot] = score_display
+    return score_display, batch_queue
+def capture_current_params(
+    captions, lyrics, bpm, key_scale, time_signature, vocal_language,
+    inference_steps, guidance_scale, random_seed_checkbox, seed,
+    reference_audio, audio_duration, batch_size_input, src_audio,
+    text2music_audio_code_string, repainting_start, repainting_end,
+    instruction_display_gen, audio_cover_strength, task_type,
+    use_adg, cfg_interval_start, cfg_interval_end, audio_format, lm_temperature,
+    think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
+    use_cot_metas, use_cot_caption, use_cot_language,
+    constrained_decoding_debug, allow_lm_batch, auto_score, score_scale, lm_batch_chunk_size,
+    track_name, complete_track_classes
+):
+    """Snapshot the current UI parameters for the next batch generation.
+    Every argument is stored under its own name, with two deliberate overrides
+    for AutoGen batches:
+    - "random_seed_checkbox" is always True, whatever was passed in
+    - "text2music_audio_code_string" is always "", so the next batch does not
+      reuse the current audio codes
+    Returns:
+        Dict keyed by parameter name, in signature order.
+    """
+    snapshot = dict(
+        captions=captions, lyrics=lyrics, bpm=bpm, key_scale=key_scale,
+        time_signature=time_signature, vocal_language=vocal_language,
+        inference_steps=inference_steps, guidance_scale=guidance_scale,
+        random_seed_checkbox=random_seed_checkbox, seed=seed,
+        reference_audio=reference_audio, audio_duration=audio_duration,
+        batch_size_input=batch_size_input, src_audio=src_audio,
+        text2music_audio_code_string=text2music_audio_code_string,
+        repainting_start=repainting_start, repainting_end=repainting_end,
+        instruction_display_gen=instruction_display_gen,
+        audio_cover_strength=audio_cover_strength, task_type=task_type,
+        use_adg=use_adg, cfg_interval_start=cfg_interval_start,
+        cfg_interval_end=cfg_interval_end, audio_format=audio_format,
+        lm_temperature=lm_temperature, think_checkbox=think_checkbox,
+        lm_cfg_scale=lm_cfg_scale, lm_top_k=lm_top_k, lm_top_p=lm_top_p,
+        lm_negative_prompt=lm_negative_prompt, use_cot_metas=use_cot_metas,
+        use_cot_caption=use_cot_caption, use_cot_language=use_cot_language,
+        constrained_decoding_debug=constrained_decoding_debug,
+        allow_lm_batch=allow_lm_batch, auto_score=auto_score,
+        score_scale=score_scale, lm_batch_chunk_size=lm_batch_chunk_size,
+        track_name=track_name, complete_track_classes=complete_track_classes,
+    )
+    # Always use random for AutoGen batches
+    snapshot["random_seed_checkbox"] = True
+    # CLEAR codes for next batch! Let LM regenerate or DiT use new seeds
+    snapshot["text2music_audio_code_string"] = ""
+    return snapshot
+def generate_with_batch_management(
+    dit_handler, llm_handler,
+    captions, lyrics, bpm, key_scale, time_signature, vocal_language,
+    inference_steps, guidance_scale, random_seed_checkbox, seed,
+    reference_audio, audio_duration, batch_size_input, src_audio,
+    text2music_audio_code_string, repainting_start, repainting_end,
+    instruction_display_gen, audio_cover_strength, task_type,
+    use_adg, cfg_interval_start, cfg_interval_end, audio_format, lm_temperature,
+    think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
+    use_cot_metas, use_cot_caption, use_cot_language, is_format_caption,
+    constrained_decoding_debug,
+    allow_lm_batch,
+    auto_score,
+    score_scale,
+    lm_batch_chunk_size,
+    track_name,
+    complete_track_classes,
+    autogen_checkbox,
+    current_batch_index,
+    total_batches,
+    batch_queue,
+    generation_params_state,
+    progress=gr.Progress(track_tqdm=True)
+):
+    """
+    Run one generation and record it in the batch queue.
+    Delegates to generate_with_progress, then stores the produced audio paths,
+    codes and the parameters that were used under current_batch_index.
+    Returns:
+        The generate_with_progress result tuple followed by: current_batch_index,
+        total_batches, batch_queue, the parameters for the next batch, the batch
+        indicator text, the previous/next button updates, the next-batch status
+        text and a final interactive=True update.
+    """
+    result = generate_with_progress(
+        dit_handler, llm_handler,
+        captions, lyrics, bpm, key_scale, time_signature, vocal_language,
+        inference_steps, guidance_scale, random_seed_checkbox, seed,
+        reference_audio, audio_duration, batch_size_input, src_audio,
+        text2music_audio_code_string, repainting_start, repainting_end,
+        instruction_display_gen, audio_cover_strength, task_type,
+        use_adg, cfg_interval_start, cfg_interval_end, audio_format, lm_temperature,
+        think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
+        use_cot_metas, use_cot_caption, use_cot_language, is_format_caption,
+        constrained_decoding_debug,
+        allow_lm_batch,
+        auto_score,
+        score_scale,
+        lm_batch_chunk_size,
+        progress
+    )
+    # Positions below index into the generate_with_progress output tuple
+    all_audio_paths = result[8]  # generated_audio_batch
+    generation_info = result[9]
+    seed_value_for_ui = result[11]
+    # NOTE(review): index 34 is read both as lm_metadata_state and as the 8th
+    # per-sample code below - confirm against generate_with_progress outputs
+    lm_generated_metadata = result[34]
+    per_sample_codes = list(result[27:35])
+    # Batch mode keeps one code string per sample, otherwise the single shared one
+    if allow_lm_batch and batch_size_input >= 2:
+        codes_to_store = per_sample_codes[:int(batch_size_input)]
+    else:
+        codes_to_store = result[26]
+    # Parameters exactly as used for this batch (kept for history / scoring)
+    saved_params = dict(
+        captions=captions, lyrics=lyrics, bpm=bpm, key_scale=key_scale,
+        time_signature=time_signature, vocal_language=vocal_language,
+        inference_steps=inference_steps, guidance_scale=guidance_scale,
+        random_seed_checkbox=random_seed_checkbox, seed=seed,
+        reference_audio=reference_audio, audio_duration=audio_duration,
+        batch_size_input=batch_size_input, src_audio=src_audio,
+        text2music_audio_code_string=text2music_audio_code_string,
+        repainting_start=repainting_start, repainting_end=repainting_end,
+        instruction_display_gen=instruction_display_gen,
+        audio_cover_strength=audio_cover_strength, task_type=task_type,
+        use_adg=use_adg, cfg_interval_start=cfg_interval_start,
+        cfg_interval_end=cfg_interval_end, audio_format=audio_format,
+        lm_temperature=lm_temperature, think_checkbox=think_checkbox,
+        lm_cfg_scale=lm_cfg_scale, lm_top_k=lm_top_k, lm_top_p=lm_top_p,
+        lm_negative_prompt=lm_negative_prompt, use_cot_metas=use_cot_metas,
+        use_cot_caption=use_cot_caption, use_cot_language=use_cot_language,
+        constrained_decoding_debug=constrained_decoding_debug,
+        allow_lm_batch=allow_lm_batch, auto_score=auto_score,
+        score_scale=score_scale, lm_batch_chunk_size=lm_batch_chunk_size,
+        track_name=track_name, complete_track_classes=complete_track_classes,
+    )
+    # Next batch reuses everything except the codes (cleared) and the seed mode (random)
+    next_params = {
+        **saved_params,
+        "text2music_audio_code_string": "",
+        "random_seed_checkbox": True,
+    }
+    batch_queue = store_batch_in_queue(
+        batch_queue,
+        current_batch_index,
+        all_audio_paths,
+        generation_info,
+        seed_value_for_ui,
+        codes=codes_to_store,
+        allow_lm_batch=allow_lm_batch,
+        batch_size=int(batch_size_input),
+        generation_params=saved_params,
+        lm_generated_metadata=lm_generated_metadata,
+        status="completed"
+    )
+    # The batch count only ever grows
+    total_batches = max(total_batches, current_batch_index + 1)
+    batch_indicator_text = update_batch_indicator(current_batch_index, total_batches)
+    can_go_previous, can_go_next = update_navigation_buttons(current_batch_index, total_batches)
+    next_batch_status_text = t("messages.autogen_enabled") if autogen_checkbox else ""
+    # Original generation outputs first, batch management state appended after
+    return result + (
+        current_batch_index,
+        total_batches,
+        batch_queue,
+        next_params,
+        batch_indicator_text,
+        gr.update(interactive=can_go_previous),
+        gr.update(interactive=can_go_next),
+        next_batch_status_text,
+        gr.update(interactive=True),
+    )
+def generate_next_batch_background(
+    dit_handler,
+    llm_handler,
+    autogen_enabled,
+    generation_params,
+    current_batch_index,
+    total_batches,
+    batch_queue,
+    is_format_caption,
+    progress=gr.Progress(track_tqdm=True)
+):
+    """
+    Generate the batch after current_batch_index when AutoGen is enabled.
+    Args:
+        dit_handler: Forwarded to generate_with_progress
+        llm_handler: Forwarded to generate_with_progress
+        autogen_enabled: When falsy nothing is generated
+        generation_params: Dict of saved generation parameters for the next batch
+        current_batch_index: Index of the batch currently shown
+        total_batches: Current number of batches
+        batch_queue: Mapping of batch index to stored batch data
+        is_format_caption: Forwarded to generate_with_progress
+        progress: Gradio progress tracker
+    Returns:
+        Tuple of (batch_queue, total_batches, status text, next-button update).
+        On failure the batch is recorded in the queue with status "error" and the
+        next button is disabled.
+    """
+    # Early return if AutoGen not enabled
+    if not autogen_enabled:
+        return (
+            batch_queue,
+            total_batches,
+            "",
+            gr.update(interactive=False),
+        )
+    next_batch_idx = current_batch_index + 1
+    # Nothing to do when the next batch was already generated successfully
+    if next_batch_idx in batch_queue and batch_queue[next_batch_idx].get("status") == "completed":
+        return (
+            batch_queue,
+            total_batches,
+            t("messages.batch_ready", n=next_batch_idx + 1),
+            gr.update(interactive=True),
+        )
+    # NOTE(review): the count is bumped before generation and stays bumped when
+    # generation fails - confirm this is what the navigation code expects
+    total_batches = next_batch_idx + 1
+    gr.Info(t("messages.batch_generating", n=next_batch_idx + 1))
+    # Work on a copy so the caller's stored parameters are not mutated
+    params = generation_params.copy()
+    # DEBUG LOGGING: Log all parameters used for background generation
+    logger.info(f"========== BACKGROUND GENERATION BATCH {next_batch_idx + 1} ==========")
+    logger.info(f"Parameters used for background generation:")
+    logger.info(f"  - captions: {params.get('captions', 'N/A')}")
+    logger.info(f"  - lyrics: {params.get('lyrics', 'N/A')[:50]}..." if params.get('lyrics') else "  - lyrics: N/A")
+    logger.info(f"  - bpm: {params.get('bpm')}")
+    logger.info(f"  - batch_size_input: {params.get('batch_size_input')}")
+    logger.info(f"  - allow_lm_batch: {params.get('allow_lm_batch')}")
+    logger.info(f"  - think_checkbox: {params.get('think_checkbox')}")
+    logger.info(f"  - lm_temperature: {params.get('lm_temperature')}")
+    logger.info(f"  - track_name: {params.get('track_name')}")
+    logger.info(f"  - complete_track_classes: {params.get('complete_track_classes')}")
+    logger.info(f"  - text2music_audio_code_string: {'<CLEARED>' if params.get('text2music_audio_code_string') == '' else 'HAS_VALUE'}")
+    logger.info(f"=========================================================")
+    # Fallback values for keys that are absent from the stored parameters.
+    # setdefault only fills MISSING keys; a key stored as None stays None.
+    background_defaults = (
+        ("captions", ""),
+        ("lyrics", ""),
+        ("bpm", None),
+        ("key_scale", ""),
+        ("time_signature", ""),
+        ("vocal_language", "unknown"),
+        ("inference_steps", 8),
+        ("guidance_scale", 7.0),
+        ("random_seed_checkbox", True),
+        ("seed", "-1"),
+        ("reference_audio", None),
+        ("audio_duration", -1),
+        ("batch_size_input", 2),
+        ("src_audio", None),
+        ("text2music_audio_code_string", ""),
+        ("repainting_start", 0.0),
+        ("repainting_end", -1),
+        ("instruction_display_gen", ""),
+        ("audio_cover_strength", 1.0),
+        ("task_type", "text2music"),
+        ("use_adg", False),
+        ("cfg_interval_start", 0.0),
+        ("cfg_interval_end", 1.0),
+        ("audio_format", "mp3"),
+        ("lm_temperature", 0.85),
+        ("think_checkbox", True),
+        ("lm_cfg_scale", 2.0),
+        ("lm_top_k", 0),
+        ("lm_top_p", 0.9),
+        ("lm_negative_prompt", "NO USER INPUT"),
+        ("use_cot_metas", True),
+        ("use_cot_caption", True),
+        ("use_cot_language", True),
+        ("constrained_decoding_debug", False),
+        ("allow_lm_batch", True),
+        ("auto_score", False),
+        ("score_scale", 0.5),
+        ("lm_batch_chunk_size", 8),
+        ("track_name", None),
+        ("complete_track_classes", []),
+    )
+    # These two are stored with the batch but are not generate_with_progress arguments
+    stored_only = ("track_name", "complete_track_classes")
+    try:
+        for name, fallback in background_defaults:
+            params.setdefault(name, fallback)
+        # Call generate_with_progress with the saved parameters
+        generation_kwargs = {
+            name: params.get(name)
+            for name, _ in background_defaults
+            if name not in stored_only
+        }
+        result = generate_with_progress(
+            dit_handler,
+            llm_handler,
+            is_format_caption=is_format_caption,
+            progress=progress,
+            **generation_kwargs
+        )
+        # Positions below index into the generate_with_progress output tuple
+        all_audio_paths = result[8]  # generated_audio_batch
+        generation_info = result[9]
+        seed_value_for_ui = result[11]
+        # NOTE(review): index 34 is read both as lm_metadata_state and as the 8th
+        # per-sample code below - confirm against generate_with_progress outputs
+        lm_generated_metadata = result[34]
+        generated_codes_single = result[26]
+        generated_codes_batch = [result[27], result[28], result[29], result[30], result[31], result[32], result[33], result[34]]
+        # Batch mode keeps one code string per sample, otherwise the single shared one
+        batch_size = params.get("batch_size_input", 2)
+        allow_lm_batch = params.get("allow_lm_batch", False)
+        if allow_lm_batch and batch_size >= 2:
+            codes_to_store = generated_codes_batch[:int(batch_size)]
+        else:
+            codes_to_store = generated_codes_single
+        # DEBUG LOGGING: Log codes extraction and storage
+        logger.info(f"Codes extraction for Batch {next_batch_idx + 1}:")
+        logger.info(f"  - allow_lm_batch: {allow_lm_batch}")
+        logger.info(f"  - batch_size: {batch_size}")
+        logger.info(f"  - generated_codes_single exists: {bool(generated_codes_single)}")
+        if isinstance(codes_to_store, list):
+            logger.info(f"  - codes_to_store: LIST with {len(codes_to_store)} items")
+            for idx, code in enumerate(codes_to_store):
+                logger.info(f"    * Sample {idx + 1}: {len(code) if code else 0} chars")
+        else:
+            logger.info(f"  - codes_to_store: STRING with {len(codes_to_store) if codes_to_store else 0} chars")
+        # Store next batch in queue with codes, batch settings, and ALL generation params
+        batch_queue = store_batch_in_queue(
+            batch_queue,
+            next_batch_idx,
+            all_audio_paths,
+            generation_info,
+            seed_value_for_ui,
+            codes=codes_to_store,
+            allow_lm_batch=allow_lm_batch,
+            batch_size=int(batch_size),
+            generation_params=params,
+            lm_generated_metadata=lm_generated_metadata,
+            status="completed"
+        )
+        logger.info(f"Batch {next_batch_idx + 1} stored in queue successfully")
+        # Enable next button now that batch is ready
+        return (
+            batch_queue,
+            total_batches,
+            t("messages.batch_ready", n=next_batch_idx + 1),
+            gr.update(interactive=True),
+        )
+    except Exception as e:
+        # Top-level boundary for background work: report, log and record the
+        # failure instead of letting it propagate into the UI event chain
+        import traceback
+        error_msg = t("messages.batch_failed", error=str(e))
+        gr.Warning(error_msg)
+        # Previously the traceback only lived in the queue entry; log it as well
+        logger.exception(f"Background generation failed for batch {next_batch_idx + 1}")
+        # Mark batch as failed in queue
+        batch_queue[next_batch_idx] = {
+            "status": "error",
+            "error": str(e),
+            "traceback": traceback.format_exc()
+        }
+        return (
+            batch_queue,
+            total_batches,
+            error_msg,
+            gr.update(interactive=False),
+        )
+def navigate_to_previous_batch(current_batch_index, batch_queue):
+    """Show the batch before the current one (Result View Only - Never touches Input UI)
+    Returns 24 values: 8 audio slots, the audio path list, the generation info
+    text, the new batch index, the batch indicator text, the previous/next button
+    updates, a status message, 8 score displays and a final interactive=True
+    update. When navigation is not possible, 24 no-op updates are returned.
+    """
+    if current_batch_index <= 0:
+        gr.Warning(t("messages.at_first_batch"))
+        return [gr.update()] * 24
+    target_index = current_batch_index - 1
+    if target_index not in batch_queue:
+        gr.Warning(t("messages.batch_not_found", n=target_index + 1))
+        return [gr.update()] * 24
+    entry = batch_queue[target_index]
+    audio_paths = entry.get("audio_paths", [])
+    info_text = entry.get("generation_info", "")
+    # Exactly 8 audio slots: stored paths first, padded with None
+    shown_paths = audio_paths[:8]
+    audio_slots = [None] * 8
+    audio_slots[:len(shown_paths)] = shown_paths
+    # Going backwards, the batch count is taken from the queue itself
+    batch_count = len(batch_queue)
+    indicator = update_batch_indicator(target_index, batch_count)
+    can_go_previous, can_go_next = update_navigation_buttons(target_index, batch_count)
+    # Restore cached score texts; an empty or missing list means "no scores yet"
+    scores = entry.get("scores", [""] * 8) or [""] * 8
+    return (
+        *audio_slots,
+        audio_paths, info_text, target_index, indicator,
+        gr.update(interactive=can_go_previous), gr.update(interactive=can_go_next),
+        t("messages.viewing_batch", n=target_index + 1),
+        *(scores[i] for i in range(8)),
+        gr.update(interactive=True),
+    )
+def navigate_to_next_batch(autogen_enabled, current_batch_index, total_batches, batch_queue):
+    """Show the batch after the current one (Result View Only - Never touches Input UI)
+    Returns 25 values: 8 audio slots, the audio path list, the generation info
+    text, the new batch index, the batch indicator text, the previous/next button
+    updates, a status message, the next-batch status text, 8 score displays and a
+    final interactive=True update. When navigation is not possible, 25 no-op
+    updates are returned.
+    """
+    if current_batch_index >= total_batches - 1:
+        gr.Warning(t("messages.at_last_batch"))
+        return [gr.update()] * 25
+    target_index = current_batch_index + 1
+    if target_index not in batch_queue:
+        gr.Warning(t("messages.batch_not_found", n=target_index + 1))
+        return [gr.update()] * 25
+    entry = batch_queue[target_index]
+    audio_paths = entry.get("audio_paths", [])
+    info_text = entry.get("generation_info", "")
+    # Exactly 8 audio slots: stored paths first, padded with None
+    shown_paths = audio_paths[:8]
+    audio_slots = [None] * 8
+    audio_slots[:len(shown_paths)] = shown_paths
+    indicator = update_batch_indicator(target_index, total_batches)
+    can_go_previous, can_go_next = update_navigation_buttons(target_index, total_batches)
+    # Announce background generation only while looking at the newest batch
+    # NOTE(review): this message is not routed through t() like the others - confirm
+    if autogen_enabled and target_index == total_batches - 1:
+        next_batch_status_text = "🔄 AutoGen will generate next batch in background..."
+    else:
+        next_batch_status_text = ""
+    # Restore cached score texts; an empty or missing list means "no scores yet"
+    scores = entry.get("scores", [""] * 8) or [""] * 8
+    return (
+        *audio_slots,
+        audio_paths, info_text, target_index, indicator,
+        gr.update(interactive=can_go_previous), gr.update(interactive=can_go_next),
+        t("messages.viewing_batch", n=target_index + 1), next_batch_status_text,
+        *(scores[i] for i in range(8)),
+        gr.update(interactive=True),
+    )
+def restore_batch_parameters(current_batch_index, batch_queue):
+    """
+    Restore parameters from currently viewed batch to Input UI.
+    This is the bridge allowing users to "reuse" historical settings.
+    Args:
+        current_batch_index: Key of the batch inside batch_queue
+        batch_queue: Mapping of batch index to stored batch data
+    Returns:
+        28 values: 9 code strings (main + samples 1-8) followed by captions, lyrics,
+        bpm, key_scale, time_signature, vocal_language, audio_duration,
+        batch_size_input, inference_steps, lm_temperature, lm_cfg_scale, lm_top_k,
+        lm_top_p, think_checkbox, use_cot_caption, use_cot_language, allow_lm_batch,
+        track_name and complete_track_classes. When the batch is unknown, 28 no-op
+        updates are returned instead.
+    """
+    if current_batch_index not in batch_queue:
+        gr.Warning(t("messages.no_batch_data"))
+        # Must have the same length as the success tuple below (it used to be 29).
+        # NOTE(review): confirm 28 matches the outputs list wired to this handler
+        return [gr.update()] * 28
+    batch_data = batch_queue[current_batch_index]
+    params = batch_data.get("generation_params", {})
+    # Extract all parameters with defaults
+    captions = params.get("captions", "")
+    lyrics = params.get("lyrics", "")
+    bpm = params.get("bpm", None)
+    key_scale = params.get("key_scale", "")
+    time_signature = params.get("time_signature", "")
+    vocal_language = params.get("vocal_language", "unknown")
+    audio_duration = params.get("audio_duration", -1)
+    batch_size_input = params.get("batch_size_input", 2)
+    inference_steps = params.get("inference_steps", 8)
+    lm_temperature = params.get("lm_temperature", 0.85)
+    lm_cfg_scale = params.get("lm_cfg_scale", 2.0)
+    lm_top_k = params.get("lm_top_k", 0)
+    lm_top_p = params.get("lm_top_p", 0.9)
+    think_checkbox = params.get("think_checkbox", True)
+    use_cot_caption = params.get("use_cot_caption", True)
+    use_cot_language = params.get("use_cot_language", True)
+    allow_lm_batch = params.get("allow_lm_batch", True)
+    track_name = params.get("track_name", None)
+    complete_track_classes = params.get("complete_track_classes", [])
+    # Extract and process codes
+    stored_codes = batch_data.get("codes", "")
+    # Code layout is decided with a False fallback, unlike the restored UI value above
+    stored_allow_lm_batch = params.get("allow_lm_batch", False)
+    codes_outputs = [""] * 9  # [Main, 1-8]
+    if stored_codes:
+        if stored_allow_lm_batch and isinstance(stored_codes, list):
+            # Batch mode: populate codes 1-8, main shows first
+            codes_outputs[0] = stored_codes[0]
+            per_sample = stored_codes[:8]
+            codes_outputs[1:1 + len(per_sample)] = per_sample
+        elif isinstance(stored_codes, str):
+            # Single mode: populate main, clear 1-8
+            codes_outputs[0] = stored_codes
+        else:
+            # Non-string codes stored without batch mode: main shows the first entry
+            codes_outputs[0] = stored_codes[0]
+    gr.Info(t("messages.params_restored", n=current_batch_index + 1))
+    return (
+        codes_outputs[0], codes_outputs[1], codes_outputs[2], codes_outputs[3],
+        codes_outputs[4], codes_outputs[5], codes_outputs[6], codes_outputs[7],
+        codes_outputs[8], captions, lyrics, bpm, key_scale, time_signature,
+        vocal_language, audio_duration, batch_size_input, inference_steps,
+        lm_temperature, lm_cfg_scale, lm_top_k, lm_top_p, think_checkbox,
+        use_cot_caption, use_cot_language, allow_lm_batch,
+        track_name, complete_track_classes
+    )

acestep/gradio_ui/i18n.py ADDED Viewed

	@@ -0,0 +1,152 @@

+"""
+Internationalization (i18n) module for Gradio UI
+Supports multiple languages with easy translation management
+"""
+import os
+import json
+from typing import Dict, Optional
+class I18n:
+    """Internationalization handler
+    Loads every <lang>.json file from the "i18n" directory next to this module and
+    resolves dot-separated keys against the current language, falling back to
+    English and finally to the key itself.
+    """
+    def __init__(self, default_language: str = "en"):
+        """
+        Initialize i18n handler
+        Args:
+            default_language: Default language code (en, zh, ja, etc.)
+        """
+        self.current_language = default_language
+        self.translations: Dict[str, Dict[str, str]] = {}
+        self._load_all_translations()
+    def _load_all_translations(self):
+        """Load all translation files from i18n directory"""
+        current_file = os.path.abspath(__file__)
+        module_dir = os.path.dirname(current_file)
+        i18n_dir = os.path.join(module_dir, "i18n")
+        if not os.path.exists(i18n_dir):
+            # Create i18n directory if it doesn't exist
+            os.makedirs(i18n_dir)
+            return
+        # Load all JSON files in i18n directory
+        for filename in os.listdir(i18n_dir):
+            if filename.endswith(".json"):
+                lang_code = filename[:-5]  # Remove .json extension
+                filepath = os.path.join(i18n_dir, filename)
+                try:
+                    with open(filepath, 'r', encoding='utf-8') as f:
+                        self.translations[lang_code] = json.load(f)
+                except Exception as e:
+                    # Best effort: one broken file must not disable the other languages
+                    print(f"Error loading translation file {filename}: {e}")
+    def set_language(self, language: str):
+        """Set current language; unknown codes are reported and ignored"""
+        if language in self.translations:
+            self.current_language = language
+        else:
+            print(f"Warning: Language '{language}' not found, using default")
+    def t(self, key: str, **kwargs) -> str:
+        """
+        Translate a key to current language
+        Args:
+            key: Translation key (dot-separated for nested keys)
+            **kwargs: Optional format parameters
+        Returns:
+            Translated string. If the text cannot be formatted with kwargs
+            (missing name, positional placeholder, stray brace), the unformatted
+            text is returned instead of raising.
+        """
+        # Get translation from current language
+        translation = self._get_nested_value(
+            self.translations.get(self.current_language, {}),
+            key
+        )
+        # Fallback to English if not found
+        if translation is None:
+            translation = self._get_nested_value(
+                self.translations.get('en', {}),
+                key
+            )
+        # Final fallback to key itself
+        if translation is None:
+            translation = key
+        # Apply formatting if kwargs provided
+        if kwargs:
+            try:
+                translation = translation.format(**kwargs)
+            except (KeyError, IndexError, ValueError):
+                # KeyError: placeholder name not supplied
+                # IndexError: translation uses a positional placeholder such as {0}
+                # ValueError: translation contains an unbalanced brace
+                # A bad translation string must not crash a UI handler
+                pass
+        return translation
+    def _get_nested_value(self, data: dict, key: str) -> Optional[str]:
+        """
+        Get nested dictionary value using dot notation
+        Args:
+            data: Dictionary to search
+            key: Dot-separated key (e.g., "section.subsection.key")
+        Returns:
+            Value if found and it is a string, None otherwise
+        """
+        keys = key.split('.')
+        current = data
+        for k in keys:
+            if isinstance(current, dict) and k in current:
+                current = current[k]
+            else:
+                return None
+        # A key that stops on a section (dict) is not a translation
+        return current if isinstance(current, str) else None
+    def get_available_languages(self) -> list:
+        """Get list of available language codes"""
+        return list(self.translations.keys())
+# Process-wide I18n singleton, created lazily by get_i18n()
+_i18n_instance: Optional[I18n] = None
+def get_i18n(language: Optional[str] = None) -> I18n:
+    """
+    Return the shared I18n instance, creating it on first use.
+    Args:
+        language: Optional language code. On first use it becomes the initial
+            language ("en" when omitted); on later calls it is applied through
+            set_language.
+    Returns:
+        I18n instance
+    """
+    global _i18n_instance
+    if _i18n_instance is not None:
+        # Already created: only switch language when one was requested
+        if language is not None:
+            _i18n_instance.set_language(language)
+        return _i18n_instance
+    _i18n_instance = I18n(default_language=language or "en")
+    return _i18n_instance
+def t(key: str, **kwargs) -> str:
+    """
+    Translate a key with the shared I18n instance.
+    Args:
+        key: Translation key
+        **kwargs: Optional format parameters
+    Returns:
+        Translated string
+    """
+    translator = get_i18n()
+    return translator.t(key, **kwargs)

acestep/gradio_ui/i18n/en.json ADDED Viewed

	@@ -0,0 +1,209 @@

+{
+  "app": {
+    "title": "🎛️ ACE-Step V1.5 Playground💡",
+    "subtitle": "Pushing the Boundaries of Open-Source Music Generation"
+  },
+  "dataset": {
+    "title": "📊 Dataset Explorer",
+    "dataset_label": "Dataset",
+    "dataset_info": "Choose dataset to explore",
+    "import_btn": "📥 Import Dataset",
+    "search_type_label": "Search Type",
+    "search_type_info": "How to find items",
+    "search_value_label": "Search Value",
+    "search_value_placeholder": "Enter keys or index (leave empty for random)",
+    "search_value_info": "Keys: exact match, Index: 0 to dataset size-1",
+    "instruction_label": "📝 Instruction",
+    "instruction_placeholder": "No instruction available",
+    "metadata_title": "📋 Item Metadata (JSON)",
+    "metadata_label": "Complete Item Information",
+    "source_audio": "Source Audio",
+    "target_audio": "Target Audio",
+    "reference_audio": "Reference Audio",
+    "get_item_btn": "🔍 Get Item",
+    "use_src_checkbox": "Use Source Audio from Dataset",
+    "use_src_info": "Check to use the source audio from dataset",
+    "data_status_label": "📊 Data Status",
+    "data_status_default": "❌ No dataset imported",
+    "autofill_btn": "📋 Auto-fill Generation Form"
+  },
+  "service": {
+    "title": "🔧 Service Configuration",
+    "checkpoint_label": "Checkpoint File",
+    "checkpoint_info": "Select a trained model checkpoint file (full path or filename)",
+    "refresh_btn": "🔄 Refresh",
+    "model_path_label": "Main Model Path",
+    "model_path_info": "Select the model configuration directory (auto-scanned from checkpoints)",
+    "device_label": "Device",
+    "device_info": "Processing device (auto-detect recommended)",
+    "lm_model_path_label": "5Hz LM Model Path",
+    "lm_model_path_info": "Select the 5Hz LM model checkpoint (auto-scanned from checkpoints)",
+    "backend_label": "5Hz LM Backend",
+    "backend_info": "Select backend for 5Hz LM: vllm (faster) or pt (PyTorch, more compatible)",
+    "init_llm_label": "Initialize 5Hz LM",
+    "init_llm_info": "Check to initialize 5Hz LM during service initialization",
+    "flash_attention_label": "Use Flash Attention",
+    "flash_attention_info_enabled": "Enable flash attention for faster inference (requires flash_attn package)",
+    "flash_attention_info_disabled": "Flash attention not available (flash_attn package not installed)",
+    "offload_cpu_label": "Offload to CPU",
+    "offload_cpu_info": "Offload models to CPU when not in use to save GPU memory",
+    "offload_dit_cpu_label": "Offload DiT to CPU",
+    "offload_dit_cpu_info": "Offload DiT to CPU (needs Offload to CPU)",
+    "init_btn": "Initialize Service",
+    "status_label": "Status",
+    "language_label": "UI Language",
+    "language_info": "Select interface language"
+  },
+  "generation": {
+    "required_inputs": "📝 Required Inputs",
+    "task_type_label": "Task Type",
+    "task_type_info": "Select the task type for generation",
+    "instruction_label": "Instruction",
+    "instruction_info": "Instruction is automatically generated based on task type",
+    "load_btn": "Load",
+    "track_name_label": "Track Name",
+    "track_name_info": "Select track name for lego/extract tasks",
+    "track_classes_label": "Track Names",
+    "track_classes_info": "Select multiple track classes for complete task",
+    "audio_uploads": "🎵 Audio Uploads",
+    "reference_audio": "Reference Audio (optional)",
+    "source_audio": "Source Audio (optional)",
+    "convert_codes_btn": "Convert to Codes",
+    "lm_codes_hints": "🎼 LM Codes Hints",
+    "lm_codes_label": "LM Codes Hints",
+    "lm_codes_placeholder": "<|audio_code_10695|><|audio_code_54246|>...",
+    "lm_codes_info": "Paste LM codes hints for text2music generation",
+    "lm_codes_sample": "LM Codes Hints (Sample {n})",
+    "lm_codes_sample_info": "Codes for sample {n}",
+    "transcribe_btn": "Transcribe",
+    "repainting_controls": "🎨 Repainting Controls (seconds)",
+    "repainting_start": "Repainting Start",
+    "repainting_end": "Repainting End",
+    "caption_title": "📝 Music Caption",
+    "caption_label": "Music Caption (optional)",
+    "caption_placeholder": "A peaceful acoustic guitar melody with soft vocals...",
+    "caption_info": "Describe the style, genre, instruments, and mood",
+    "sample_btn": "Sample",
+    "lyrics_title": "📝 Lyrics",
+    "lyrics_label": "Lyrics (optional)",
+    "lyrics_placeholder": "[Verse 1]\\nUnder the starry night\\nI feel so alive...",
+    "lyrics_info": "Song lyrics with structure",
+    "instrumental_label": "Instrumental",
+    "optional_params": "⚙️ Optional Parameters",
+    "vocal_language_label": "Vocal Language (optional)",
+    "vocal_language_info": "use `unknown` for instrumental",
+    "bpm_label": "BPM (optional)",
+    "bpm_info": "leave empty for N/A",
+    "keyscale_label": "KeyScale (optional)",
+    "keyscale_placeholder": "Leave empty for N/A",
+    "keyscale_info": "A-G, #/♭, major/minor",
+    "timesig_label": "Time Signature (optional)",
+    "timesig_info": "2/4, 3/4, 4/4...",
+    "duration_label": "Audio Duration (seconds)",
+    "duration_info": "Use -1 for random",
+    "batch_size_label": "Batch Size",
+    "batch_size_info": "Number of audio to generate (max 8)",
+    "advanced_settings": "🔧 Advanced Settings",
+    "inference_steps_label": "DiT Inference Steps",
+    "inference_steps_info": "Turbo: max 8, Base: max 100",
+    "guidance_scale_label": "DiT Guidance Scale (only supported for base model)",
+    "guidance_scale_info": "Higher values follow text more closely",
+    "seed_label": "Seed",
+    "seed_info": "Use comma-separated values for batches",
+    "random_seed_label": "Random Seed",
+    "random_seed_info": "Enable to auto-generate seeds",
+    "audio_format_label": "Audio Format",
+    "audio_format_info": "Audio format for saved files",
+    "use_adg_label": "Use ADG",
+    "use_adg_info": "Enable Angle Domain Guidance",
+    "cfg_interval_start": "CFG Interval Start",
+    "cfg_interval_end": "CFG Interval End",
+    "lm_params_title": "🤖 LM Generation Parameters",
+    "lm_temperature_label": "LM Temperature",
+    "lm_temperature_info": "5Hz LM temperature (higher = more random)",
+    "lm_cfg_scale_label": "LM CFG Scale",
+    "lm_cfg_scale_info": "5Hz LM CFG (1.0 = no CFG)",
+    "lm_top_k_label": "LM Top-K",
+    "lm_top_k_info": "Top-K (0 = disabled)",
+    "lm_top_p_label": "LM Top-P",
+    "lm_top_p_info": "Top-P (1.0 = disabled)",
+    "lm_negative_prompt_label": "LM Negative Prompt",
+    "lm_negative_prompt_placeholder": "Enter negative prompt for CFG (default: NO USER INPUT)",
+    "lm_negative_prompt_info": "Negative prompt (use when LM CFG Scale > 1.0)",
+    "cot_metas_label": "CoT Metas",
+    "cot_metas_info": "Use LM to generate CoT metadata (uncheck to skip LM CoT generation)",
+    "cot_language_label": "CoT Language",
+    "cot_language_info": "Generate language in CoT (chain-of-thought)",
+    "constrained_debug_label": "Constrained Decoding Debug",
+    "constrained_debug_info": "Enable debug logging for constrained decoding (check to see detailed logs)",
+    "auto_score_label": "Auto Score",
+    "auto_score_info": "Automatically calculate quality scores for all generated audios",
+    "lm_batch_chunk_label": "LM Batch Chunk Size",
+    "lm_batch_chunk_info": "Max items per LM batch chunk (default: 8, limited by GPU memory)",
+    "codes_strength_label": "LM Codes Strength",
+    "codes_strength_info": "Control how many denoising steps use LM-generated codes",
+    "cover_strength_label": "Audio Cover Strength",
+    "cover_strength_info": "Control how many denoising steps use cover mode",
+    "score_sensitivity_label": "Quality Score Sensitivity",
+    "score_sensitivity_info": "Lower = more sensitive (default: 1.0). Adjusts how PMI maps to [0,1]",
+    "attention_focus_label": "Output Attention Focus Score (disabled)",
+    "attention_focus_info": "Output attention focus score analysis",
+    "think_label": "Think",
+    "parallel_thinking_label": "ParallelThinking",
+    "generate_btn": "🎵 Generate Music",
+    "autogen_label": "AutoGen",
+    "caption_rewrite_label": "CaptionRewrite"
+  },
+  "results": {
+    "title": "🎵 Results",
+    "generated_music": "🎵 Generated Music (Sample {n})",
+    "send_to_src_btn": "🔗 Send To Src Audio",
+    "save_btn": "💾 Save",
+    "score_btn": "📊 Score",
+    "quality_score_label": "Quality Score (Sample {n})",
+    "quality_score_placeholder": "Click 'Score' to calculate perplexity-based quality score",
+    "generation_status": "Generation Status",
+    "current_batch": "Current Batch",
+    "batch_indicator": "Batch {current} / {total}",
+    "next_batch_status": "Next Batch Status",
+    "prev_btn": "◀ Previous",
+    "next_btn": "Next ▶",
+    "restore_params_btn": "↙️ Apply These Settings to UI (Restore Batch Parameters)",
+    "batch_results_title": "📁 Batch Results & Generation Details",
+    "all_files_label": "📁 All Generated Files (Download)",
+    "generation_details": "Generation Details",
+    "attention_analysis": "⚖️ Attention Focus Score Analysis",
+    "attention_score": "Attention Focus Score (Sample {n})",
+    "lyric_timestamps": "Lyric Timestamps (Sample {n})",
+    "attention_heatmap": "Attention Focus Score Heatmap (Sample {n})"
+  },
+  "messages": {
+    "no_audio_to_save": "❌ No audio to save",
+    "save_success": "✅ Saved audio and metadata to {filename}",
+    "save_failed": "❌ Failed to save: {error}",
+    "no_file_selected": "⚠️ No file selected",
+    "params_loaded": "✅ Parameters loaded from {filename}",
+    "invalid_json": "❌ Invalid JSON file: {error}",
+    "load_error": "❌ Error loading file: {error}",
+    "example_loaded": "📁 Loaded example from {filename}",
+    "example_failed": "Failed to parse JSON file {filename}: {error}",
+    "example_error": "Error loading example: {error}",
+    "lm_generated": "🤖 Generated example using LM",
+    "lm_fallback": "Failed to generate example using LM, falling back to examples directory",
+    "lm_not_initialized": "❌ 5Hz LM not initialized. Please initialize it first.",
+    "autogen_enabled": "🔄 AutoGen enabled - next batch will generate after this",
+    "batch_ready": "✅ Batch {n} ready! Click 'Next' to view.",
+    "batch_generating": "🔄 Starting background generation for Batch {n}...",
+    "batch_failed": "❌ Background generation failed: {error}",
+    "viewing_batch": "✅ Viewing Batch {n}",
+    "at_first_batch": "Already at first batch",
+    "at_last_batch": "No next batch available",
+    "batch_not_found": "Batch {n} not found in queue",
+    "no_batch_data": "No batch data found to restore.",
+    "params_restored": "✅ UI Parameters restored from Batch {n}",
+    "scoring_failed": "❌ Error: Batch data not found",
+    "no_codes": "❌ No audio codes available. Please generate music first.",
+    "score_failed": "❌ Scoring failed: {error}",
+    "score_error": "❌ Error calculating score: {error}"
+  }
+}

acestep/gradio_ui/i18n/ja.json ADDED Viewed

	@@ -0,0 +1,209 @@

+{
+  "app": {
+    "title": "🎛️ ACE-Step V1.5 プレイグラウンド💡",
+    "subtitle": "オープンソース音楽生成の限界を押し広げる"
+  },
+  "dataset": {
+    "title": "📊 データセットエクスプローラー",
+    "dataset_label": "データセット",
+    "dataset_info": "探索するデータセットを選択",
+    "import_btn": "📥 データセットをインポート",
+    "search_type_label": "検索タイプ",
+    "search_type_info": "アイテムの検索方法",
+    "search_value_label": "検索値",
+    "search_value_placeholder": "キーまたはインデックスを入力(空白の場合はランダム)",
+    "search_value_info": "キー: 完全一致、インデックス: 0からデータセットサイズ-1",
+    "instruction_label": "📝 指示",
+    "instruction_placeholder": "利用可能な指示がありません",
+    "metadata_title": "📋 アイテムメタデータ (JSON)",
+    "metadata_label": "完全なアイテム情報",
+    "source_audio": "ソースオーディオ",
+    "target_audio": "ターゲットオーディオ",
+    "reference_audio": "リファレンスオーディオ",
+    "get_item_btn": "🔍 アイテムを取得",
+    "use_src_checkbox": "データセットのソースオーディオを使用",
+    "use_src_info": "データセットのソースオーディオを使用する場合はチェック",
+    "data_status_label": "📊 データステータス",
+    "data_status_default": "❌ データセットがインポートされていません",
+    "autofill_btn": "📋 生成フォームを自動入力"
+  },
+  "service": {
+    "title": "🔧 サービス設定",
+    "checkpoint_label": "チェックポイントファイル",
+    "checkpoint_info": "訓練済みモデルのチェックポイントファイルを選択(フルパスまたはファイル名)",
+    "refresh_btn": "🔄 更新",
+    "model_path_label": "メインモデルパス",
+    "model_path_info": "モデル設定ディレクトリを選択(チェックポイントから自動スキャン)",
+    "device_label": "デバイス",
+    "device_info": "処理デバイス(自動検出を推奨)",
+    "lm_model_path_label": "5Hz LM モデルパス",
+    "lm_model_path_info": "5Hz LMモデルチェックポイントを選択(チェックポイントから自動スキャン)",
+    "backend_label": "5Hz LM バックエンド",
+    "backend_info": "5Hz LMのバックエンドを選択: vllm(高速)またはpt(PyTorch、より互換性あり)",
+    "init_llm_label": "5Hz LM を初期化",
+    "init_llm_info": "サービス初期化中に5Hz LMを初期化する場合はチェック",
+    "flash_attention_label": "Flash Attention を使用",
+    "flash_attention_info_enabled": "推論を高速化するためにflash attentionを有効にする(flash_attnパッケージが必要)",
+    "flash_attention_info_disabled": "Flash attentionは利用できません(flash_attnパッケージがインストールされていません)",
+    "offload_cpu_label": "CPUにオフロード",
+    "offload_cpu_info": "使用していない時にモデルをCPUにオフロードしてGPUメモリを節約",
+    "offload_dit_cpu_label": "DiTをCPUにオフロード",
+    "offload_dit_cpu_info": "DiTをCPUにオフロード(CPUへのオフロードが必要)",
+    "init_btn": "サービスを初期化",
+    "status_label": "ステータス",
+    "language_label": "UI言語",
+    "language_info": "インターフェース言語を選択"
+  },
+  "generation": {
+    "required_inputs": "📝 必須入力",
+    "task_type_label": "タスクタイプ",
+    "task_type_info": "生成のタスクタイプを選択",
+    "instruction_label": "指示",
+    "instruction_info": "指示はタスクタイプに基づいて自動生成されます",
+    "load_btn": "読み込む",
+    "track_name_label": "トラック名",
+    "track_name_info": "lego/extractタスクのトラック名を選択",
+    "track_classes_label": "トラック名",
+    "track_classes_info": "completeタスクの複数のトラッククラスを選択",
+    "audio_uploads": "🎵 オーディオアップロード",
+    "reference_audio": "リファレンスオーディオ(オプション)",
+    "source_audio": "ソースオーディオ(オプション)",
+    "convert_codes_btn": "コードに変換",
+    "lm_codes_hints": "🎼 LM コードヒント",
+    "lm_codes_label": "LM コードヒント",
+    "lm_codes_placeholder": "<|audio_code_10695|><|audio_code_54246|>...",
+    "lm_codes_info": "text2music生成用のLMコードヒントを貼り付け",
+    "lm_codes_sample": "LM コードヒント(サンプル {n})",
+    "lm_codes_sample_info": "サンプル{n}のコード",
+    "transcribe_btn": "転写",
+    "repainting_controls": "🎨 再描画コントロール(秒)",
+    "repainting_start": "再描画開始",
+    "repainting_end": "再描画終了",
+    "caption_title": "📝 音楽キャプション",
+    "caption_label": "音楽キャプション(オプション)",
+    "caption_placeholder": "柔らかいボーカルを伴う穏やかなアコースティックギターのメロディー...",
+    "caption_info": "スタイル、ジャンル、楽器、ムードを説明",
+    "sample_btn": "サンプル",
+    "lyrics_title": "📝 歌詞",
+    "lyrics_label": "歌詞(オプション)",
+    "lyrics_placeholder": "[バース1]\\n星空の下で\\nとても生きていると感じる...",
+    "lyrics_info": "構造を持つ曲の歌詞",
+    "instrumental_label": "インストゥルメンタル",
+    "optional_params": "⚙️ オプションパラメータ",
+    "vocal_language_label": "ボーカル言語(オプション)",
+    "vocal_language_info": "インストには`unknown`を使用",
+    "bpm_label": "BPM(オプション)",
+    "bpm_info": "空白の場合はN/A",
+    "keyscale_label": "キースケール(オプション)",
+    "keyscale_placeholder": "空白の場合はN/A",
+    "keyscale_info": "A-G, #/♭, メジャー/マイナー",
+    "timesig_label": "拍子記号(オプション)",
+    "timesig_info": "2/4, 3/4, 4/4...",
+    "duration_label": "オーディオ長(秒)",
+    "duration_info": "ランダムの場合は-1を使用",
+    "batch_size_label": "バッチサイズ",
+    "batch_size_info": "生成するオーディオの数(最大8)",
+    "advanced_settings": "🔧 詳細設定",
+    "inference_steps_label": "DiT 推論ステップ",
+    "inference_steps_info": "Turbo: 最大8、Base: 最大100",
+    "guidance_scale_label": "DiT ガイダンススケール(baseモデルのみサポート)",
+    "guidance_scale_info": "値が高いほどテキストに忠実に従う",
+    "seed_label": "シード",
+    "seed_info": "バッチにはカンマ区切りの値を使用",
+    "random_seed_label": "ランダムシード",
+    "random_seed_info": "有効にすると自動的にシードを生成",
+    "audio_format_label": "オーディオフォーマット",
+    "audio_format_info": "保存ファイルのオーディオフォーマット",
+    "use_adg_label": "ADG を使用",
+    "use_adg_info": "角度ドメインガイダンスを有効化",
+    "cfg_interval_start": "CFG 間隔開始",
+    "cfg_interval_end": "CFG 間隔終了",
+    "lm_params_title": "🤖 LM 生成パラメータ",
+    "lm_temperature_label": "LM 温度",
+    "lm_temperature_info": "5Hz LM温度(高いほどランダム)",
+    "lm_cfg_scale_label": "LM CFG スケール",
+    "lm_cfg_scale_info": "5Hz LM CFG (1.0 = CFGなし)",
+    "lm_top_k_label": "LM Top-K",
+    "lm_top_k_info": "Top-K (0 = 無効)",
+    "lm_top_p_label": "LM Top-P",
+    "lm_top_p_info": "Top-P (1.0 = 無効)",
+    "lm_negative_prompt_label": "LM ネガティブプロンプト",
+    "lm_negative_prompt_placeholder": "CFGのネガティブプロンプトを入力(デフォルト: NO USER INPUT)",
+    "lm_negative_prompt_info": "ネガティブプロンプト(LM CFGスケール > 1.0の場合に使用)",
+    "cot_metas_label": "CoT メタデータ",
+    "cot_metas_info": "LMを使用してCoTメタデータを生成(チェックを外すとLM CoT生成をスキップ)",
+    "cot_language_label": "CoT 言語",
+    "cot_language_info": "CoTで言語を生成(思考の連鎖)",
+    "constrained_debug_label": "制約付きデコーディングデバッグ",
+    "constrained_debug_info": "制約付きデコーディングのデバッグログを有効化(チェックすると詳細ログを表示)",
+    "auto_score_label": "自動スコアリング",
+    "auto_score_info": "生成されたすべてのオーディオの品質スコアを自動計算",
+    "lm_batch_chunk_label": "LM バッチチャンクサイズ",
+    "lm_batch_chunk_info": "LMバッチチャンクあたりの最大アイテム数(デフォルト: 8、GPUメモリによる制限)",
+    "codes_strength_label": "LM コード強度",
+    "codes_strength_info": "LM生成コードを使用するデノイジングステップ数を制御",
+    "cover_strength_label": "オーディオカバー強度",
+    "cover_strength_info": "カバーモードを使用するデノイジングステップ数を制御",
+    "score_sensitivity_label": "品質スコア感度",
+    "score_sensitivity_info": "低い = より敏感(デフォルト: 1.0)。PMIが[0,1]にマッピングする方法を調整",
+    "attention_focus_label": "注意焦点スコアを出力(無効)",
+    "attention_focus_info": "注意焦点スコア分析を出力",
+    "think_label": "思考",
+    "parallel_thinking_label": "並列思考",
+    "generate_btn": "🎵 音楽を生成",
+    "autogen_label": "自動生成",
+    "caption_rewrite_label": "キャプション書き換え"
+  },
+  "results": {
+    "title": "🎵 結果",
+    "generated_music": "🎵 生成された音楽(サンプル {n})",
+    "send_to_src_btn": "🔗 ソースオーディオに送信",
+    "save_btn": "💾 保存",
+    "score_btn": "📊 スコア",
+    "quality_score_label": "品質スコア(サンプル {n})",
+    "quality_score_placeholder": "'スコア'をクリックしてパープレキシティベースの品質スコアを計算",
+    "generation_status": "生成ステータス",
+    "current_batch": "現在のバッチ",
+    "batch_indicator": "バッチ {current} / {total}",
+    "next_batch_status": "次のバッチステータス",
+    "prev_btn": "◀ 前へ",
+    "next_btn": "次へ ▶",
+    "restore_params_btn": "↙️ これらの設定をUIに適用(バッチパラメータを復元)",
+    "batch_results_title": "📁 バッチ結果と生成詳細",
+    "all_files_label": "📁 すべての生成ファイル(ダウンロード)",
+    "generation_details": "生成詳細",
+    "attention_analysis": "⚖️ 注意焦点スコア分析",
+    "attention_score": "注意焦点スコア(サンプル {n})",
+    "lyric_timestamps": "歌詞タイムスタンプ(サンプル {n})",
+    "attention_heatmap": "注意焦点スコアヒートマップ(サンプル {n})"
+  },
+  "messages": {
+    "no_audio_to_save": "❌ 保存するオーディオがありません",
+    "save_success": "✅ オーディオとメタデータを {filename} に保存しました",
+    "save_failed": "❌ 保存に失敗しました: {error}",
+    "no_file_selected": "⚠️ ファイルが選択されていません",
+    "params_loaded": "✅ {filename} からパラメータを読み込みました",
+    "invalid_json": "❌ 無効なJSONファイル: {error}",
+    "load_error": "❌ ファイルの読み込みエラー: {error}",
+    "example_loaded": "📁 {filename} からサンプルを読み込みました",
+    "example_failed": "JSONファイル {filename} の解析に失敗しました: {error}",
+    "example_error": "サンプル読み込みエラー: {error}",
+    "lm_generated": "🤖 LMを使用してサンプルを生成しました",
+    "lm_fallback": "LMを使用したサンプル生成に失敗、サンプルディレクトリにフォールバック",
+    "lm_not_initialized": "❌ 5Hz LMが初期化されていません。最初に初期化してください。",
+    "autogen_enabled": "🔄 自動生成が有効 - このあと次のバッチを生成します",
+    "batch_ready": "✅ バッチ {n} の準備完了！'次へ'をクリックして表示。",
+    "batch_generating": "🔄 バッチ {n} のバックグラウンド生成を開始...",
+    "batch_failed": "❌ バックグラウンド生成に失敗しました: {error}",
+    "viewing_batch": "✅ バッチ {n} を表示中",
+    "at_first_batch": "すでに最初のバッチです",
+    "at_last_batch": "次のバッチはありません",
+    "batch_not_found": "キューにバッチ {n} が見つかりません",
+    "no_batch_data": "復元するバッチデータがありません。",
+    "params_restored": "✅ バッチ {n} からUIパラメータを復元しました",
+    "scoring_failed": "❌ エラー: バッチデータが見つかりません",
+    "no_codes": "❌ 利用可能なオーディオコードがありません。最初に音楽を生成してください。",
+    "score_failed": "❌ スコアリングに失敗しました: {error}",
+    "score_error": "❌ スコア計算エラー: {error}"
+  }
+}

acestep/gradio_ui/i18n/zh.json ADDED Viewed

	@@ -0,0 +1,209 @@

+{
+  "app": {
+    "title": "🎛️ ACE-Step V1.5 演练场💡",
+    "subtitle": "推动开源音乐生成的边界"
+  },
+  "dataset": {
+    "title": "📊 数据集浏览器",
+    "dataset_label": "数据集",
+    "dataset_info": "选择要浏览的数据集",
+    "import_btn": "📥 导入数据集",
+    "search_type_label": "搜索类型",
+    "search_type_info": "如何查找项目",
+    "search_value_label": "搜索值",
+    "search_value_placeholder": "输入键或索引(留空表示随机)",
+    "search_value_info": "键: 精确匹配, 索引: 0到数据集大小-1",
+    "instruction_label": "📝 指令",
+    "instruction_placeholder": "无可用指令",
+    "metadata_title": "📋 项目元数据 (JSON)",
+    "metadata_label": "完整项目信息",
+    "source_audio": "源音频",
+    "target_audio": "目标音频",
+    "reference_audio": "参考音频",
+    "get_item_btn": "🔍 获取项目",
+    "use_src_checkbox": "使用数据集中的源音频",
+    "use_src_info": "勾选以使用数据集中的源音频",
+    "data_status_label": "📊 数据状态",
+    "data_status_default": "❌ 未导入数据集",
+    "autofill_btn": "📋 自动填充生成表单"
+  },
+  "service": {
+    "title": "🔧 服务配置",
+    "checkpoint_label": "检查点文件",
+    "checkpoint_info": "选择训练好的模型检查点文件(完整路径或文件名)",
+    "refresh_btn": "🔄 刷新",
+    "model_path_label": "主模型路径",
+    "model_path_info": "选择模型配置目录(从检查点自动扫描)",
+    "device_label": "设备",
+    "device_info": "处理设备(建议自动检测)",
+    "lm_model_path_label": "5Hz LM 模型路径",
+    "lm_model_path_info": "选择5Hz LM模型检查点(从检查点自动扫描)",
+    "backend_label": "5Hz LM 后端",
+    "backend_info": "选择5Hz LM的后端: vllm(更快)或pt(PyTorch, 更兼容)",
+    "init_llm_label": "初始化 5Hz LM",
+    "init_llm_info": "勾选以在服务初始化期间初始化5Hz LM",
+    "flash_attention_label": "使用Flash Attention",
+    "flash_attention_info_enabled": "启用flash attention以加快推理速度(需要flash_attn包)",
+    "flash_attention_info_disabled": "Flash attention不可用(未安装flash_attn包)",
+    "offload_cpu_label": "卸载到CPU",
+    "offload_cpu_info": "不使用时将模型卸载到CPU以节省GPU内存",
+    "offload_dit_cpu_label": "将DiT卸载到CPU",
+    "offload_dit_cpu_info": "将DiT卸载到CPU(需要启用卸载到CPU)",
+    "init_btn": "初始化服务",
+    "status_label": "状态",
+    "language_label": "界面语言",
+    "language_info": "选择界面语言"
+  },
+  "generation": {
+    "required_inputs": "📝 必需输入",
+    "task_type_label": "任务类型",
+    "task_type_info": "选择生成的任务类型",
+    "instruction_label": "指令",
+    "instruction_info": "指令根据任务类型自动生成",
+    "load_btn": "加载",
+    "track_name_label": "音轨名称",
+    "track_name_info": "为lego/extract任务选择音轨名称",
+    "track_classes_label": "音轨名称",
+    "track_classes_info": "为complete任务选择多个音轨类别",
+    "audio_uploads": "🎵 音频上传",
+    "reference_audio": "参考音频(可选)",
+    "source_audio": "源音频(可选)",
+    "convert_codes_btn": "转换为代码",
+    "lm_codes_hints": "🎼 LM 代码提示",
+    "lm_codes_label": "LM 代码提示",
+    "lm_codes_placeholder": "<|audio_code_10695|><|audio_code_54246|>...",
+    "lm_codes_info": "粘贴用于text2music生成的LM代码提示",
+    "lm_codes_sample": "LM 代码提示(样本 {n})",
+    "lm_codes_sample_info": "样本{n}的代码",
+    "transcribe_btn": "转录",
+    "repainting_controls": "🎨 重绘控制(秒)",
+    "repainting_start": "重绘开始",
+    "repainting_end": "重绘结束",
+    "caption_title": "📝 音乐描述",
+    "caption_label": "音乐描述(可选)",
+    "caption_placeholder": "一段平和的原声吉他旋律,配有柔和的人声...",
+    "caption_info": "描述风格、流派、乐器和情绪",
+    "sample_btn": "示例",
+    "lyrics_title": "📝 歌词",
+    "lyrics_label": "歌词(可选)",
+    "lyrics_placeholder": "[第一段]\\n在星空下\\n我感到如此活跃...",
+    "lyrics_info": "带有结构的歌曲歌词",
+    "instrumental_label": "纯音乐",
+    "optional_params": "⚙️ 可选参数",
+    "vocal_language_label": "人声语言(可选)",
+    "vocal_language_info": "纯音乐使用 `unknown`",
+    "bpm_label": "BPM(可选)",
+    "bpm_info": "留空表示N/A",
+    "keyscale_label": "调性(可选)",
+    "keyscale_placeholder": "留空表示N/A",
+    "keyscale_info": "A-G, #/♭, 大调/小调",
+    "timesig_label": "拍号(可选)",
+    "timesig_info": "2/4, 3/4, 4/4...",
+    "duration_label": "音频时长(秒)",
+    "duration_info": "使用-1表示随机",
+    "batch_size_label": "批量大小",
+    "batch_size_info": "要生成的音频数量(最多8个)",
+    "advanced_settings": "🔧 高级设置",
+    "inference_steps_label": "DiT 推理步数",
+    "inference_steps_info": "Turbo: 最多8, Base: 最多100",
+    "guidance_scale_label": "DiT 引导比例(仅支持base模型)",
+    "guidance_scale_info": "更高的值更紧密地遵循文本",
+    "seed_label": "种子",
+    "seed_info": "批量使用逗号分隔的值",
+    "random_seed_label": "随机种子",
+    "random_seed_info": "启用以自动生成种子",
+    "audio_format_label": "音频格式",
+    "audio_format_info": "保存文件的音频格式",
+    "use_adg_label": "使用 ADG",
+    "use_adg_info": "启用角域引导",
+    "cfg_interval_start": "CFG 间隔开始",
+    "cfg_interval_end": "CFG 间隔结束",
+    "lm_params_title": "🤖 LM 生成参数",
+    "lm_temperature_label": "LM 温度",
+    "lm_temperature_info": "5Hz LM温度(越高越随机)",
+    "lm_cfg_scale_label": "LM CFG 比例",
+    "lm_cfg_scale_info": "5Hz LM CFG (1.0 = 无CFG)",
+    "lm_top_k_label": "LM Top-K",
+    "lm_top_k_info": "Top-K (0 = 禁用)",
+    "lm_top_p_label": "LM Top-P",
+    "lm_top_p_info": "Top-P (1.0 = 禁用)",
+    "lm_negative_prompt_label": "LM 负面提示",
+    "lm_negative_prompt_placeholder": "输入CFG的负面提示(默认: NO USER INPUT)",
+    "lm_negative_prompt_info": "负面提示(当LM CFG比例 > 1.0时使用)",
+    "cot_metas_label": "CoT 元数据",
+    "cot_metas_info": "使用LM生成CoT元数据(取消勾选以跳过LM CoT生成)",
+    "cot_language_label": "CoT 语言",
+    "cot_language_info": "在CoT中生成语言(思维链)",
+    "constrained_debug_label": "约束解码调试",
+    "constrained_debug_info": "启用约束解码的调试日志(勾选以查看详细日志)",
+    "auto_score_label": "自动评分",
+    "auto_score_info": "自动计算所有生成音频的质量分数",
+    "lm_batch_chunk_label": "LM 批量块大小",
+    "lm_batch_chunk_info": "每个LM批量块的最大项目数(默认: 8, 受GPU内存限制)",
+    "codes_strength_label": "LM 代码强度",
+    "codes_strength_info": "控制使用LM生成代码的去噪步骤数量",
+    "cover_strength_label": "音频覆盖强度",
+    "cover_strength_info": "控制使用覆盖模式的去噪步骤数量",
+    "score_sensitivity_label": "质量评分敏感度",
+    "score_sensitivity_info": "更低 = 更敏感(默认: 1.0). 调整PMI如何映射到[0,1]",
+    "attention_focus_label": "输出注意力焦点分数(已禁用)",
+    "attention_focus_info": "输出注意力焦点分数分析",
+    "think_label": "思考",
+    "parallel_thinking_label": "并行思考",
+    "generate_btn": "🎵 生成音乐",
+    "autogen_label": "自动生成",
+    "caption_rewrite_label": "描述重写"
+  },
+  "results": {
+    "title": "🎵 结果",
+    "generated_music": "🎵 生成的音乐(样本 {n})",
+    "send_to_src_btn": "🔗 发送到源音频",
+    "save_btn": "💾 保存",
+    "score_btn": "📊 评分",
+    "quality_score_label": "质量分数(样本 {n})",
+    "quality_score_placeholder": "点击'评分'以计算基于困惑度的质量分数",
+    "generation_status": "生成状态",
+    "current_batch": "当前批次",
+    "batch_indicator": "批次 {current} / {total}",
+    "next_batch_status": "下一批次状态",
+    "prev_btn": "◀ 上一个",
+    "next_btn": "下一个 ▶",
+    "restore_params_btn": "↙️ 将这些设置应用到UI(恢复批次参数)",
+    "batch_results_title": "📁 批量结果和生成详情",
+    "all_files_label": "📁 所有生成的文件(下载)",
+    "generation_details": "生成详情",
+    "attention_analysis": "⚖️ 注意力焦点分数分析",
+    "attention_score": "注意力焦点分数(样本 {n})",
+    "lyric_timestamps": "歌词时间戳(样本 {n})",
+    "attention_heatmap": "注意力焦点分数热图(样本 {n})"
+  },
+  "messages": {
+    "no_audio_to_save": "❌ 没有要保存的音频",
+    "save_success": "✅ 已将音频和元数据保存到 {filename}",
+    "save_failed": "❌ 保存失败: {error}",
+    "no_file_selected": "⚠️ 未选择文件",
+    "params_loaded": "✅ 已从 {filename} 加载参数",
+    "invalid_json": "❌ 无效的JSON文件: {error}",
+    "load_error": "❌ 加载文件时出错: {error}",
+    "example_loaded": "📁 已从 {filename} 加载示例",
+    "example_failed": "解析JSON文件 {filename} 失败: {error}",
+    "example_error": "加载示例时出错: {error}",
+    "lm_generated": "🤖 使用LM生成的示例",
+    "lm_fallback": "使用LM生成示例失败,回退到示例目录",
+    "lm_not_initialized": "❌ 5Hz LM未初始化。请先初始化它。",
+    "autogen_enabled": "🔄 已启用自动生成 - 下一批次将在此之后生成",
+    "batch_ready": "✅ 批次 {n} 就绪!点击'下一个'查看。",
+    "batch_generating": "🔄 开始为批次 {n} 进行后台生成...",
+    "batch_failed": "❌ 后台生成失败: {error}",
+    "viewing_batch": "✅ 查看批次 {n}",
+    "at_first_batch": "已在第一批次",
+    "at_last_batch": "没有下一批次可用",
+    "batch_not_found": "在队列中未找到批次 {n}",
+    "no_batch_data": "没有要恢复的批次数据。",
+    "params_restored": "✅ 已从批次 {n} 恢复UI参数",
+    "scoring_failed": "❌ 错误: 未找到批次数据",
+    "no_codes": "❌ 没有可用的音频代码。请先生成音乐。",
+    "score_failed": "❌ 评分失败: {error}",
+    "score_error": "❌ 计算分数时出错: {error}"
+  }
+}

acestep/gradio_ui/interfaces/__init__.py ADDED Viewed

	@@ -0,0 +1,82 @@

+"""
+Gradio UI Components Module
+Contains all Gradio interface component definitions and layouts
+"""
+import gradio as gr
+from acestep.gradio_ui.i18n import get_i18n, t
+from acestep.gradio_ui.interfaces.dataset import create_dataset_section
+from acestep.gradio_ui.interfaces.generation import create_generation_section
+from acestep.gradio_ui.interfaces.result import create_results_section
+from acestep.gradio_ui.events import setup_event_handlers
+def create_gradio_interface(dit_handler, llm_handler, dataset_handler, init_params=None, language='en') -> gr.Blocks:
+    """
+    Create Gradio interface
+    Sets the shared i18n language, builds the Blocks container with the page
+    header, then creates the dataset, generation and results sections in that
+    order and finally wires their event handlers.
+    Args:
+        dit_handler: DiT handler instance
+        llm_handler: LM handler instance
+        dataset_handler: Dataset handler instance
+        init_params: Dictionary containing initialization parameters and state.
+                    If None, service will not be pre-initialized.
+        language: UI language code ('en', 'zh', 'ja', default: 'en')
+    Returns:
+        Gradio Blocks instance
+    """
+    # Initialize i18n with selected language
+    # The call matters for its effect on the shared instance that t() reads;
+    # the returned object bound to `i18n` is not used again in this function.
+    i18n = get_i18n(language)
+    # t(...) is evaluated here, while the interface is being built, so the
+    # title and header text use the language selected above.
+    with gr.Blocks(
+        title=t("app.title"),
+        theme=gr.themes.Soft(),
+        css="""
+        .main-header {
+            text-align: center;
+            margin-bottom: 2rem;
+        }
+        .section-header {
+            background: linear-gradient(90deg, #4CAF50, #45a049);
+            color: white;
+            padding: 10px;
+            border-radius: 5px;
+            margin: 10px 0;
+        }
+        .lm-hints-row {
+            align-items: stretch;
+        }
+        .lm-hints-col {
+            display: flex;
+        }
+        .lm-hints-col > div {
+            flex: 1;
+            display: flex;
+        }
+        .lm-hints-btn button {
+            height: 100%;
+            width: 100%;
+        }
+        """
+    ) as demo:
+        # Page header: translated title and subtitle inside the .main-header div.
+        gr.HTML(f"""
+        <div class="main-header">
+            <h1>{t("app.title")}</h1>
+            <p>{t("app.subtitle")}</p>
+        </div>
+        """)
+        # Dataset Explorer Section
+        dataset_section = create_dataset_section(dataset_handler)
+        # Generation Section (pass init_params and language to support pre-initialization)
+        generation_section = create_generation_section(dit_handler, llm_handler, init_params=init_params, language=language)
+        # Results Section
+        results_section = create_results_section(dit_handler)
+        # Connect event handlers
+        # Called inside the Blocks context, after all three sections exist, and
+        # given the handlers plus each section's return value.
+        setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, dataset_section, generation_section, results_section)
+    return demo

acestep/gradio_ui/interfaces/dataset.py ADDED Viewed

	@@ -0,0 +1,101 @@

+"""
+Gradio UI Dataset Section Module
+Contains dataset explorer section component definitions
+"""
+import gradio as gr
+def create_dataset_section(dataset_handler) -> dict:
+    """
+    Create dataset explorer section
+    Builds the dataset explorer widgets inside a collapsed accordion that is
+    created with visible=False. All labels and placeholders here are
+    hard-coded English literals rather than i18n lookups.
+    Args:
+        dataset_handler: Dataset handler instance (not referenced in this
+            function body)
+    Returns:
+        Dictionary mapping component names to the created Gradio components
+    """
+    # Outer accordion: collapsed (open=False) and hidden (visible=False).
+    with gr.Accordion("📊 Dataset Explorer", open=False, visible=False):
+        # Row 1: dataset choice, import button and search controls.
+        with gr.Row(equal_height=True):
+            dataset_type = gr.Dropdown(
+                choices=["train", "test"],
+                value="train",
+                label="Dataset",
+                info="Choose dataset to explore",
+                scale=2
+            )
+            import_dataset_btn = gr.Button("📥 Import Dataset", variant="primary", scale=1)
+            search_type = gr.Dropdown(
+                choices=["keys", "idx", "random"],
+                value="random",
+                label="Search Type",
+                info="How to find items",
+                scale=1
+            )
+            search_value = gr.Textbox(
+                label="Search Value",
+                placeholder="Enter keys or index (leave empty for random)",
+                info="Keys: exact match, Index: 0 to dataset size-1",
+                scale=2
+            )
+        # Read-only display of the selected item's instruction text.
+        instruction_display = gr.Textbox(
+            label="📝 Instruction",
+            interactive=False,
+            placeholder="No instruction available",
+            lines=1
+        )
+        # Plot placeholder; what is drawn into it is decided outside this function.
+        repaint_viz_plot = gr.Plot()
+        with gr.Accordion("📋 Item Metadata (JSON)", open=False):
+            item_info_json = gr.Code(
+                label="Complete Item Information",
+                language="json",
+                interactive=False,
+                lines=15
+            )
+        # Row 2: source audio preview next to the "Get Item" button.
+        with gr.Row(equal_height=True):
+            item_src_audio = gr.Audio(
+                label="Source Audio",
+                type="filepath",
+                interactive=False,
+                scale=8
+            )
+            # Created disabled; presumably enabled elsewhere once a dataset is
+            # imported — TODO confirm against the event handlers.
+            get_item_btn = gr.Button("🔍 Get Item", variant="secondary", interactive=False, scale=2)
+        # Row 3: target and reference audio previews.
+        with gr.Row(equal_height=True):
+            item_target_audio = gr.Audio(
+                label="Target Audio",
+                type="filepath",
+                interactive=False,
+                scale=8
+            )
+            item_refer_audio = gr.Audio(
+                label="Reference Audio",
+                type="filepath",
+                interactive=False,
+                scale=2
+            )
+        with gr.Row():
+            use_src_checkbox = gr.Checkbox(
+                label="Use Source Audio from Dataset",
+                value=True,
+                info="Check to use the source audio from dataset"
+            )
+        data_status = gr.Textbox(label="📊 Data Status", interactive=False, value="❌ No dataset imported")
+        auto_fill_btn = gr.Button("📋 Auto-fill Generation Form", variant="primary")
+    # Expose every component by name so callers can reference them.
+    return {
+        "dataset_type": dataset_type,
+        "import_dataset_btn": import_dataset_btn,
+        "search_type": search_type,
+        "search_value": search_value,
+        "instruction_display": instruction_display,
+        "repaint_viz_plot": repaint_viz_plot,
+        "item_info_json": item_info_json,
+        "item_src_audio": item_src_audio,
+        "get_item_btn": get_item_btn,
+        "item_target_audio": item_target_audio,
+        "item_refer_audio": item_refer_audio,
+        "use_src_checkbox": use_src_checkbox,
+        "data_status": data_status,
+        "auto_fill_btn": auto_fill_btn,
+    }

acestep/gradio_ui/interfaces/generation.py ADDED Viewed

	@@ -0,0 +1,683 @@

+"""
+Gradio UI Generation Section Module
+Contains generation section component definitions
+"""
+import gradio as gr
+from acestep.constants import (
+    VALID_LANGUAGES,
+    TRACK_NAMES,
+    TASK_TYPES_TURBO,
+    TASK_TYPES_BASE,
+    DEFAULT_DIT_INSTRUCTION,
+)
+from acestep.gradio_ui.i18n import t
+def create_generation_section(dit_handler, llm_handler, init_params=None, language='en') -> dict:
+    """Create generation section
+    Args:
+        dit_handler: DiT handler instance
+        llm_handler: LM handler instance
+        init_params: Dictionary containing initialization parameters and state.
+                    If None, service will not be pre-initialized.
+        language: UI language code ('en', 'zh', 'ja')
+    """
+    # Check if service is pre-initialized
+    service_pre_initialized = init_params is not None and init_params.get('pre_initialized', False)
+    # Get current language from init_params if available
+    current_language = init_params.get('language', language) if init_params else language
+    with gr.Group():
+        # Service Configuration - collapse if pre-initialized, hide if in service mode
+        accordion_open = not service_pre_initialized
+        accordion_visible = not service_pre_initialized  # Hide when running in service mode
+        with gr.Accordion(t("service.title"), open=accordion_open, visible=accordion_visible) as service_config_accordion:
+            # Language selector at the top
+            with gr.Row():
+                language_dropdown = gr.Dropdown(
+                    choices=[
+                        ("English", "en"),
+                        ("中文", "zh"),
+                        ("日本語", "ja"),
+                    ],
+                    value=current_language,
+                    label=t("service.language_label"),
+                    info=t("service.language_info"),
+                    scale=1,
+                )
+            # Dropdown options section - all dropdowns grouped together
+            with gr.Row(equal_height=True):
+                with gr.Column(scale=4):
+                    # Set checkpoint value from init_params if pre-initialized
+                    checkpoint_value = init_params.get('checkpoint') if service_pre_initialized else None
+                    checkpoint_dropdown = gr.Dropdown(
+                        label=t("service.checkpoint_label"),
+                        choices=dit_handler.get_available_checkpoints(),
+                        value=checkpoint_value,
+                        info=t("service.checkpoint_info")
+                    )
+                with gr.Column(scale=1, min_width=90):
+                    refresh_btn = gr.Button(t("service.refresh_btn"), size="sm")
+            with gr.Row():
+                # Get available acestep-v15- model list
+                available_models = dit_handler.get_available_acestep_v15_models()
+                default_model = "acestep-v15-turbo" if "acestep-v15-turbo" in available_models else (available_models[0] if available_models else None)
+                # Set config_path value from init_params if pre-initialized
+                config_path_value = init_params.get('config_path', default_model) if service_pre_initialized else default_model
+                config_path = gr.Dropdown(
+                    label=t("service.model_path_label"),
+                    choices=available_models,
+                    value=config_path_value,
+                    info=t("service.model_path_info")
+                )
+                # Set device value from init_params if pre-initialized
+                device_value = init_params.get('device', 'auto') if service_pre_initialized else 'auto'
+                device = gr.Dropdown(
+                    choices=["auto", "cuda", "cpu"],
+                    value=device_value,
+                    label=t("service.device_label"),
+                    info=t("service.device_info")
+                )
+            with gr.Row():
+                # Get available 5Hz LM model list
+                available_lm_models = llm_handler.get_available_5hz_lm_models()
+                default_lm_model = "acestep-5Hz-lm-0.6B" if "acestep-5Hz-lm-0.6B" in available_lm_models else (available_lm_models[0] if available_lm_models else None)
+                # Set lm_model_path value from init_params if pre-initialized
+                lm_model_path_value = init_params.get('lm_model_path', default_lm_model) if service_pre_initialized else default_lm_model
+                lm_model_path = gr.Dropdown(
+                    label=t("service.lm_model_path_label"),
+                    choices=available_lm_models,
+                    value=lm_model_path_value,
+                    info=t("service.lm_model_path_info")
+                )
+                # Set backend value from init_params if pre-initialized
+                backend_value = init_params.get('backend', 'vllm') if service_pre_initialized else 'vllm'
+                backend_dropdown = gr.Dropdown(
+                    choices=["vllm", "pt"],
+                    value=backend_value,
+                    label=t("service.backend_label"),
+                    info=t("service.backend_info")
+                )
+            # Checkbox options section - all checkboxes grouped together
+            with gr.Row():
+                # Set init_llm value from init_params if pre-initialized
+                init_llm_value = init_params.get('init_llm', True) if service_pre_initialized else True
+                init_llm_checkbox = gr.Checkbox(
+                    label=t("service.init_llm_label"),
+                    value=init_llm_value,
+                    info=t("service.init_llm_info"),
+                )
+                # Auto-detect flash attention availability
+                flash_attn_available = dit_handler.is_flash_attention_available()
+                # Set use_flash_attention value from init_params if pre-initialized
+                use_flash_attention_value = init_params.get('use_flash_attention', flash_attn_available) if service_pre_initialized else flash_attn_available
+                use_flash_attention_checkbox = gr.Checkbox(
+                    label=t("service.flash_attention_label"),
+                    value=use_flash_attention_value,
+                    interactive=flash_attn_available,
+                    info=t("service.flash_attention_info_enabled") if flash_attn_available else t("service.flash_attention_info_disabled")
+                )
+                # Set offload_to_cpu value from init_params if pre-initialized
+                offload_to_cpu_value = init_params.get('offload_to_cpu', False) if service_pre_initialized else False
+                offload_to_cpu_checkbox = gr.Checkbox(
+                    label=t("service.offload_cpu_label"),
+                    value=offload_to_cpu_value,
+                    info=t("service.offload_cpu_info")
+                )
+                # Set offload_dit_to_cpu value from init_params if pre-initialized
+                offload_dit_to_cpu_value = init_params.get('offload_dit_to_cpu', False) if service_pre_initialized else False
+                offload_dit_to_cpu_checkbox = gr.Checkbox(
+                    label=t("service.offload_dit_cpu_label"),
+                    value=offload_dit_to_cpu_value,
+                    info=t("service.offload_dit_cpu_info")
+                )
+            init_btn = gr.Button(t("service.init_btn"), variant="primary", size="lg")
+            # Set init_status value from init_params if pre-initialized
+            init_status_value = init_params.get('init_status', '') if service_pre_initialized else ''
+            init_status = gr.Textbox(label=t("service.status_label"), interactive=False, lines=3, value=init_status_value)
+        # Inputs
+        with gr.Row():
+            with gr.Column(scale=2):
+                with gr.Accordion(t("generation.required_inputs"), open=True):
+                    # Task type
+                    # Determine initial task_type choices based on default model
+                    default_model_lower = (default_model or "").lower()
+                    if "turbo" in default_model_lower:
+                        initial_task_choices = TASK_TYPES_TURBO
+                    else:
+                        initial_task_choices = TASK_TYPES_BASE
+                    with gr.Row(equal_height=True):
+                        with gr.Column(scale=2):
+                            task_type = gr.Dropdown(
+                                choices=initial_task_choices,
+                                value="text2music",
+                                label=t("generation.task_type_label"),
+                                info=t("generation.task_type_info"),
+                            )
+                        with gr.Column(scale=7):
+                            instruction_display_gen = gr.Textbox(
+                                label=t("generation.instruction_label"),
+                                value=DEFAULT_DIT_INSTRUCTION,
+                                interactive=False,
+                                lines=1,
+                                info=t("generation.instruction_info"),
+                            )
+                        with gr.Column(scale=1, min_width=100):
+                            load_file = gr.UploadButton(
+                                t("generation.load_btn"),
+                                file_types=[".json"],
+                                file_count="single",
+                                variant="secondary",
+                                size="sm",
+                            )
+                    track_name = gr.Dropdown(
+                        choices=TRACK_NAMES,
+                        value=None,
+                        label=t("generation.track_name_label"),
+                        info=t("generation.track_name_info"),
+                        visible=False
+                    )
+                    complete_track_classes = gr.CheckboxGroup(
+                        choices=TRACK_NAMES,
+                        label=t("generation.track_classes_label"),
+                        info=t("generation.track_classes_info"),
+                        visible=False
+                    )
+                    # Audio uploads
+                    audio_uploads_accordion = gr.Accordion(t("generation.audio_uploads"), open=False)
+                    with audio_uploads_accordion:
+                        with gr.Row(equal_height=True):
+                            with gr.Column(scale=2):
+                                reference_audio = gr.Audio(
+                                    label=t("generation.reference_audio"),
+                                    type="filepath",
+                                )
+                            with gr.Column(scale=7):
+                                src_audio = gr.Audio(
+                                    label=t("generation.source_audio"),
+                                    type="filepath",
+                                )
+                            with gr.Column(scale=1, min_width=80):
+                                convert_src_to_codes_btn = gr.Button(
+                                    t("generation.convert_codes_btn"),
+                                    variant="secondary",
+                                    size="sm"
+                                )
+                    # Audio Codes for text2music (dynamic display based on batch size and allow_lm_batch)
+                    with gr.Accordion(t("generation.lm_codes_hints"), open=False, visible=True) as text2music_audio_codes_group:
+                        # Single codes input (default mode)
+                        with gr.Row(equal_height=True, visible=True) as codes_single_row:
+                            text2music_audio_code_string = gr.Textbox(
+                                label=t("generation.lm_codes_label"),
+                                placeholder=t("generation.lm_codes_placeholder"),
+                                lines=6,
+                                info=t("generation.lm_codes_info"),
+                                scale=9,
+                            )
+                            transcribe_btn = gr.Button(
+                                t("generation.transcribe_btn"),
+                                variant="secondary",
+                                size="sm",
+                                scale=1,
+                            )
+                        # Multiple codes inputs (batch mode when allow_lm_batch is enabled)
+                        with gr.Row(visible=False) as codes_batch_row:
+                            with gr.Column(visible=True) as codes_col_1:
+                                text2music_audio_code_string_1 = gr.Textbox(
+                                    label=t("generation.lm_codes_sample", n=1),
+                                    placeholder="<|audio_code_...|>",
+                                    lines=4,
+                                    info=t("generation.lm_codes_sample_info", n=1),
+                                )
+                            with gr.Column(visible=True) as codes_col_2:
+                                text2music_audio_code_string_2 = gr.Textbox(
+                                    label=t("generation.lm_codes_sample", n=2),
+                                    placeholder="<|audio_code_...|>",
+                                    lines=4,
+                                    info=t("generation.lm_codes_sample_info", n=2),
+                                )
+                            with gr.Column(visible=False) as codes_col_3:
+                                text2music_audio_code_string_3 = gr.Textbox(
+                                    label=t("generation.lm_codes_sample", n=3),
+                                    placeholder="<|audio_code_...|>",
+                                    lines=4,
+                                    info=t("generation.lm_codes_sample_info", n=3),
+                                )
+                            with gr.Column(visible=False) as codes_col_4:
+                                text2music_audio_code_string_4 = gr.Textbox(
+                                    label=t("generation.lm_codes_sample", n=4),
+                                    placeholder="<|audio_code_...|>",
+                                    lines=4,
+                                    info=t("generation.lm_codes_sample_info", n=4),
+                                )
+                        # Additional row for codes 5-8
+                        with gr.Row(visible=False) as codes_batch_row_2:
+                            with gr.Column() as codes_col_5:
+                                text2music_audio_code_string_5 = gr.Textbox(
+                                    label=t("generation.lm_codes_sample", n=5),
+                                    placeholder="<|audio_code_...|>",
+                                    lines=4,
+                                    info=t("generation.lm_codes_sample_info", n=5),
+                                )
+                            with gr.Column() as codes_col_6:
+                                text2music_audio_code_string_6 = gr.Textbox(
+                                    label=t("generation.lm_codes_sample", n=6),
+                                    placeholder="<|audio_code_...|>",
+                                    lines=4,
+                                    info=t("generation.lm_codes_sample_info", n=6),
+                                )
+                            with gr.Column() as codes_col_7:
+                                text2music_audio_code_string_7 = gr.Textbox(
+                                    label=t("generation.lm_codes_sample", n=7),
+                                    placeholder="<|audio_code_...|>",
+                                    lines=4,
+                                    info=t("generation.lm_codes_sample_info", n=7),
+                                )
+                            with gr.Column() as codes_col_8:
+                                text2music_audio_code_string_8 = gr.Textbox(
+                                    label=t("generation.lm_codes_sample", n=8),
+                                    placeholder="<|audio_code_...|>",
+                                    lines=4,
+                                    info=t("generation.lm_codes_sample_info", n=8),
+                                )
+                    # Repainting controls
+                    with gr.Group(visible=False) as repainting_group:
+                        gr.HTML(f"<h5>{t('generation.repainting_controls')}</h5>")
+                        with gr.Row():
+                            repainting_start = gr.Number(
+                                label=t("generation.repainting_start"),
+                                value=0.0,
+                                step=0.1,
+                            )
+                            repainting_end = gr.Number(
+                                label=t("generation.repainting_end"),
+                                value=-1,
+                                minimum=-1,
+                                step=0.1,
+                            )
+                # Music Caption
+                with gr.Accordion(t("generation.caption_title"), open=True):
+                    with gr.Row(equal_height=True):
+                        captions = gr.Textbox(
+                            label=t("generation.caption_label"),
+                            placeholder=t("generation.caption_placeholder"),
+                            lines=3,
+                            info=t("generation.caption_info"),
+                            scale=9,
+                        )
+                        sample_btn = gr.Button(
+                            t("generation.sample_btn"),
+                            variant="secondary",
+                            size="sm",
+                            scale=1,
+                        )
+                # Lyrics
+                with gr.Accordion(t("generation.lyrics_title"), open=True):
+                    lyrics = gr.Textbox(
+                        label=t("generation.lyrics_label"),
+                        placeholder=t("generation.lyrics_placeholder"),
+                        lines=8,
+                        info=t("generation.lyrics_info")
+                    )
+                    instrumental_checkbox = gr.Checkbox(
+                        label=t("generation.instrumental_label"),
+                        value=False,
+                        scale=1,
+                    )
+                # Optional Parameters
+                with gr.Accordion(t("generation.optional_params"), open=True):
+                    with gr.Row():
+                        vocal_language = gr.Dropdown(
+                            choices=VALID_LANGUAGES,
+                            value="unknown",
+                            label=t("generation.vocal_language_label"),
+                            allow_custom_value=True,
+                            info=t("generation.vocal_language_info")
+                        )
+                        bpm = gr.Number(
+                            label=t("generation.bpm_label"),
+                            value=None,
+                            step=1,
+                            info=t("generation.bpm_info")
+                        )
+                        key_scale = gr.Textbox(
+                            label=t("generation.keyscale_label"),
+                            placeholder=t("generation.keyscale_placeholder"),
+                            value="",
+                            info=t("generation.keyscale_info")
+                        )
+                        time_signature = gr.Dropdown(
+                            choices=["2", "3", "4", "N/A", ""],
+                            value="",
+                            label=t("generation.timesig_label"),
+                            allow_custom_value=True,
+                            info=t("generation.timesig_info")
+                        )
+                        audio_duration = gr.Number(
+                            label=t("generation.duration_label"),
+                            value=-1,
+                            minimum=-1,
+                            maximum=600.0,
+                            step=0.1,
+                            info=t("generation.duration_info")
+                        )
+                        batch_size_input = gr.Number(
+                            label=t("generation.batch_size_label"),
+                            value=2,
+                            minimum=1,
+                            maximum=8,
+                            step=1,
+                            info=t("generation.batch_size_info")
+                        )
+        # Advanced Settings
+        with gr.Accordion(t("generation.advanced_settings"), open=False):
+            with gr.Row():
+                inference_steps = gr.Slider(
+                    minimum=1,
+                    maximum=8,
+                    value=8,
+                    step=1,
+                    label=t("generation.inference_steps_label"),
+                    info=t("generation.inference_steps_info")
+                )
+                guidance_scale = gr.Slider(
+                    minimum=1.0,
+                    maximum=15.0,
+                    value=7.0,
+                    step=0.1,
+                    label=t("generation.guidance_scale_label"),
+                    info=t("generation.guidance_scale_info"),
+                    visible=False
+                )
+                with gr.Column():
+                    seed = gr.Textbox(
+                        label=t("generation.seed_label"),
+                        value="-1",
+                        info=t("generation.seed_info")
+                    )
+                    random_seed_checkbox = gr.Checkbox(
+                        label=t("generation.random_seed_label"),
+                        value=True,
+                        info=t("generation.random_seed_info")
+                    )
+                audio_format = gr.Dropdown(
+                    choices=["mp3", "flac"],
+                    value="mp3",
+                    label=t("generation.audio_format_label"),
+                    info=t("generation.audio_format_info")
+                )
+            with gr.Row():
+                use_adg = gr.Checkbox(
+                    label=t("generation.use_adg_label"),
+                    value=False,
+                    info=t("generation.use_adg_info"),
+                    visible=False
+                )
+            with gr.Row():
+                cfg_interval_start = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=0.0,
+                    step=0.01,
+                    label=t("generation.cfg_interval_start"),
+                    visible=False
+                )
+                cfg_interval_end = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=1.0,
+                    step=0.01,
+                    label=t("generation.cfg_interval_end"),
+                    visible=False
+                )
+            # LM (Language Model) Parameters
+            gr.HTML(f"<h4>{t('generation.lm_params_title')}</h4>")
+            with gr.Row():
+                lm_temperature = gr.Slider(
+                    label=t("generation.lm_temperature_label"),
+                    minimum=0.0,
+                    maximum=2.0,
+                    value=0.85,
+                    step=0.1,
+                    scale=1,
+                    info=t("generation.lm_temperature_info")
+                )
+                lm_cfg_scale = gr.Slider(
+                    label=t("generation.lm_cfg_scale_label"),
+                    minimum=1.0,
+                    maximum=3.0,
+                    value=2.0,
+                    step=0.1,
+                    scale=1,
+                    info=t("generation.lm_cfg_scale_info")
+                )
+                lm_top_k = gr.Slider(
+                    label=t("generation.lm_top_k_label"),
+                    minimum=0,
+                    maximum=100,
+                    value=0,
+                    step=1,
+                    scale=1,
+                    info=t("generation.lm_top_k_info")
+                )
+                lm_top_p = gr.Slider(
+                    label=t("generation.lm_top_p_label"),
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=0.9,
+                    step=0.01,
+                    scale=1,
+                    info=t("generation.lm_top_p_info")
+                )
+            with gr.Row():
+                lm_negative_prompt = gr.Textbox(
+                    label=t("generation.lm_negative_prompt_label"),
+                    value="NO USER INPUT",
+                    placeholder=t("generation.lm_negative_prompt_placeholder"),
+                    info=t("generation.lm_negative_prompt_info"),
+                    lines=2,
+                    scale=2,
+                )
+            with gr.Row():
+                use_cot_metas = gr.Checkbox(
+                    label=t("generation.cot_metas_label"),
+                    value=True,
+                    info=t("generation.cot_metas_info"),
+                    scale=1,
+                )
+                use_cot_language = gr.Checkbox(
+                    label=t("generation.cot_language_label"),
+                    value=True,
+                    info=t("generation.cot_language_info"),
+                    scale=1,
+                )
+                constrained_decoding_debug = gr.Checkbox(
+                    label=t("generation.constrained_debug_label"),
+                    value=False,
+                    info=t("generation.constrained_debug_info"),
+                    scale=1,
+                )
+            with gr.Row():
+                auto_score = gr.Checkbox(
+                    label=t("generation.auto_score_label"),
+                    value=False,
+                    info=t("generation.auto_score_info"),
+                    scale=1,
+                )
+                lm_batch_chunk_size = gr.Number(
+                    label=t("generation.lm_batch_chunk_label"),
+                    value=8,
+                    minimum=1,
+                    maximum=32,
+                    step=1,
+                    info=t("generation.lm_batch_chunk_info"),
+                    scale=1,
+                )
+            with gr.Row():
+                audio_cover_strength = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=1.0,
+                    step=0.01,
+                    label=t("generation.codes_strength_label"),
+                    info=t("generation.codes_strength_info"),
+                    scale=1,
+                )
+                score_scale = gr.Slider(
+                    minimum=0.01,
+                    maximum=1.0,
+                    value=0.5,
+                    step=0.01,
+                    label=t("generation.score_sensitivity_label"),
+                    info=t("generation.score_sensitivity_info"),
+                    scale=1,
+                )
+                output_alignment_preference = gr.Checkbox(
+                    label=t("generation.attention_focus_label"),
+                    value=False,
+                    info=t("generation.attention_focus_info"),
+                    interactive=False,
+                    scale=1,
+                )
+        # Set generate_btn to interactive if service is pre-initialized
+        generate_btn_interactive = init_params.get('enable_generate', False) if service_pre_initialized else False
+        with gr.Row(equal_height=True):
+            think_checkbox = gr.Checkbox(
+                label=t("generation.think_label"),
+                value=True,
+                scale=1,
+            )
+            allow_lm_batch = gr.Checkbox(
+                label=t("generation.parallel_thinking_label"),
+                value=True,
+                scale=1,
+            )
+            generate_btn = gr.Button(t("generation.generate_btn"), variant="primary", size="lg", interactive=generate_btn_interactive, scale=9)
+            autogen_checkbox = gr.Checkbox(
+                label=t("generation.autogen_label"),
+                value=True,
+                scale=1,
+            )
+            use_cot_caption = gr.Checkbox(
+                label=t("generation.caption_rewrite_label"),
+                value=True,
+                scale=1,
+            )
+    return {
+        "service_config_accordion": service_config_accordion,
+        "language_dropdown": language_dropdown,
+        "checkpoint_dropdown": checkpoint_dropdown,
+        "refresh_btn": refresh_btn,
+        "config_path": config_path,
+        "device": device,
+        "init_btn": init_btn,
+        "init_status": init_status,
+        "lm_model_path": lm_model_path,
+        "init_llm_checkbox": init_llm_checkbox,
+        "backend_dropdown": backend_dropdown,
+        "use_flash_attention_checkbox": use_flash_attention_checkbox,
+        "offload_to_cpu_checkbox": offload_to_cpu_checkbox,
+        "offload_dit_to_cpu_checkbox": offload_dit_to_cpu_checkbox,
+        "task_type": task_type,
+        "instruction_display_gen": instruction_display_gen,
+        "track_name": track_name,
+        "complete_track_classes": complete_track_classes,
+        "audio_uploads_accordion": audio_uploads_accordion,
+        "reference_audio": reference_audio,
+        "src_audio": src_audio,
+        "convert_src_to_codes_btn": convert_src_to_codes_btn,
+        "text2music_audio_code_string": text2music_audio_code_string,
+        "transcribe_btn": transcribe_btn,
+        "text2music_audio_codes_group": text2music_audio_codes_group,
+        "lm_temperature": lm_temperature,
+        "lm_cfg_scale": lm_cfg_scale,
+        "lm_top_k": lm_top_k,
+        "lm_top_p": lm_top_p,
+        "lm_negative_prompt": lm_negative_prompt,
+        "use_cot_metas": use_cot_metas,
+        "use_cot_caption": use_cot_caption,
+        "use_cot_language": use_cot_language,
+        "repainting_group": repainting_group,
+        "repainting_start": repainting_start,
+        "repainting_end": repainting_end,
+        "audio_cover_strength": audio_cover_strength,
+        "captions": captions,
+        "sample_btn": sample_btn,
+        "load_file": load_file,
+        "lyrics": lyrics,
+        "vocal_language": vocal_language,
+        "bpm": bpm,
+        "key_scale": key_scale,
+        "time_signature": time_signature,
+        "audio_duration": audio_duration,
+        "batch_size_input": batch_size_input,
+        "inference_steps": inference_steps,
+        "guidance_scale": guidance_scale,
+        "seed": seed,
+        "random_seed_checkbox": random_seed_checkbox,
+        "use_adg": use_adg,
+        "cfg_interval_start": cfg_interval_start,
+        "cfg_interval_end": cfg_interval_end,
+        "audio_format": audio_format,
+        "output_alignment_preference": output_alignment_preference,
+        "think_checkbox": think_checkbox,
+        "autogen_checkbox": autogen_checkbox,
+        "generate_btn": generate_btn,
+        "instrumental_checkbox": instrumental_checkbox,
+        "constrained_decoding_debug": constrained_decoding_debug,
+        "score_scale": score_scale,
+        "allow_lm_batch": allow_lm_batch,
+        "auto_score": auto_score,
+        "lm_batch_chunk_size": lm_batch_chunk_size,
+        "codes_single_row": codes_single_row,
+        "codes_batch_row": codes_batch_row,
+        "codes_batch_row_2": codes_batch_row_2,
+        "text2music_audio_code_string_1": text2music_audio_code_string_1,
+        "text2music_audio_code_string_2": text2music_audio_code_string_2,
+        "text2music_audio_code_string_3": text2music_audio_code_string_3,
+        "text2music_audio_code_string_4": text2music_audio_code_string_4,
+        "text2music_audio_code_string_5": text2music_audio_code_string_5,
+        "text2music_audio_code_string_6": text2music_audio_code_string_6,
+        "text2music_audio_code_string_7": text2music_audio_code_string_7,
+        "text2music_audio_code_string_8": text2music_audio_code_string_8,
+        "codes_col_1": codes_col_1,
+        "codes_col_2": codes_col_2,
+        "codes_col_3": codes_col_3,
+        "codes_col_4": codes_col_4,
+        "codes_col_5": codes_col_5,
+        "codes_col_6": codes_col_6,
+        "codes_col_7": codes_col_7,
+        "codes_col_8": codes_col_8,
+    }

acestep/gradio_ui/interfaces/result.py ADDED Viewed

	@@ -0,0 +1,341 @@

+"""
+Gradio UI Results Section Module
+Contains results display section component definitions
+"""
+import gradio as gr
+from acestep.gradio_ui.i18n import t
def create_results_section(dit_handler) -> dict:
    """Create the results display section of the Gradio UI.

    Must be called inside an active Gradio layout context. Builds, in order:
    hidden state holders, a first row with audio slots 1-4 (slots 3 and 4
    start hidden), a second initially hidden row with audio slots 5-8, the
    generation status box, batch navigation controls, the restore-parameters
    button, a collapsed accordion with all generated files and generation
    details, and a collapsed accordion with alignment outputs for samples
    1 and 2.

    Args:
        dit_handler: Not used by this function; kept so the signature stays
            compatible with existing callers.

    Returns:
        Dict mapping component names (e.g. ``"generated_audio_3"``,
        ``"score_btn_7"``, ``"align_plot_2"``) to the created Gradio
        components.
    """

    def _build_audio_slot(n, **column_kwargs):
        """Build one result column: audio player, action buttons, score box."""
        with gr.Column(**column_kwargs) as audio_col:
            generated_audio = gr.Audio(
                label=t("results.generated_music", n=n),
                type="filepath",
                interactive=False
            )
            with gr.Row(equal_height=True):
                send_to_src_btn = gr.Button(
                    t("results.send_to_src_btn"),
                    variant="secondary",
                    size="sm",
                    scale=1
                )
                save_btn = gr.Button(
                    t("results.save_btn"),
                    variant="primary",
                    size="sm",
                    scale=1
                )
                score_btn = gr.Button(
                    t("results.score_btn"),
                    variant="secondary",
                    size="sm",
                    scale=1
                )
            score_display = gr.Textbox(
                label=t("results.quality_score_label", n=n),
                interactive=False,
                placeholder=t("results.quality_score_placeholder")
            )
        return {
            "generated_audio": generated_audio,
            "audio_col": audio_col,
            "send_to_src_btn": send_to_src_btn,
            "save_btn": save_btn,
            "score_btn": score_btn,
            "score_display": score_display,
        }

    def _build_alignment_slot(n):
        """Build one alignment column: score, lyric timestamps, heatmap."""
        with gr.Column():
            align_score = gr.Textbox(label=t("results.attention_score", n=n), interactive=False)
            align_text = gr.Textbox(label=t("results.lyric_timestamps", n=n), interactive=False, lines=10)
            align_plot = gr.Plot(label=t("results.attention_heatmap", n=n))
        return {
            "align_score": align_score,
            "align_text": align_text,
            "align_plot": align_plot,
        }

    with gr.Accordion(t("results.title"), open=True):
        # Hidden state to store LM-generated metadata
        lm_metadata_state = gr.State(value=None)
        # Hidden state to track if caption/metadata is from formatted source (LM/transcription)
        is_format_caption_state = gr.State(value=False)
        # Batch management states
        current_batch_index = gr.State(value=0)  # Currently displayed batch index
        total_batches = gr.State(value=1)  # Total number of batches generated
        batch_queue = gr.State(value={})  # Dictionary storing all batch data
        generation_params_state = gr.State(value={})  # Store generation parameters for next batches
        is_generating_background = gr.State(value=False)  # Background generation flag

        audio_slots = []  # audio_slots[i] holds the components of slot i + 1
        # First row: slots 1-4; only the first two are visible initially
        with gr.Row():
            for n, visible in ((1, True), (2, True), (3, False), (4, False)):
                audio_slots.append(_build_audio_slot(n, visible=visible))
        # Second row for batch size 5-8 (initially hidden as a whole row)
        with gr.Row(visible=False) as audio_row_5_8:
            for n in range(5, 9):
                audio_slots.append(_build_audio_slot(n))

        status_output = gr.Textbox(label=t("results.generation_status"), interactive=False)

        # Batch navigation controls
        with gr.Row(equal_height=True):
            prev_batch_btn = gr.Button(
                t("results.prev_btn"),
                variant="secondary",
                interactive=False,
                scale=1,
                size="sm"
            )
            batch_indicator = gr.Textbox(
                label=t("results.current_batch"),
                value=t("results.batch_indicator", current=1, total=1),
                interactive=False,
                scale=3
            )
            next_batch_status = gr.Textbox(
                label=t("results.next_batch_status"),
                value="",
                interactive=False,
                scale=3
            )
            next_batch_btn = gr.Button(
                t("results.next_btn"),
                variant="primary",
                interactive=False,
                scale=1,
                size="sm"
            )

        # One-click restore parameters button
        restore_params_btn = gr.Button(
            t("results.restore_params_btn"),
            variant="secondary",
            interactive=False,  # Initially disabled, enabled after generation
            size="sm"
        )

        with gr.Accordion(t("results.batch_results_title"), open=False):
            generated_audio_batch = gr.File(
                label=t("results.all_files_label"),
                file_count="multiple",
                interactive=False
            )
            generation_info = gr.Markdown(label=t("results.generation_details"))

        with gr.Accordion(t("results.attention_analysis"), open=False):
            with gr.Row():
                alignment_slots = [_build_alignment_slot(n) for n in (1, 2)]

    def _numbered(field):
        """Return {"<field>_<n>": component} for every audio slot, in slot order."""
        return {
            f"{field}_{n}": slot[field]
            for n, slot in enumerate(audio_slots, start=1)
        }

    components = {
        "lm_metadata_state": lm_metadata_state,
        "is_format_caption_state": is_format_caption_state,
        "current_batch_index": current_batch_index,
        "total_batches": total_batches,
        "batch_queue": batch_queue,
        "generation_params_state": generation_params_state,
        "is_generating_background": is_generating_background,
        "status_output": status_output,
        "prev_batch_btn": prev_batch_btn,
        "batch_indicator": batch_indicator,
        "next_batch_btn": next_batch_btn,
        "next_batch_status": next_batch_status,
        "restore_params_btn": restore_params_btn,
    }
    components.update(_numbered("generated_audio"))
    components["audio_row_5_8"] = audio_row_5_8
    for field in ("audio_col", "send_to_src_btn", "save_btn", "score_btn", "score_display"):
        components.update(_numbered(field))
    components["generated_audio_batch"] = generated_audio_batch
    components["generation_info"] = generation_info
    for n, slot in enumerate(alignment_slots, start=1):
        for field in ("align_score", "align_text", "align_plot"):
            components[f"{field}_{n}"] = slot[field]
    return components

acestep/handler.py CHANGED Viewed

@@ -9,6 +9,7 @@ import tempfile
 import traceback
 import re
 import random
 from contextlib import contextmanager
 from typing import Optional, Dict, Any, Tuple, List, Union
@@ -25,7 +26,7 @@ from transformers.generation.streamers import BaseStreamer
 from diffusers.models import AutoencoderOobleck
 from acestep.constants import (
     TASK_INSTRUCTIONS,
-    TRACK_NAMES,
     DEFAULT_DIT_INSTRUCTION,
 )
@@ -33,16 +34,6 @@ from acestep.constants import (
 warnings.filterwarnings("ignore")
-SFT_GEN_PROMPT = """# Instruction
-{}
-# Caption
-{}
-# Metas
-{}<|endoftext|>
-"""
 class AceStepHandler:
     """ACE-Step Business Logic Handler"""
@@ -2237,12 +2228,16 @@ class AceStepHandler:
                 audio_format_lower = "wav"
             saved_files = []
             for i in range(actual_batch_size):
-                audio_file = os.path.join(self.temp_dir, f"generated_{i}_{actual_seed_list[i]}.{audio_format_lower}")
                 # Convert to numpy: [channels, samples] -> [samples, channels]
                 audio_np = pred_wavs[i].cpu().float().numpy().T
                 sf.write(audio_file, audio_np, self.sample_rate)
                 saved_files.append(audio_file)
             # Prepare return values
             first_audio = saved_files[0] if len(saved_files) > 0 else None

 import traceback
 import re
 import random
+import uuid
 from contextlib import contextmanager
 from typing import Optional, Dict, Any, Tuple, List, Union
 from diffusers.models import AutoencoderOobleck
 from acestep.constants import (
     TASK_INSTRUCTIONS,
+    SFT_GEN_PROMPT,
     DEFAULT_DIT_INSTRUCTION,
 )
 warnings.filterwarnings("ignore")
 class AceStepHandler:
     """ACE-Step Business Logic Handler"""
                 audio_format_lower = "wav"
             saved_files = []
+            saved_uuids = []  # Store UUIDs for each file
             for i in range(actual_batch_size):
+                # Generate unique UUID for each audio file
+                file_uuid = str(uuid.uuid4())
+                audio_file = os.path.join(self.temp_dir, f"{file_uuid}.{audio_format_lower}")
                 # Convert to numpy: [channels, samples] -> [samples, channels]
                 audio_np = pred_wavs[i].cpu().float().numpy().T
                 sf.write(audio_file, audio_np, self.sample_rate)
                 saved_files.append(audio_file)
+                saved_uuids.append(file_uuid)
             # Prepare return values
             first_audio = saved_files[0] if len(saved_files) > 0 else None

acestep/test_time_scaling.py CHANGED Viewed

@@ -228,6 +228,97 @@ def _calculate_log_prob(
     return mean_log_prob
 # ==============================================================================
 # Main Public API
 # ==============================================================================
@@ -300,16 +391,16 @@ def calculate_pmi_score_per_condition(
         # 4. Global Score
         global_score = sum(scores.values()) / len(scores)
         # Status Message
-        status_lines = ["✅ Per-condition scores (0-1):"]
         for key, score in sorted(scores.items()):
             metric = "Top-k Recall" if key in metadata_recall_keys else "PMI (Norm)"
             status_lines.append(f"  {key}: {score:.4f} ({metric})")
-        status_lines.append(f"Global score: {global_score:.4f}")
-        logger.info(f"Calculated scores: {global_score:.4f}")
-        return scores, global_score, "\n".join(status_lines)
     except Exception as e:
         import traceback

     return mean_log_prob
def calculate_reward_score(
    scores: Dict[str, float],
    weights_config: Optional[Dict[str, float]] = None
) -> Tuple[float, str]:
    """
    Reward Model Calculator: Computes a final reward based on user priorities.

    Priority Logic:
        1. Caption (Highest): The overall vibe/style must match.
        2. Lyrics (Medium): Content accuracy is important but secondary to vibe.
        3. Metadata (Lowest): Technical constraints (BPM, Key) allow for slight deviations.

    Strategy: Dynamic Weighted Sum
    - Every key other than 'caption' and 'lyrics' is treated as a metadata
      field; those fields are averaged into a single 'metadata' score first.
    - Weights are renormalized over the components that are actually present,
      so a missing component (e.g., lyrics) does not lower the reward.
    - A score of None is treated the same as a missing score.

    Args:
        scores: Dictionary of raw scores (0.0 - 1.0) from the evaluation module.
        weights_config: Optional custom weights keyed by 'caption', 'lyrics'
                        and 'metadata'. Keys that are not supplied fall back to
                        the defaults: Caption (50%), Lyrics (30%), Metadata (20%).

    Returns:
        final_reward: The calculated reward score (0.0 - 1.0 for in-range inputs).
        explanation: A formatted string with one line per active component,
                     ordered by descending weight, or an error message when no
                     component can be scored.
    """
    # 1. Preference configuration.
    # Custom weights override the defaults key by key, so a partial config
    # (e.g. only 'caption') no longer raises KeyError.
    default_weights = {
        'caption': 0.50,  # High priority: Style/Vibe
        'lyrics': 0.30,   # Medium priority: Content
        'metadata': 0.20  # Low priority: Technical details
    }
    weights = {**default_weights, **(weights_config or {})}

    # 2. Aggregate metadata fields (bpm, key, duration, etc.) into one score so
    # that having many fields cannot dilute the weight of caption/lyrics.
    # None values are skipped, matching how caption/lyrics treat None.
    meta_values = [
        value for key, value in scores.items()
        if key not in ('caption', 'lyrics') and value is not None
    ]
    component_scores = {
        'caption': scores.get('caption'),
        'lyrics': scores.get('lyrics'),
        'metadata': sum(meta_values) / len(meta_values) if meta_values else None,
    }

    # 3. Identify active components: only those with a score in this generation.
    active_components = [
        (name, score, weights[name])
        for name, score in component_scores.items()
        if score is not None
    ]

    # 4. Calculate the final weighted score.
    total_base_weight = sum(weight for _, _, weight in active_components)
    if total_base_weight == 0:
        return 0.0, "❌ No valid scores available to calculate reward."

    # Sort by weight (importance) for display; the sort is stable, so ties keep
    # the caption -> lyrics -> metadata order.
    active_components.sort(key=lambda component: component[2], reverse=True)

    total_score = 0.0
    breakdown_lines = []
    for name, score, base_weight in active_components:
        # Renormalize: if a component is missing, the others scale up proportionately.
        normalized_weight = base_weight / total_base_weight
        weighted_contribution = score * normalized_weight
        total_score += weighted_contribution
        breakdown_lines.append(
            f"  • {name.title():<8} | Score: {score:.4f} | Weight: {normalized_weight:.2f} "
            f"-> Contrib: +{weighted_contribution:.4f}"
        )
    return total_score, "\n".join(breakdown_lines)
 # ==============================================================================
 # Main Public API
 # ==============================================================================
         # 4. Global Score
         global_score = sum(scores.values()) / len(scores)
+        global_score, breakdown_lines = calculate_reward_score(scores)
         # Status Message
+        status_lines = [breakdown_lines, "\n✅ Per-condition scores (0-1):"]
         for key, score in sorted(scores.items()):
             metric = "Top-k Recall" if key in metadata_recall_keys else "PMI (Norm)"
             status_lines.append(f"  {key}: {score:.4f} ({metric})")
+        status = "\n".join(status_lines)
+        logger.info(f"Calculated scores: {global_score:.4f}\n{status}")
+        return scores, global_score, status
     except Exception as e:
         import traceback