Spaces:

victor
/

ace-step-jam

Running on Zero

victor HF Staff committed 10 days ago

Commit

dc5fc4b

0 Parent(s):

feat: ACE-Step Studio — custom frontend for ACE-Step v1.5 music generation

- gr.Server with custom HTML frontend + API endpoints
- v1.5 AceStepHandler with acestep-v15-xl-turbo (8-step turbo)
- Peak normalization, ZeroGPU permission fixes
- /generate and /inspire API endpoints

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +39 -0
.gitignore +1 -0
README.md +31 -0
acestep/__init__.py +1 -0
acestep/acestep_v15_pipeline.py +303 -0
acestep/api_server.py +1700 -0
acestep/audio_utils.py +378 -0
acestep/constants.py +109 -0
acestep/constrained_logits_processor.py +0 -0
acestep/dataset_handler.py +37 -0
acestep/dit_alignment_score.py +870 -0
acestep/genres_vocab.txt +0 -0
acestep/gradio_ui/__init__.py +1 -0
acestep/gradio_ui/events/__init__.py +1355 -0
acestep/gradio_ui/events/generation_handlers.py +1071 -0
acestep/gradio_ui/events/results_handlers.py +0 -0
acestep/gradio_ui/events/training_handlers.py +644 -0
acestep/gradio_ui/i18n.py +152 -0
acestep/gradio_ui/i18n/en.json +245 -0
acestep/gradio_ui/i18n/ja.json +245 -0
acestep/gradio_ui/i18n/zh.json +245 -0
acestep/gradio_ui/interfaces/__init__.py +105 -0
acestep/gradio_ui/interfaces/dataset.py +101 -0
acestep/gradio_ui/interfaces/generation.py +694 -0
acestep/gradio_ui/interfaces/result.py +598 -0
acestep/gradio_ui/interfaces/training.py +562 -0
acestep/handler.py +0 -0
acestep/inference.py +1181 -0
acestep/llm_inference.py +0 -0
acestep/local_cache.py +129 -0
acestep/test_time_scaling.py +410 -0
acestep/third_parts/nano-vllm/LICENSE +21 -0
acestep/third_parts/nano-vllm/README.md +66 -0
acestep/third_parts/nano-vllm/bench.py +32 -0
acestep/third_parts/nano-vllm/example.py +33 -0
acestep/third_parts/nano-vllm/nanovllm/__init__.py +2 -0
acestep/third_parts/nano-vllm/nanovllm/config.py +26 -0
acestep/third_parts/nano-vllm/nanovllm/engine/block_manager.py +119 -0
acestep/third_parts/nano-vllm/nanovllm/engine/llm_engine.py +178 -0
acestep/third_parts/nano-vllm/nanovllm/engine/model_runner.py +543 -0
acestep/third_parts/nano-vllm/nanovllm/engine/scheduler.py +230 -0
acestep/third_parts/nano-vllm/nanovllm/engine/sequence.py +96 -0
acestep/third_parts/nano-vllm/nanovllm/layers/activation.py +14 -0
acestep/third_parts/nano-vllm/nanovllm/layers/attention.py +75 -0
acestep/third_parts/nano-vllm/nanovllm/layers/embed_head.py +66 -0
acestep/third_parts/nano-vllm/nanovllm/layers/layernorm.py +50 -0
acestep/third_parts/nano-vllm/nanovllm/layers/linear.py +153 -0
acestep/third_parts/nano-vllm/nanovllm/layers/rotary_embedding.py +61 -0
acestep/third_parts/nano-vllm/nanovllm/layers/sampler.py +114 -0
acestep/third_parts/nano-vllm/nanovllm/llm.py +5 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,39 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .claude/

README.md ADDED Viewed

	@@ -0,0 +1,31 @@

+---
+title: Ace-Step Studio
+emoji: 🎵
+colorFrom: gray
+colorTo: gray
+sdk: gradio
+sdk_version: 6.12.0
+app_file: app.py
+pinned: false
+license: mit
+short_description: Minimalist dark UI for ACE-Step music generation
+models:
+  - ACE-Step/Ace-Step1.5
+  - ACE-Step/acestep-v15-xl-turbo
+preload_from_hub:
+  - ACE-Step/Ace-Step1.5
+  - ACE-Step/acestep-v15-xl-turbo
+---
+# ACE-Step Studio
+A minimalist, dark-themed interface for generating music with [ACE-Step](https://github.com/ace-step/ACE-Step).
+**Model**: `ACE-Step/acestep-v15-xl-turbo` — generates 1 minute of audio in ~2 seconds (8-step turbo distillation).
+## Usage
+1. Enter style tags (e.g. `lo-fi, chill, piano, female vocals`)
+2. Write lyrics with `[verse]`, `[chorus]`, `[bridge]` section markers
+3. Hit **Generate** — a waveform appears when ready
+4. Use **✨ Inspire me** to auto-generate lyrics via LLM

acestep/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """ACE-Step package."""

acestep/acestep_v15_pipeline.py ADDED Viewed

	@@ -0,0 +1,303 @@

+"""
+ACE-Step V1.5 Pipeline
+Handler wrapper connecting model and UI
+"""
+import os
+import sys
+# Load environment variables from a .env file in the project root (the
+# directory two levels above this file).
+# This allows configuration without hardcoding values.
+# Falls back to .env.example if .env is not found.
+try:
+    from dotenv import load_dotenv
+    # Get project root directory
+    _current_file = os.path.abspath(__file__)
+    _project_root = os.path.dirname(os.path.dirname(_current_file))
+    _env_path = os.path.join(_project_root, '.env')
+    _env_example_path = os.path.join(_project_root, '.env.example')
+    # Prefer the real .env; only one of the two files is ever loaded.
+    if os.path.exists(_env_path):
+        load_dotenv(_env_path)
+        print(f"Loaded configuration from {_env_path}")
+    elif os.path.exists(_env_example_path):
+        load_dotenv(_env_example_path)
+        print(f"Loaded configuration from {_env_example_path} (fallback)")
+except ImportError:
+    # python-dotenv not installed, skip loading .env
+    pass
+# Clear proxy settings that may affect Gradio.
+# Only the variable names listed here are removed from this process's
+# environment (e.g. a lowercase `all_proxy` is left untouched).
+for proxy_var in ['http_proxy', 'https_proxy', 'HTTP_PROXY', 'HTTPS_PROXY', 'ALL_PROXY']:
+    os.environ.pop(proxy_var, None)
+try:
+    # When executed as a module: `python -m acestep.acestep_v15_pipeline`
+    from .handler import AceStepHandler
+    from .llm_inference import LLMHandler
+    from .dataset_handler import DatasetHandler
+    from .gradio_ui import create_gradio_interface
+except ImportError:
+    # When executed as a script: `python acestep/acestep_v15_pipeline.py`
+    # Relative imports fail in that case, so put the project root on
+    # sys.path and retry with absolute imports.
+    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    if project_root not in sys.path:
+        sys.path.insert(0, project_root)
+    from acestep.handler import AceStepHandler
+    from acestep.llm_inference import LLMHandler
+    from acestep.dataset_handler import DatasetHandler
+    from acestep.gradio_ui import create_gradio_interface
+def create_demo(init_params=None, language='en'):
+    """
+    Create Gradio demo interface
+    Args:
+        init_params: Dictionary containing initialization parameters and state.
+                    If None, service will not be pre-initialized.
+                    Keys: 'pre_initialized' (bool), 'checkpoint', 'config_path', 'device',
+                          'init_llm', 'lm_model_path', 'backend', 'use_flash_attention',
+                          'offload_to_cpu', 'offload_dit_to_cpu', 'init_status',
+                          'dit_handler', 'llm_handler' (initialized handlers if pre-initialized),
+                          'language' (UI language code)
+        language: UI language code ('en', 'zh', 'ja', default: 'en')
+    Returns:
+        Gradio Blocks instance
+    """
+    # Get persistent storage path from init_params (for HuggingFace Space)
+    persistent_storage_path = None
+    if init_params:
+        persistent_storage_path = init_params.get('persistent_storage_path')
+    # Use pre-initialized handlers if available, otherwise create new ones
+    if init_params and init_params.get('pre_initialized') and 'dit_handler' in init_params:
+        dit_handler = init_params['dit_handler']
+        llm_handler = init_params['llm_handler']
+    else:
+        dit_handler = AceStepHandler(persistent_storage_path=persistent_storage_path)
+        llm_handler = LLMHandler(persistent_storage_path=persistent_storage_path)
+    dataset_handler = DatasetHandler()  # Dataset handler
+    # Create Gradio interface with all handlers and initialization parameters
+    demo = create_gradio_interface(dit_handler, llm_handler, dataset_handler, init_params=init_params, language=language)
+    return demo
+def get_gpu_memory_gb():
+    """
+    Get GPU memory in GB. Returns 0 if no GPU is available.
+    """
+    try:
+        import torch
+        if torch.cuda.is_available():
+            # Get total memory of the first GPU in GB
+            total_memory = torch.cuda.get_device_properties(0).total_memory
+            memory_gb = total_memory / (1024**3)  # Convert bytes to GB
+            return memory_gb
+        else:
+            return 0
+    except Exception as e:
+        print(f"Warning: Failed to detect GPU memory: {e}", file=sys.stderr)
+        return 0
+def main():
+    """Command-line entry point: parse arguments, optionally pre-initialize
+    the DiT and LM handlers, then build and launch the Gradio demo.
+
+    Exits the process with status 1 when no model can be selected, when DiT
+    initialization fails, or when any other exception is raised while
+    initializing or launching.
+    """
+    import argparse
+    # Detect GPU memory to auto-configure offload settings
+    gpu_memory_gb = get_gpu_memory_gb()
+    # Offload is auto-enabled only when a GPU exists and has less than 16 GB.
+    auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < 16
+    if auto_offload:
+        print(f"Detected GPU memory: {gpu_memory_gb:.2f} GB (< 16GB)")
+        print("Auto-enabling CPU offload to reduce GPU memory usage")
+    elif gpu_memory_gb > 0:
+        print(f"Detected GPU memory: {gpu_memory_gb:.2f} GB (>= 16GB)")
+        print("CPU offload disabled by default")
+    else:
+        print("No GPU detected, running on CPU")
+    parser = argparse.ArgumentParser(description="Gradio Demo for ACE-Step V1.5")
+    parser.add_argument("--port", type=int, default=7860, help="Port to run the gradio server on")
+    parser.add_argument("--share", action="store_true", help="Create a public link")
+    parser.add_argument("--debug", action="store_true", help="Enable debug mode")
+    parser.add_argument("--server-name", type=str, default="127.0.0.1", help="Server name (default: 127.0.0.1, use 0.0.0.0 for all interfaces)")
+    parser.add_argument("--language", type=str, default="en", choices=["en", "zh", "ja"], help="UI language: en (English), zh (中文), ja (日本語)")
+    # Boolean options below take an explicit value; only 'true', '1' or 'yes'
+    # (case-insensitive) parse as True, any other string parses as False.
+    # Service mode argument
+    parser.add_argument("--service_mode", type=lambda x: x.lower() in ['true', '1', 'yes'], default=False,
+                       help="Enable service mode (default: False). When enabled, uses preset models and restricts UI options.")
+    # Service initialization arguments
+    parser.add_argument("--init_service", type=lambda x: x.lower() in ['true', '1', 'yes'], default=False, help="Initialize service on startup (default: False)")
+    parser.add_argument("--checkpoint", type=str, default=None, help="Checkpoint file path (optional, for display purposes)")
+    parser.add_argument("--config_path", type=str, default=None, help="Main model path (e.g., 'acestep-v15-turbo')")
+    parser.add_argument("--device", type=str, default="auto", choices=["auto", "cuda", "cpu"], help="Processing device (default: auto)")
+    parser.add_argument("--init_llm", type=lambda x: x.lower() in ['true', '1', 'yes'], default=True, help="Initialize 5Hz LM (default: True)")
+    parser.add_argument("--lm_model_path", type=str, default=None, help="5Hz LM model path (e.g., 'acestep-5Hz-lm-0.6B')")
+    parser.add_argument("--backend", type=str, default="vllm", choices=["vllm", "pt"], help="5Hz LM backend (default: vllm)")
+    parser.add_argument("--use_flash_attention", type=lambda x: x.lower() in ['true', '1', 'yes'], default=None, help="Use flash attention (default: auto-detect)")
+    parser.add_argument("--offload_to_cpu", type=lambda x: x.lower() in ['true', '1', 'yes'], default=auto_offload, help=f"Offload models to CPU (default: {'True' if auto_offload else 'False'}, auto-detected based on GPU VRAM)")
+    parser.add_argument("--offload_dit_to_cpu", type=lambda x: x.lower() in ['true', '1', 'yes'], default=False, help="Offload DiT to CPU (default: False)")
+    args = parser.parse_args()
+    # Service mode defaults (can be configured via .env file)
+    if args.service_mode:
+        print("Service mode enabled - applying preset configurations...")
+        # Force init_service in service mode
+        args.init_service = True
+        # Default DiT model for service mode (from env or fallback)
+        if args.config_path is None:
+            args.config_path = os.environ.get(
+                "SERVICE_MODE_DIT_MODEL",
+                "acestep-v15-turbo-fix-inst-shift-dynamic"
+            )
+        # Default LM model for service mode (from env or fallback)
+        if args.lm_model_path is None:
+            args.lm_model_path = os.environ.get(
+                "SERVICE_MODE_LM_MODEL",
+                "acestep-5Hz-lm-1.7B-v4-fix"
+            )
+        # Backend for service mode (from env or fallback to vllm)
+        # NOTE: this assignment is unconditional, so in service mode any
+        # --backend value given on the command line is overridden.
+        args.backend = os.environ.get("SERVICE_MODE_BACKEND", "vllm")
+        print(f"  DiT model: {args.config_path}")
+        print(f"  LM model: {args.lm_model_path}")
+        print(f"  Backend: {args.backend}")
+    try:
+        # Stays None unless --init_service (or service mode) is enabled; the
+        # UI then creates its own un-initialized handlers in create_demo().
+        init_params = None
+        # If init_service is True, perform initialization before creating UI
+        if args.init_service:
+            print("Initializing service from command line...")
+            # Create handler instances for initialization
+            dit_handler = AceStepHandler()
+            llm_handler = LLMHandler()
+            # Auto-select config_path if not provided
+            if args.config_path is None:
+                available_models = dit_handler.get_available_acestep_v15_models()
+                if available_models:
+                    # Prefer "acestep-v15-turbo", otherwise the first listed model.
+                    args.config_path = "acestep-v15-turbo" if "acestep-v15-turbo" in available_models else available_models[0]
+                    print(f"Auto-selected config_path: {args.config_path}")
+                else:
+                    print("Error: No available models found. Please specify --config_path", file=sys.stderr)
+                    sys.exit(1)
+            # Get project root (same logic as in handler)
+            current_file = os.path.abspath(__file__)
+            project_root = os.path.dirname(os.path.dirname(current_file))
+            # Determine flash attention setting
+            use_flash_attention = args.use_flash_attention
+            if use_flash_attention is None:
+                use_flash_attention = dit_handler.is_flash_attention_available()
+            # Initialize DiT handler
+            print(f"Initializing DiT model: {args.config_path} on {args.device}...")
+            init_status, enable_generate = dit_handler.initialize_service(
+                project_root=project_root,
+                config_path=args.config_path,
+                device=args.device,
+                use_flash_attention=use_flash_attention,
+                compile_model=False,
+                offload_to_cpu=args.offload_to_cpu,
+                offload_dit_to_cpu=args.offload_dit_to_cpu
+            )
+            # A DiT failure is fatal; an LM failure further below is only a warning.
+            if not enable_generate:
+                print(f"Error initializing DiT model: {init_status}", file=sys.stderr)
+                sys.exit(1)
+            print(f"DiT model initialized successfully")
+            # Initialize LM handler if requested
+            lm_status = ""
+            if args.init_llm:
+                if args.lm_model_path is None:
+                    # Try to get default LM model
+                    available_lm_models = llm_handler.get_available_5hz_lm_models()
+                    if available_lm_models:
+                        args.lm_model_path = available_lm_models[0]
+                        print(f"Using default LM model: {args.lm_model_path}")
+                    else:
+                        print("Warning: No LM models available, skipping LM initialization", file=sys.stderr)
+                        args.init_llm = False
+                if args.init_llm and args.lm_model_path:
+                    checkpoint_dir = os.path.join(project_root, "checkpoints")
+                    print(f"Initializing 5Hz LM: {args.lm_model_path} on {args.device}...")
+                    lm_status, lm_success = llm_handler.initialize(
+                        checkpoint_dir=checkpoint_dir,
+                        lm_model_path=args.lm_model_path,
+                        backend=args.backend,
+                        device=args.device,
+                        offload_to_cpu=args.offload_to_cpu,
+                        dtype=dit_handler.dtype
+                    )
+                    # Either way the LM status text is appended to init_status
+                    # so that it is passed on to the UI.
+                    if lm_success:
+                        print(f"5Hz LM initialized successfully")
+                        init_status += f"\n{lm_status}"
+                    else:
+                        print(f"Warning: 5Hz LM initialization failed: {lm_status}", file=sys.stderr)
+                        init_status += f"\n{lm_status}"
+            # Prepare initialization parameters for UI
+            init_params = {
+                'pre_initialized': True,
+                'service_mode': args.service_mode,
+                'checkpoint': args.checkpoint,
+                'config_path': args.config_path,
+                'device': args.device,
+                'init_llm': args.init_llm,
+                'lm_model_path': args.lm_model_path,
+                'backend': args.backend,
+                'use_flash_attention': use_flash_attention,
+                'offload_to_cpu': args.offload_to_cpu,
+                'offload_dit_to_cpu': args.offload_dit_to_cpu,
+                'init_status': init_status,
+                'enable_generate': enable_generate,
+                'dit_handler': dit_handler,
+                'llm_handler': llm_handler,
+                'language': args.language
+            }
+            print("Service initialization completed successfully!")
+        # Create and launch demo
+        print(f"Creating Gradio interface with language: {args.language}...")
+        demo = create_demo(init_params=init_params, language=args.language)
+        # Enable queue for multi-user support
+        # This ensures proper request queuing and prevents concurrent generation conflicts
+        print("Enabling queue for multi-user support...")
+        demo.queue(
+            max_size=20,  # Maximum queue size (adjust based on your needs)
+            status_update_rate="auto",  # Update rate for queue status
+        )
+        print(f"Launching server on {args.server_name}:{args.port}...")
+        demo.launch(
+            server_name=args.server_name,
+            server_port=args.port,
+            share=args.share,
+            debug=args.debug,
+            show_error=True,
+            prevent_thread_lock=False,  # Keep thread locked to maintain server running
+            inbrowser=False,  # Don't auto-open browser
+        )
+    except Exception as e:
+        # The sys.exit(1) calls above raise SystemExit, which is not an
+        # Exception subclass, so they pass through this handler untouched.
+        print(f"Error launching Gradio: {e}", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
+# Run the CLI only when this file is executed directly, not when imported.
+if __name__ == "__main__":
+    main()

acestep/api_server.py ADDED Viewed

	@@ -0,0 +1,1700 @@

+"""FastAPI server for ACE-Step V1.5.
+Endpoints:
+- POST /release_task     Create music generation task
+- POST /query_result     Batch query task results
+- POST /v1/music/random  Create random sample task
+- GET  /v1/models        List available models
+- GET  /v1/audio         Download audio file
+- GET  /health           Health check
+NOTE:
+- In-memory queue and job store -> run uvicorn with workers=1.
+"""
+from __future__ import annotations
+import asyncio
+import json
+import os
+import sys
+import time
+import traceback
+import tempfile
+import urllib.parse
+from collections import deque
+from concurrent.futures import ThreadPoolExecutor
+from contextlib import asynccontextmanager
+from dataclasses import dataclass
+from pathlib import Path
+from threading import Lock
+from typing import Any, Dict, List, Literal, Optional
+from uuid import uuid4
+try:
+    from dotenv import load_dotenv
+except ImportError:  # Optional dependency
+    load_dotenv = None  # type: ignore
+from fastapi import FastAPI, HTTPException, Request
+from pydantic import BaseModel, Field
+from starlette.datastructures import UploadFile as StarletteUploadFile
+from acestep.handler import AceStepHandler
+from acestep.llm_inference import LLMHandler
+from acestep.constants import (
+    DEFAULT_DIT_INSTRUCTION,
+    DEFAULT_LM_INSTRUCTION,
+    TASK_INSTRUCTIONS,
+)
+from acestep.inference import (
+    GenerationParams,
+    GenerationConfig,
+    generate_music,
+    create_sample,
+    format_sample,
+)
+from acestep.gradio_ui.events.results_handlers import _build_generation_info
+# =============================================================================
+# Constants
+# =============================================================================
+# Prefix for result keys.
+RESULT_KEY_PREFIX = "ace_step_v1.5_"
+RESULT_EXPIRE_SECONDS = 7 * 24 * 60 * 60  # 7 days
+TASK_TIMEOUT_SECONDS = 3600  # 1 hour
+# Integer code per job status; note that "queued" and "running" share code 0.
+STATUS_MAP = {"queued": 0, "running": 0, "succeeded": 1, "failed": 2}
+# Default 5Hz LM sampling settings (same values as the lm_temperature,
+# lm_cfg_scale and lm_top_p defaults on GenerateMusicRequest).
+LM_DEFAULT_TEMPERATURE = 0.85
+LM_DEFAULT_CFG_SCALE = 2.5
+LM_DEFAULT_TOP_P = 0.9
+# Parameter aliases for request parsing.
+# Maps each canonical parameter name to the list of request keys accepted for
+# it (snake_case, camelCase and a few short forms).
+PARAM_ALIASES = {
+    "prompt": ["prompt"],
+    "sample_mode": ["sample_mode", "sampleMode"],
+    "sample_query": ["sample_query", "sampleQuery", "description", "desc"],
+    "use_format": ["use_format", "useFormat", "format"],
+    "model": ["model", "dit_model", "ditModel"],
+    "key_scale": ["key_scale", "keyscale", "keyScale"],
+    "time_signature": ["time_signature", "timesignature", "timeSignature"],
+    "audio_duration": ["audio_duration", "duration", "audioDuration", "target_duration", "targetDuration"],
+    "vocal_language": ["vocal_language", "vocalLanguage"],
+    "inference_steps": ["inference_steps", "inferenceSteps"],
+    "guidance_scale": ["guidance_scale", "guidanceScale"],
+    "use_random_seed": ["use_random_seed", "useRandomSeed"],
+    "audio_code_string": ["audio_code_string", "audioCodeString"],
+    "audio_cover_strength": ["audio_cover_strength", "audioCoverStrength"],
+    "task_type": ["task_type", "taskType"],
+    "infer_method": ["infer_method", "inferMethod"],
+    "use_tiled_decode": ["use_tiled_decode", "useTiledDecode"],
+    "constrained_decoding": ["constrained_decoding", "constrainedDecoding", "constrained"],
+    "constrained_decoding_debug": ["constrained_decoding_debug", "constrainedDecodingDebug"],
+    "use_cot_caption": ["use_cot_caption", "cot_caption", "cot-caption"],
+    "use_cot_language": ["use_cot_language", "cot_language", "cot-language"],
+    "is_format_caption": ["is_format_caption", "isFormatCaption"],
+}
+def _parse_description_hints(description: str) -> tuple[Optional[str], bool]:
+    """
+    Parse a description string to extract language code and instrumental flag.
+    This function analyzes user descriptions like "Pop rock. English" or "piano solo"
+    to detect:
+    - Language: Maps language names to ISO codes (e.g., "English" -> "en")
+    - Instrumental: Detects patterns indicating instrumental/no-vocal music
+    Args:
+        description: User's natural language music description
+    Returns:
+        (language_code, is_instrumental) tuple:
+        - language_code: ISO language code (e.g., "en", "zh") or None if not detected
+        - is_instrumental: True if description indicates instrumental music
+    """
+    import re
+    if not description:
+        return None, False
+    description_lower = description.lower().strip()
+    # Language mapping: input patterns -> ISO code
+    language_mapping = {
+        'english': 'en', 'en': 'en',
+        'chinese': 'zh', '中文': 'zh', 'zh': 'zh', 'mandarin': 'zh',
+        'japanese': 'ja', '日本語': 'ja', 'ja': 'ja',
+        'korean': 'ko', '한국어': 'ko', 'ko': 'ko',
+        'spanish': 'es', 'español': 'es', 'es': 'es',
+        'french': 'fr', 'français': 'fr', 'fr': 'fr',
+        'german': 'de', 'deutsch': 'de', 'de': 'de',
+        'italian': 'it', 'italiano': 'it', 'it': 'it',
+        'portuguese': 'pt', 'português': 'pt', 'pt': 'pt',
+        'russian': 'ru', 'русский': 'ru', 'ru': 'ru',
+        'bengali': 'bn', 'bn': 'bn',
+        'hindi': 'hi', 'hi': 'hi',
+        'arabic': 'ar', 'ar': 'ar',
+        'thai': 'th', 'th': 'th',
+        'vietnamese': 'vi', 'vi': 'vi',
+        'indonesian': 'id', 'id': 'id',
+        'turkish': 'tr', 'tr': 'tr',
+        'dutch': 'nl', 'nl': 'nl',
+        'polish': 'pl', 'pl': 'pl',
+    }
+    # Detect language
+    detected_language = None
+    for lang_name, lang_code in language_mapping.items():
+        if len(lang_name) <= 2:
+            pattern = r'(?:^|\s|[.,;:!?])' + re.escape(lang_name) + r'(?:$|\s|[.,;:!?])'
+        else:
+            pattern = r'\b' + re.escape(lang_name) + r'\b'
+        if re.search(pattern, description_lower):
+            detected_language = lang_code
+            break
+    # Detect instrumental
+    is_instrumental = False
+    if 'instrumental' in description_lower:
+        is_instrumental = True
+    elif 'pure music' in description_lower or 'pure instrument' in description_lower:
+        is_instrumental = True
+    elif description_lower.endswith(' solo') or description_lower == 'solo':
+        is_instrumental = True
+    return detected_language, is_instrumental
+# Allowed lifecycle states of a job.
+JobStatus = Literal["queued", "running", "succeeded", "failed"]
+# Request body describing one music generation task: prompt/lyrics, musical
+# metadata, diffusion settings and 5Hz LM settings. Every field has a default.
+class GenerateMusicRequest(BaseModel):
+    prompt: str = Field(default="", description="Text prompt describing the music")
+    lyrics: str = Field(default="", description="Lyric text")
+    # New API semantics:
+    # - thinking=True: use 5Hz LM to generate audio codes (lm-dit behavior)
+    # - thinking=False: do not use LM to generate codes (dit behavior)
+    # Regardless of thinking, if some metas are missing, server may use LM to fill them.
+    thinking: bool = False
+    # Sample-mode requests auto-generate caption/lyrics/metas via LM (no user prompt).
+    sample_mode: bool = False
+    # Description for sample mode: auto-generate caption/lyrics from description query
+    sample_query: str = Field(default="", description="Query/description for sample mode (use create_sample)")
+    # Whether to use format_sample() to enhance input caption/lyrics
+    use_format: bool = Field(default=False, description="Use format_sample() to enhance input (default: False)")
+    # Model name for multi-model support (select which DiT model to use)
+    model: Optional[str] = Field(default=None, description="Model name to use (e.g., 'acestep-v15-turbo')")
+    bpm: Optional[int] = None
+    # Accept common client keys via manual parsing (see RequestParser).
+    key_scale: str = ""
+    time_signature: str = ""
+    vocal_language: str = "en"
+    inference_steps: int = 8
+    guidance_scale: float = 7.0
+    use_random_seed: bool = True
+    seed: int = -1
+    reference_audio_path: Optional[str] = None
+    src_audio_path: Optional[str] = None
+    audio_duration: Optional[float] = None
+    batch_size: Optional[int] = None
+    audio_code_string: str = ""
+    repainting_start: float = 0.0
+    repainting_end: Optional[float] = None
+    instruction: str = DEFAULT_DIT_INSTRUCTION
+    audio_cover_strength: float = 1.0
+    task_type: str = "text2music"
+    use_adg: bool = False
+    cfg_interval_start: float = 0.0
+    cfg_interval_end: float = 1.0
+    infer_method: str = "ode"  # "ode" or "sde" - diffusion inference method
+    shift: float = Field(
+        default=3.0,
+        description="Timestep shift factor (range 1.0~5.0, default 3.0). Only effective for base models, not turbo models."
+    )
+    timesteps: Optional[str] = Field(
+        default=None,
+        description="Custom timesteps (comma-separated, e.g., '0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0'). Overrides inference_steps and shift."
+    )
+    audio_format: str = "mp3"
+    use_tiled_decode: bool = True
+    # 5Hz LM (server-side): used for metadata completion and (when thinking=True) codes generation.
+    lm_model_path: Optional[str] = None  # e.g. "acestep-5Hz-lm-0.6B"
+    lm_backend: Literal["vllm", "pt"] = "vllm"
+    constrained_decoding: bool = True
+    constrained_decoding_debug: bool = False
+    use_cot_caption: bool = True
+    use_cot_language: bool = True
+    is_format_caption: bool = False
+    # The LM sampling defaults below repeat the values of the module-level
+    # LM_DEFAULT_TEMPERATURE / LM_DEFAULT_CFG_SCALE / LM_DEFAULT_TOP_P constants.
+    lm_temperature: float = 0.85
+    lm_cfg_scale: float = 2.5
+    lm_top_k: Optional[int] = None
+    lm_top_p: Optional[float] = 0.9
+    lm_repetition_penalty: float = 1.0
+    lm_negative_prompt: str = "NO USER INPUT"
+    class Config:
+        # Pydantic v1-style configuration.
+        # NOTE(review): `allow_population_by_alias` does not look like a
+        # documented pydantic option (v1 uses `allow_population_by_field_name`,
+        # v2 uses `populate_by_name`) — confirm against the pinned version.
+        allow_population_by_field_name = True
+        allow_population_by_alias = True
+# Response model carrying the new task's id, its status and queue position.
+# NOTE(review): presumably returned when a task is created — confirm at the endpoint.
+class CreateJobResponse(BaseModel):
+    task_id: str
+    status: JobStatus
+    queue_position: int = 0  # 1-based best-effort position when queued
+# Result payload of a job: audio paths, status text, seed and the metadata
+# fields listed below. Every field is optional or has an empty default.
+class JobResult(BaseModel):
+    first_audio_path: Optional[str] = None
+    second_audio_path: Optional[str] = None
+    audio_paths: list[str] = Field(default_factory=list)
+    generation_info: str = ""
+    status_message: str = ""
+    seed_value: str = ""
+    metas: Dict[str, Any] = Field(default_factory=dict)
+    bpm: Optional[int] = None
+    duration: Optional[float] = None
+    genres: Optional[str] = None
+    keyscale: Optional[str] = None
+    timesignature: Optional[str] = None
+    # Model information
+    lm_model: Optional[str] = None
+    dit_model: Optional[str] = None
+# Status view of a single job: lifecycle timestamps, queue statistics and,
+# when available, either a result or an error message.
+class JobResponse(BaseModel):
+    job_id: str
+    status: JobStatus
+    created_at: float
+    started_at: Optional[float] = None
+    finished_at: Optional[float] = None
+    # queue observability
+    queue_position: int = 0
+    eta_seconds: Optional[float] = None
+    avg_job_seconds: Optional[float] = None
+    # NOTE(review): presumably at most one of result/error is set — confirm with the code that builds this model.
+    result: Optional[JobResult] = None
+    error: Optional[str] = None
+# Internal mutable record for one job, as stored and updated by _JobStore.
+@dataclass
+class _JobRecord:
+    job_id: str
+    status: JobStatus
+    created_at: float  # time.time() value taken when the record is created
+    started_at: Optional[float] = None  # set by _JobStore.mark_running
+    finished_at: Optional[float] = None  # set by mark_succeeded / mark_failed
+    result: Optional[Dict[str, Any]] = None
+    error: Optional[str] = None
+    env: str = "development"
+class _JobStore:
+    def __init__(self) -> None:
+        self._lock = Lock()
+        self._jobs: Dict[str, _JobRecord] = {}
+    def create(self) -> _JobRecord:
+        job_id = str(uuid4())
+        rec = _JobRecord(job_id=job_id, status="queued", created_at=time.time())
+        with self._lock:
+            self._jobs[job_id] = rec
+        return rec
+    def create_with_id(self, job_id: str, env: str = "development") -> _JobRecord:
+        """Create job record with specified ID"""
+        rec = _JobRecord(
+            job_id=job_id,
+            status="queued",
+            created_at=time.time(),
+            env=env
+        )
+        with self._lock:
+            self._jobs[job_id] = rec
+        return rec
+    def get(self, job_id: str) -> Optional[_JobRecord]:
+        with self._lock:
+            return self._jobs.get(job_id)
+    def mark_running(self, job_id: str) -> None:
+        with self._lock:
+            rec = self._jobs[job_id]
+            rec.status = "running"
+            rec.started_at = time.time()
+    def mark_succeeded(self, job_id: str, result: Dict[str, Any]) -> None:
+        with self._lock:
+            rec = self._jobs[job_id]
+            rec.status = "succeeded"
+            rec.finished_at = time.time()
+            rec.result = result
+            rec.error = None
+    def mark_failed(self, job_id: str, error: str) -> None:
+        with self._lock:
+            rec = self._jobs[job_id]
+            rec.status = "failed"
+            rec.finished_at = time.time()
+            rec.result = None
+            rec.error = error
+def _env_bool(name: str, default: bool) -> bool:
+    v = os.getenv(name)
+    if v is None:
+        return default
+    return v.strip().lower() in {"1", "true", "yes", "y", "on"}
+def _get_project_root() -> str:
+    current_file = os.path.abspath(__file__)
+    return os.path.dirname(os.path.dirname(current_file))
+def _get_model_name(config_path: str) -> str:
+    """
+    Extract model name from config_path.
+    Args:
+        config_path: Path like "acestep-v15-turbo" or "/path/to/acestep-v15-turbo"
+    Returns:
+        Model name (last directory name from config_path)
+    """
+    if not config_path:
+        return ""
+    normalized = config_path.rstrip("/\\")
+    return os.path.basename(normalized)
+def _load_project_env() -> None:
+    if load_dotenv is None:
+        return
+    try:
+        project_root = _get_project_root()
+        env_path = os.path.join(project_root, ".env")
+        if os.path.exists(env_path):
+            load_dotenv(env_path, override=False)
+    except Exception:
+        # Optional best-effort: continue even if .env loading fails.
+        pass
+_load_project_env()
+def _to_int(v: Any, default: Optional[int] = None) -> Optional[int]:
+    if v is None:
+        return default
+    if isinstance(v, int):
+        return v
+    s = str(v).strip()
+    if s == "":
+        return default
+    try:
+        return int(s)
+    except Exception:
+        return default
+def _to_float(v: Any, default: Optional[float] = None) -> Optional[float]:
+    if v is None:
+        return default
+    if isinstance(v, float):
+        return v
+    s = str(v).strip()
+    if s == "":
+        return default
+    try:
+        return float(s)
+    except Exception:
+        return default
+def _to_bool(v: Any, default: bool = False) -> bool:
+    if v is None:
+        return default
+    if isinstance(v, bool):
+        return v
+    s = str(v).strip().lower()
+    if s == "":
+        return default
+    return s in {"1", "true", "yes", "y", "on"}
+def _map_status(status: str) -> int:
+    """Map job status string to integer code."""
+    return STATUS_MAP.get(status, 2)
+def _parse_timesteps(s: Optional[str]) -> Optional[List[float]]:
+    """Parse comma-separated timesteps string to list of floats."""
+    if not s or not s.strip():
+        return None
+    try:
+        return [float(t.strip()) for t in s.split(",") if t.strip()]
+    except (ValueError, Exception):
+        return None
+class RequestParser:
+    """Parse request parameters from multiple sources with alias support."""
+    def __init__(self, raw: dict):
+        self._raw = dict(raw) if raw else {}
+        self._param_obj = self._parse_json(self._raw.get("param_obj"))
+        self._metas = self._find_metas()
+    def _parse_json(self, v) -> dict:
+        if isinstance(v, dict):
+            return v
+        if isinstance(v, str) and v.strip():
+            try:
+                return json.loads(v)
+            except Exception:
+                pass
+        return {}
+    def _find_metas(self) -> dict:
+        for key in ("metas", "meta", "metadata", "user_metadata", "userMetadata"):
+            v = self._raw.get(key)
+            if v:
+                return self._parse_json(v)
+        return {}
+    def get(self, name: str, default=None):
+        """Get parameter by canonical name from all sources."""
+        aliases = PARAM_ALIASES.get(name, [name])
+        for source in (self._raw, self._param_obj, self._metas):
+            for alias in aliases:
+                v = source.get(alias)
+                if v is not None:
+                    return v
+        return default
+    def str(self, name: str, default: str = "") -> str:
+        v = self.get(name)
+        return str(v) if v is not None else default
+    def int(self, name: str, default: Optional[int] = None) -> Optional[int]:
+        return _to_int(self.get(name), default)
+    def float(self, name: str, default: Optional[float] = None) -> Optional[float]:
+        return _to_float(self.get(name), default)
+    def bool(self, name: str, default: bool = False) -> bool:
+        return _to_bool(self.get(name), default)
+async def _save_upload_to_temp(upload: StarletteUploadFile, *, prefix: str) -> str:
+    suffix = Path(upload.filename or "").suffix
+    fd, path = tempfile.mkstemp(prefix=f"{prefix}_", suffix=suffix)
+    os.close(fd)
+    try:
+        with open(path, "wb") as f:
+            while True:
+                chunk = await upload.read(1024 * 1024)
+                if not chunk:
+                    break
+                f.write(chunk)
+    except Exception:
+        try:
+            os.remove(path)
+        except Exception:
+            pass
+        raise
+    finally:
+        try:
+            await upload.close()
+        except Exception:
+            pass
+    return path
+def create_app() -> FastAPI:
+    store = _JobStore()
+    QUEUE_MAXSIZE = int(os.getenv("ACESTEP_QUEUE_MAXSIZE", "200"))
+    WORKER_COUNT = int(os.getenv("ACESTEP_QUEUE_WORKERS", "1"))  # Single GPU recommended
+    INITIAL_AVG_JOB_SECONDS = float(os.getenv("ACESTEP_AVG_JOB_SECONDS", "5.0"))
+    AVG_WINDOW = int(os.getenv("ACESTEP_AVG_WINDOW", "50"))
+    def _path_to_audio_url(path: str) -> str:
+        """Convert local file path to downloadable relative URL"""
+        if not path:
+            return path
+        if path.startswith("http://") or path.startswith("https://"):
+            return path
+        encoded_path = urllib.parse.quote(path, safe="")
+        return f"/v1/audio?path={encoded_path}"
+    @asynccontextmanager
+    async def lifespan(app: FastAPI):
+        # Clear proxy env that may affect downstream libs
+        for proxy_var in ["http_proxy", "https_proxy", "HTTP_PROXY", "HTTPS_PROXY", "ALL_PROXY"]:
+            os.environ.pop(proxy_var, None)
+        # Ensure compilation/temp caches do not fill up small default /tmp.
+        # Triton/Inductor (and the system compiler) can create large temporary files.
+        project_root = _get_project_root()
+        cache_root = os.path.join(project_root, ".cache", "acestep")
+        tmp_root = (os.getenv("ACESTEP_TMPDIR") or os.path.join(cache_root, "tmp")).strip()
+        triton_cache_root = (os.getenv("TRITON_CACHE_DIR") or os.path.join(cache_root, "triton")).strip()
+        inductor_cache_root = (os.getenv("TORCHINDUCTOR_CACHE_DIR") or os.path.join(cache_root, "torchinductor")).strip()
+        for p in [cache_root, tmp_root, triton_cache_root, inductor_cache_root]:
+            try:
+                os.makedirs(p, exist_ok=True)
+            except Exception:
+                # Best-effort: do not block startup if directory creation fails.
+                pass
+        # Respect explicit user overrides; if ACESTEP_TMPDIR is set, it should win.
+        if os.getenv("ACESTEP_TMPDIR"):
+            os.environ["TMPDIR"] = tmp_root
+            os.environ["TEMP"] = tmp_root
+            os.environ["TMP"] = tmp_root
+        else:
+            os.environ.setdefault("TMPDIR", tmp_root)
+            os.environ.setdefault("TEMP", tmp_root)
+            os.environ.setdefault("TMP", tmp_root)
+        os.environ.setdefault("TRITON_CACHE_DIR", triton_cache_root)
+        os.environ.setdefault("TORCHINDUCTOR_CACHE_DIR", inductor_cache_root)
+        handler = AceStepHandler()
+        llm_handler = LLMHandler()
+        init_lock = asyncio.Lock()
+        app.state._initialized = False
+        app.state._init_error = None
+        app.state._init_lock = init_lock
+        app.state.llm_handler = llm_handler
+        app.state._llm_initialized = False
+        app.state._llm_init_error = None
+        app.state._llm_init_lock = Lock()
+        # Multi-model support: secondary DiT handlers
+        handler2 = None
+        handler3 = None
+        config_path2 = os.getenv("ACESTEP_CONFIG_PATH2", "").strip()
+        config_path3 = os.getenv("ACESTEP_CONFIG_PATH3", "").strip()
+        if config_path2:
+            handler2 = AceStepHandler()
+        if config_path3:
+            handler3 = AceStepHandler()
+        app.state.handler2 = handler2
+        app.state.handler3 = handler3
+        app.state._initialized2 = False
+        app.state._initialized3 = False
+        app.state._config_path = os.getenv("ACESTEP_CONFIG_PATH", "acestep-v15-turbo")
+        app.state._config_path2 = config_path2
+        app.state._config_path3 = config_path3
+        max_workers = int(os.getenv("ACESTEP_API_WORKERS", "1"))
+        executor = ThreadPoolExecutor(max_workers=max_workers)
+        # Queue & observability
+        app.state.job_queue = asyncio.Queue(maxsize=QUEUE_MAXSIZE)  # (job_id, req)
+        app.state.pending_ids = deque()  # queued job_ids
+        app.state.pending_lock = asyncio.Lock()
+        # temp files per job (from multipart uploads)
+        app.state.job_temp_files = {}  # job_id -> list[path]
+        app.state.job_temp_files_lock = asyncio.Lock()
+        # stats
+        app.state.stats_lock = asyncio.Lock()
+        app.state.recent_durations = deque(maxlen=AVG_WINDOW)
+        app.state.avg_job_seconds = INITIAL_AVG_JOB_SECONDS
+        app.state.handler = handler
+        app.state.executor = executor
+        app.state.job_store = store
+        app.state._python_executable = sys.executable
+        # Temporary directory for saving generated audio files
+        app.state.temp_audio_dir = os.path.join(tmp_root, "api_audio")
+        os.makedirs(app.state.temp_audio_dir, exist_ok=True)
+        # Initialize local cache
+        try:
+            from acestep.local_cache import get_local_cache
+            local_cache_dir = os.path.join(cache_root, "local_redis")
+            app.state.local_cache = get_local_cache(local_cache_dir)
+        except ImportError:
+            app.state.local_cache = None
+        async def _ensure_initialized() -> None:
+            """Initialize the primary handler (and any configured extra handlers) on first use.
+            Returns immediately once `app.state._initialized` is set. If a previous
+            attempt recorded `app.state._init_error`, that message is re-raised as
+            RuntimeError without retrying. Failures of the secondary/third handlers
+            are only printed; they never raise from here.
+            """
+            h: AceStepHandler = app.state.handler
+            # Fast path: check the flags before taking the lock.
+            if getattr(app.state, "_initialized", False):
+                return
+            if getattr(app.state, "_init_error", None):
+                raise RuntimeError(app.state._init_error)
+            async with app.state._init_lock:
+                # Re-check under the lock: another caller may have finished (or failed)
+                # initialization while this one was waiting.
+                if getattr(app.state, "_initialized", False):
+                    return
+                if getattr(app.state, "_init_error", None):
+                    raise RuntimeError(app.state._init_error)
+                project_root = _get_project_root()
+                config_path = os.getenv("ACESTEP_CONFIG_PATH", "acestep-v15-turbo")
+                device = os.getenv("ACESTEP_DEVICE", "auto")
+                use_flash_attention = _env_bool("ACESTEP_USE_FLASH_ATTENTION", True)
+                offload_to_cpu = _env_bool("ACESTEP_OFFLOAD_TO_CPU", False)
+                offload_dit_to_cpu = _env_bool("ACESTEP_OFFLOAD_DIT_TO_CPU", False)
+                # Initialize primary model
+                # NOTE(review): this is a plain (non-awaited) call inside a coroutine, so it
+                # presumably blocks the event loop while the model loads — confirm.
+                # An exception raised here propagates without setting _init_error,
+                # so the next caller will attempt initialization again.
+                status_msg, ok = h.initialize_service(
+                    project_root=project_root,
+                    config_path=config_path,
+                    device=device,
+                    use_flash_attention=use_flash_attention,
+                    compile_model=False,
+                    offload_to_cpu=offload_to_cpu,
+                    offload_dit_to_cpu=offload_dit_to_cpu,
+                )
+                if not ok:
+                    # Remember the failure so later callers fail fast with the same message.
+                    app.state._init_error = status_msg
+                    raise RuntimeError(status_msg)
+                # The service counts as initialized as soon as the primary model is up;
+                # the optional models below cannot undo this.
+                app.state._initialized = True
+                # Initialize secondary model if configured
+                if app.state.handler2 and app.state._config_path2:
+                    try:
+                        status_msg2, ok2 = app.state.handler2.initialize_service(
+                            project_root=project_root,
+                            config_path=app.state._config_path2,
+                            device=device,
+                            use_flash_attention=use_flash_attention,
+                            compile_model=False,
+                            offload_to_cpu=offload_to_cpu,
+                            offload_dit_to_cpu=offload_dit_to_cpu,
+                        )
+                        app.state._initialized2 = ok2
+                        if ok2:
+                            print(f"[API Server] Secondary model loaded: {_get_model_name(app.state._config_path2)}")
+                        else:
+                            print(f"[API Server] Warning: Secondary model failed to load: {status_msg2}")
+                    except Exception as e:
+                        # Optional model: log and carry on with the primary only.
+                        print(f"[API Server] Warning: Failed to initialize secondary model: {e}")
+                        app.state._initialized2 = False
+                # Initialize third model if configured
+                if app.state.handler3 and app.state._config_path3:
+                    try:
+                        status_msg3, ok3 = app.state.handler3.initialize_service(
+                            project_root=project_root,
+                            config_path=app.state._config_path3,
+                            device=device,
+                            use_flash_attention=use_flash_attention,
+                            compile_model=False,
+                            offload_to_cpu=offload_to_cpu,
+                            offload_dit_to_cpu=offload_dit_to_cpu,
+                        )
+                        app.state._initialized3 = ok3
+                        if ok3:
+                            print(f"[API Server] Third model loaded: {_get_model_name(app.state._config_path3)}")
+                        else:
+                            print(f"[API Server] Warning: Third model failed to load: {status_msg3}")
+                    except Exception as e:
+                        # Optional model: log and carry on without it.
+                        print(f"[API Server] Warning: Failed to initialize third model: {e}")
+                        app.state._initialized3 = False
+        async def _cleanup_job_temp_files(job_id: str) -> None:
+            async with app.state.job_temp_files_lock:
+                paths = app.state.job_temp_files.pop(job_id, [])
+            for p in paths:
+                try:
+                    os.remove(p)
+                except Exception:
+                    pass
+        def _update_local_cache(job_id: str, result: Optional[Dict], status: str) -> None:
+            """Update local cache with job result"""
+            local_cache = getattr(app.state, 'local_cache', None)
+            if not local_cache:
+                return
+            rec = store.get(job_id)
+            env = getattr(rec, 'env', 'development') if rec else 'development'
+            create_time = rec.created_at if rec else time.time()
+            status_int = _map_status(status)
+            if status == "succeeded" and result:
+                audio_paths = result.get("audio_paths", [])
+                # Final prompt/lyrics (may be modified by thinking/format)
+                final_prompt = result.get("prompt", "")
+                final_lyrics = result.get("lyrics", "")
+                # Original user input from metas
+                metas_raw = result.get("metas", {}) or {}
+                original_prompt = metas_raw.get("prompt", "")
+                original_lyrics = metas_raw.get("lyrics", "")
+                # metas contains original input + other metadata
+                metas = {
+                    "bpm": metas_raw.get("bpm"),
+                    "duration": metas_raw.get("duration"),
+                    "genres": metas_raw.get("genres", ""),
+                    "keyscale": metas_raw.get("keyscale", ""),
+                    "timesignature": metas_raw.get("timesignature", ""),
+                    "prompt": original_prompt,
+                    "lyrics": original_lyrics,
+                }
+                # Extra fields for Discord bot
+                generation_info = result.get("generation_info", "")
+                seed_value = result.get("seed_value", "")
+                lm_model = result.get("lm_model", "")
+                dit_model = result.get("dit_model", "")
+                if audio_paths:
+                    result_data = [
+                        {
+                            "file": p,
+                            "wave": "",
+                            "status": status_int,
+                            "create_time": int(create_time),
+                            "env": env,
+                            "prompt": final_prompt,
+                            "lyrics": final_lyrics,
+                            "metas": metas,
+                            "generation_info": generation_info,
+                            "seed_value": seed_value,
+                            "lm_model": lm_model,
+                            "dit_model": dit_model,
+                        }
+                        for p in audio_paths
+                    ]
+                else:
+                    result_data = [{
+                        "file": "",
+                        "wave": "",
+                        "status": status_int,
+                        "create_time": int(create_time),
+                        "env": env,
+                        "prompt": final_prompt,
+                        "lyrics": final_lyrics,
+                        "metas": metas,
+                        "generation_info": generation_info,
+                        "seed_value": seed_value,
+                        "lm_model": lm_model,
+                        "dit_model": dit_model,
+                    }]
+            else:
+                result_data = [{"file": "", "wave": "", "status": status_int, "create_time": int(create_time), "env": env}]
+            result_key = f"{RESULT_KEY_PREFIX}{job_id}"
+            local_cache.set(result_key, result_data, ex=RESULT_EXPIRE_SECONDS)
+        async def _run_one_job(job_id: str, req: GenerateMusicRequest) -> None:
+            job_store: _JobStore = app.state.job_store
+            llm: LLMHandler = app.state.llm_handler
+            executor: ThreadPoolExecutor = app.state.executor
+            await _ensure_initialized()
+            job_store.mark_running(job_id)
+            # Select DiT handler based on user's model choice
+            # Default: use primary handler
+            selected_handler: AceStepHandler = app.state.handler
+            selected_model_name = _get_model_name(app.state._config_path)
+            if req.model:
+                model_matched = False
+                # Check if it matches the second model
+                if app.state.handler2 and getattr(app.state, "_initialized2", False):
+                    model2_name = _get_model_name(app.state._config_path2)
+                    if req.model == model2_name:
+                        selected_handler = app.state.handler2
+                        selected_model_name = model2_name
+                        model_matched = True
+                        print(f"[API Server] Job {job_id}: Using second model: {model2_name}")
+                # Check if it matches the third model
+                if not model_matched and app.state.handler3 and getattr(app.state, "_initialized3", False):
+                    model3_name = _get_model_name(app.state._config_path3)
+                    if req.model == model3_name:
+                        selected_handler = app.state.handler3
+                        selected_model_name = model3_name
+                        model_matched = True
+                        print(f"[API Server] Job {job_id}: Using third model: {model3_name}")
+                if not model_matched:
+                    available_models = [_get_model_name(app.state._config_path)]
+                    if app.state.handler2 and getattr(app.state, "_initialized2", False):
+                        available_models.append(_get_model_name(app.state._config_path2))
+                    if app.state.handler3 and getattr(app.state, "_initialized3", False):
+                        available_models.append(_get_model_name(app.state._config_path3))
+                    print(f"[API Server] Job {job_id}: Model '{req.model}' not found in {available_models}, using primary: {selected_model_name}")
+            # Use selected handler for generation
+            h: AceStepHandler = selected_handler
+            def _blocking_generate() -> Dict[str, Any]:
+                """Generate music using unified inference logic from acestep.inference"""
+                def _ensure_llm_ready() -> None:
+                    """Ensure LLM handler is initialized when needed"""
+                    with app.state._llm_init_lock:
+                        initialized = getattr(app.state, "_llm_initialized", False)
+                        had_error = getattr(app.state, "_llm_init_error", None)
+                        if initialized or had_error is not None:
+                            return
+                        project_root = _get_project_root()
+                        checkpoint_dir = os.path.join(project_root, "checkpoints")
+                        lm_model_path = (req.lm_model_path or os.getenv("ACESTEP_LM_MODEL_PATH") or "acestep-5Hz-lm-0.6B").strip()
+                        backend = (req.lm_backend or os.getenv("ACESTEP_LM_BACKEND") or "vllm").strip().lower()
+                        if backend not in {"vllm", "pt"}:
+                            backend = "vllm"
+                        lm_device = os.getenv("ACESTEP_LM_DEVICE", os.getenv("ACESTEP_DEVICE", "auto"))
+                        lm_offload = _env_bool("ACESTEP_LM_OFFLOAD_TO_CPU", False)
+                        status, ok = llm.initialize(
+                            checkpoint_dir=checkpoint_dir,
+                            lm_model_path=lm_model_path,
+                            backend=backend,
+                            device=lm_device,
+                            offload_to_cpu=lm_offload,
+                            dtype=h.dtype,
+                        )
+                        if not ok:
+                            app.state._llm_init_error = status
+                        else:
+                            app.state._llm_initialized = True
+                def _normalize_metas(meta: Dict[str, Any]) -> Dict[str, Any]:
+                    """Ensure a stable `metas` dict (keys always present)."""
+                    meta = meta or {}
+                    out: Dict[str, Any] = dict(meta)
+                    # Normalize key aliases
+                    if "keyscale" not in out and "key_scale" in out:
+                        out["keyscale"] = out.get("key_scale")
+                    if "timesignature" not in out and "time_signature" in out:
+                        out["timesignature"] = out.get("time_signature")
+                    # Ensure required keys exist
+                    for k in ["bpm", "duration", "genres", "keyscale", "timesignature"]:
+                        if out.get(k) in (None, ""):
+                            out[k] = "N/A"
+                    return out
+                # Normalize LM sampling parameters
+                lm_top_k = req.lm_top_k if req.lm_top_k and req.lm_top_k > 0 else 0
+                lm_top_p = req.lm_top_p if req.lm_top_p and req.lm_top_p < 1.0 else 0.9
+                # Determine if LLM is needed
+                thinking = bool(req.thinking)
+                sample_mode = bool(req.sample_mode)
+                has_sample_query = bool(req.sample_query and req.sample_query.strip())
+                use_format = bool(req.use_format)
+                use_cot_caption = bool(req.use_cot_caption)
+                use_cot_language = bool(req.use_cot_language)
+                # LLM is needed for:
+                # - thinking mode (LM generates audio codes)
+                # - sample_mode (LM generates random caption/lyrics/metas)
+                # - sample_query/description (LM generates from description)
+                # - use_format (LM enhances caption/lyrics)
+                # - use_cot_caption or use_cot_language (LM enhances metadata)
+                need_llm = thinking or sample_mode or has_sample_query or use_format or use_cot_caption or use_cot_language
+                # Ensure LLM is ready if needed
+                if need_llm:
+                    _ensure_llm_ready()
+                    if getattr(app.state, "_llm_init_error", None):
+                        raise RuntimeError(f"5Hz LM init failed: {app.state._llm_init_error}")
+                # Handle sample mode or description: generate caption/lyrics/metas via LM
+                caption = req.prompt
+                lyrics = req.lyrics
+                bpm = req.bpm
+                key_scale = req.key_scale
+                time_signature = req.time_signature
+                audio_duration = req.audio_duration
+                # Save original user input for metas
+                original_prompt = req.prompt or ""
+                original_lyrics = req.lyrics or ""
+                if sample_mode or has_sample_query:
+                    if has_sample_query:
+                        # Use create_sample() with description query
+                        parsed_language, parsed_instrumental = _parse_description_hints(req.sample_query)
+                        # Determine vocal_language with priority:
+                        # 1. User-specified vocal_language (if not default "en")
+                        # 2. Language parsed from description
+                        # 3. None (no constraint)
+                        if req.vocal_language and req.vocal_language not in ("en", "unknown", ""):
+                            sample_language = req.vocal_language
+                        else:
+                            sample_language = parsed_language
+                        sample_result = create_sample(
+                            llm_handler=llm,
+                            query=req.sample_query,
+                            instrumental=parsed_instrumental,
+                            vocal_language=sample_language,
+                            temperature=req.lm_temperature,
+                            top_k=lm_top_k if lm_top_k > 0 else None,
+                            top_p=lm_top_p if lm_top_p < 1.0 else None,
+                            use_constrained_decoding=req.constrained_decoding,
+                        )
+                        if not sample_result.success:
+                            raise RuntimeError(f"create_sample failed: {sample_result.error or sample_result.status_message}")
+                        # Use generated sample data
+                        caption = sample_result.caption
+                        lyrics = sample_result.lyrics
+                        bpm = sample_result.bpm
+                        key_scale = sample_result.keyscale
+                        time_signature = sample_result.timesignature
+                        audio_duration = sample_result.duration
+                    else:
+                        # Original sample_mode behavior: random generation
+                        sample_metadata, sample_status = llm.understand_audio_from_codes(
+                            audio_codes="NO USER INPUT",
+                            temperature=req.lm_temperature,
+                            top_k=lm_top_k if lm_top_k > 0 else None,
+                            top_p=lm_top_p if lm_top_p < 1.0 else None,
+                            repetition_penalty=req.lm_repetition_penalty,
+                            use_constrained_decoding=req.constrained_decoding,
+                            constrained_decoding_debug=req.constrained_decoding_debug,
+                        )
+                        if not sample_metadata or str(sample_status).startswith("❌"):
+                            raise RuntimeError(f"Sample generation failed: {sample_status}")
+                        # Use generated values with fallback defaults
+                        caption = sample_metadata.get("caption", "")
+                        lyrics = sample_metadata.get("lyrics", "")
+                        bpm = _to_int(sample_metadata.get("bpm"), None) or _to_int(os.getenv("ACESTEP_SAMPLE_DEFAULT_BPM", "120"), 120)
+                        key_scale = sample_metadata.get("keyscale", "") or os.getenv("ACESTEP_SAMPLE_DEFAULT_KEY", "C Major")
+                        time_signature = sample_metadata.get("timesignature", "") or os.getenv("ACESTEP_SAMPLE_DEFAULT_TIMESIGNATURE", "4/4")
+                        audio_duration = _to_float(sample_metadata.get("duration"), None) or _to_float(os.getenv("ACESTEP_SAMPLE_DEFAULT_DURATION_SECONDS", "120"), 120.0)
+                # Apply format_sample() if use_format is True and caption/lyrics are provided
+                format_has_duration = False
+                if req.use_format and (caption or lyrics):
+                    _ensure_llm_ready()
+                    if getattr(app.state, "_llm_init_error", None):
+                        raise RuntimeError(f"5Hz LM init failed (needed for format): {app.state._llm_init_error}")
+                    # Build user_metadata from request params (matching bot.py behavior)
+                    user_metadata_for_format = {}
+                    if bpm is not None:
+                        user_metadata_for_format['bpm'] = bpm
+                    if audio_duration is not None and audio_duration > 0:
+                        user_metadata_for_format['duration'] = int(audio_duration)
+                    if key_scale:
+                        user_metadata_for_format['keyscale'] = key_scale
+                    if time_signature:
+                        user_metadata_for_format['timesignature'] = time_signature
+                    if req.vocal_language and req.vocal_language != "unknown":
+                        user_metadata_for_format['language'] = req.vocal_language
+                    format_result = format_sample(
+                        llm_handler=llm,
+                        caption=caption,
+                        lyrics=lyrics,
+                        user_metadata=user_metadata_for_format if user_metadata_for_format else None,
+                        temperature=req.lm_temperature,
+                        top_k=lm_top_k if lm_top_k > 0 else None,
+                        top_p=lm_top_p if lm_top_p < 1.0 else None,
+                        use_constrained_decoding=req.constrained_decoding,
+                    )
+                    if format_result.success:
+                        # Extract all formatted data (matching bot.py behavior)
+                        caption = format_result.caption or caption
+                        lyrics = format_result.lyrics or lyrics
+                        if format_result.duration:
+                            audio_duration = format_result.duration
+                            format_has_duration = True
+                        if format_result.bpm:
+                            bpm = format_result.bpm
+                        if format_result.keyscale:
+                            key_scale = format_result.keyscale
+                        if format_result.timesignature:
+                            time_signature = format_result.timesignature
+                # Parse timesteps string to list of floats if provided
+                parsed_timesteps = _parse_timesteps(req.timesteps)
+                # Determine actual inference steps (timesteps override inference_steps)
+                actual_inference_steps = len(parsed_timesteps) if parsed_timesteps else req.inference_steps
+                # Auto-select instruction based on task_type if user didn't provide custom instruction
+                # This matches gradio behavior which uses TASK_INSTRUCTIONS for each task type
+                instruction_to_use = req.instruction
+                if instruction_to_use == DEFAULT_DIT_INSTRUCTION and req.task_type in TASK_INSTRUCTIONS:
+                    instruction_to_use = TASK_INSTRUCTIONS[req.task_type]
+                # Build GenerationParams using unified interface
+                # Note: thinking controls LM code generation, sample_mode only affects CoT metas
+                params = GenerationParams(
+                    task_type=req.task_type,
+                    instruction=instruction_to_use,
+                    reference_audio=req.reference_audio_path,
+                    src_audio=req.src_audio_path,
+                    audio_codes=req.audio_code_string,
+                    caption=caption,
+                    lyrics=lyrics,
+                    instrumental=False,
+                    vocal_language=req.vocal_language,
+                    bpm=bpm,
+                    keyscale=key_scale,
+                    timesignature=time_signature,
+                    duration=audio_duration if audio_duration else -1.0,
+                    inference_steps=req.inference_steps,
+                    seed=req.seed,
+                    guidance_scale=req.guidance_scale,
+                    use_adg=req.use_adg,
+                    cfg_interval_start=req.cfg_interval_start,
+                    cfg_interval_end=req.cfg_interval_end,
+                    shift=req.shift,
+                    infer_method=req.infer_method,
+                    timesteps=parsed_timesteps,
+                    repainting_start=req.repainting_start,
+                    repainting_end=req.repainting_end if req.repainting_end else -1,
+                    audio_cover_strength=req.audio_cover_strength,
+                    # LM parameters
+                    thinking=thinking,  # Use LM for code generation when thinking=True
+                    lm_temperature=req.lm_temperature,
+                    lm_cfg_scale=req.lm_cfg_scale,
+                    lm_top_k=lm_top_k,
+                    lm_top_p=lm_top_p,
+                    lm_negative_prompt=req.lm_negative_prompt,
+                    # use_cot_metas logic:
+                    # - sample_mode: metas already generated, skip Phase 1
+                    # - format with duration: metas already generated, skip Phase 1
+                    # - format without duration: need Phase 1 to generate duration
+                    # - no format: need Phase 1 to generate all metas
+                    use_cot_metas=not sample_mode and not format_has_duration,
+                    use_cot_caption=req.use_cot_caption,
+                    use_cot_language=req.use_cot_language,
+                    use_constrained_decoding=req.constrained_decoding,
+                )
+                # Build GenerationConfig - default to 2 audios like gradio_ui
+                batch_size = req.batch_size if req.batch_size is not None else 2
+                config = GenerationConfig(
+                    batch_size=batch_size,
+                    use_random_seed=req.use_random_seed,
+                    seeds=None,  # Let unified logic handle seed generation
+                    audio_format=req.audio_format,
+                    constrained_decoding_debug=req.constrained_decoding_debug,
+                )
+                # Check LLM initialization status
+                llm_is_initialized = getattr(app.state, "_llm_initialized", False)
+                llm_to_pass = llm if llm_is_initialized else None
+                # Generate music using unified interface
+                result = generate_music(
+                    dit_handler=h,
+                    llm_handler=llm_to_pass,
+                    params=params,
+                    config=config,
+                    save_dir=app.state.temp_audio_dir,
+                    progress=None,
+                )
+                if not result.success:
+                    raise RuntimeError(f"Music generation failed: {result.error or result.status_message}")
+                # Extract results
+                audio_paths = [audio["path"] for audio in result.audios if audio.get("path")]
+                first_audio = audio_paths[0] if len(audio_paths) > 0 else None
+                second_audio = audio_paths[1] if len(audio_paths) > 1 else None
+                # Get metadata from LM or CoT results
+                lm_metadata = result.extra_outputs.get("lm_metadata", {})
+                metas_out = _normalize_metas(lm_metadata)
+                # Update metas with actual values used
+                if params.cot_bpm:
+                    metas_out["bpm"] = params.cot_bpm
+                elif bpm:
+                    metas_out["bpm"] = bpm
+                if params.cot_duration:
+                    metas_out["duration"] = params.cot_duration
+                elif audio_duration:
+                    metas_out["duration"] = audio_duration
+                if params.cot_keyscale:
+                    metas_out["keyscale"] = params.cot_keyscale
+                elif key_scale:
+                    metas_out["keyscale"] = key_scale
+                if params.cot_timesignature:
+                    metas_out["timesignature"] = params.cot_timesignature
+                elif time_signature:
+                    metas_out["timesignature"] = time_signature
+                # Store original user input in metas (not the final/modified values)
+                metas_out["prompt"] = original_prompt
+                metas_out["lyrics"] = original_lyrics
+                # Extract seed values for response (comma-separated for multiple audios)
+                seed_values = []
+                for audio in result.audios:
+                    audio_params = audio.get("params", {})
+                    seed = audio_params.get("seed")
+                    if seed is not None:
+                        seed_values.append(str(seed))
+                seed_value = ",".join(seed_values) if seed_values else ""
+                # Build generation_info using the helper function (like gradio_ui)
+                time_costs = result.extra_outputs.get("time_costs", {})
+                generation_info = _build_generation_info(
+                    lm_metadata=lm_metadata,
+                    time_costs=time_costs,
+                    seed_value=seed_value,
+                    inference_steps=req.inference_steps,
+                    num_audios=len(result.audios),
+                )
+                def _none_if_na_str(v: Any) -> Optional[str]:
+                    if v is None:
+                        return None
+                    s = str(v).strip()
+                    if s in {"", "N/A"}:
+                        return None
+                    return s
+                # Get model information
+                lm_model_name = os.getenv("ACESTEP_LM_MODEL_PATH", "acestep-5Hz-lm-0.6B")
+                # Use selected_model_name (set at the beginning of _run_one_job)
+                dit_model_name = selected_model_name
+                return {
+                    "first_audio_path": _path_to_audio_url(first_audio) if first_audio else None,
+                    "second_audio_path": _path_to_audio_url(second_audio) if second_audio else None,
+                    "audio_paths": [_path_to_audio_url(p) for p in audio_paths],
+                    "generation_info": generation_info,
+                    "status_message": result.status_message,
+                    "seed_value": seed_value,
+                    # Final prompt/lyrics (may be modified by thinking/format)
+                    "prompt": caption or "",
+                    "lyrics": lyrics or "",
+                    # metas contains original user input + other metadata
+                    "metas": metas_out,
+                    "bpm": metas_out.get("bpm") if isinstance(metas_out.get("bpm"), int) else None,
+                    "duration": metas_out.get("duration") if isinstance(metas_out.get("duration"), (int, float)) else None,
+                    "genres": _none_if_na_str(metas_out.get("genres")),
+                    "keyscale": _none_if_na_str(metas_out.get("keyscale")),
+                    "timesignature": _none_if_na_str(metas_out.get("timesignature")),
+                    "lm_model": lm_model_name,
+                    "dit_model": dit_model_name,
+                }
+            t0 = time.time()
+            try:
+                loop = asyncio.get_running_loop()
+                result = await loop.run_in_executor(executor, _blocking_generate)
+                job_store.mark_succeeded(job_id, result)
+                # Update local cache
+                _update_local_cache(job_id, result, "succeeded")
+            except Exception:
+                job_store.mark_failed(job_id, traceback.format_exc())
+                # Update local cache
+                _update_local_cache(job_id, None, "failed")
+            finally:
+                dt = max(0.0, time.time() - t0)
+                async with app.state.stats_lock:
+                    app.state.recent_durations.append(dt)
+                    if app.state.recent_durations:
+                        app.state.avg_job_seconds = sum(app.state.recent_durations) / len(app.state.recent_durations)
+        async def _queue_worker(worker_idx: int) -> None:
+            while True:
+                job_id, req = await app.state.job_queue.get()
+                try:
+                    async with app.state.pending_lock:
+                        try:
+                            app.state.pending_ids.remove(job_id)
+                        except ValueError:
+                            pass
+                    await _run_one_job(job_id, req)
+                finally:
+                    await _cleanup_job_temp_files(job_id)
+                    app.state.job_queue.task_done()
+        worker_count = max(1, WORKER_COUNT)
+        workers = [asyncio.create_task(_queue_worker(i)) for i in range(worker_count)]
+        app.state.worker_tasks = workers
+        try:
+            yield
+        finally:
+            for t in workers:
+                t.cancel()
+            executor.shutdown(wait=False, cancel_futures=True)
+    app = FastAPI(title="ACE-Step API", version="1.0", lifespan=lifespan)
+    async def _queue_position(job_id: str) -> int:
+        async with app.state.pending_lock:
+            try:
+                return list(app.state.pending_ids).index(job_id) + 1
+            except ValueError:
+                return 0
+    async def _eta_seconds_for_position(pos: int) -> Optional[float]:
+        if pos <= 0:
+            return None
+        async with app.state.stats_lock:
+            avg = float(getattr(app.state, "avg_job_seconds", INITIAL_AVG_JOB_SECONDS))
+        return pos * avg
+    @app.post("/release_task", response_model=CreateJobResponse)
+    async def create_music_generate_job(request: Request) -> CreateJobResponse:
+        content_type = (request.headers.get("content-type") or "").lower()
+        temp_files: list[str] = []
+        def _build_request(p: RequestParser, **kwargs) -> GenerateMusicRequest:
+            """Build GenerateMusicRequest from parsed parameters."""
+            return GenerateMusicRequest(
+                prompt=p.str("prompt"),
+                lyrics=p.str("lyrics"),
+                thinking=p.bool("thinking"),
+                sample_mode=p.bool("sample_mode"),
+                sample_query=p.str("sample_query"),
+                use_format=p.bool("use_format"),
+                model=p.str("model") or None,
+                bpm=p.int("bpm"),
+                key_scale=p.str("key_scale"),
+                time_signature=p.str("time_signature"),
+                audio_duration=p.float("audio_duration"),
+                vocal_language=p.str("vocal_language", "en"),
+                inference_steps=p.int("inference_steps", 8),
+                guidance_scale=p.float("guidance_scale", 7.0),
+                use_random_seed=p.bool("use_random_seed", True),
+                seed=p.int("seed", -1),
+                batch_size=p.int("batch_size"),
+                audio_code_string=p.str("audio_code_string"),
+                repainting_start=p.float("repainting_start", 0.0),
+                repainting_end=p.float("repainting_end"),
+                instruction=p.str("instruction", DEFAULT_DIT_INSTRUCTION),
+                audio_cover_strength=p.float("audio_cover_strength", 1.0),
+                task_type=p.str("task_type", "text2music"),
+                use_adg=p.bool("use_adg"),
+                cfg_interval_start=p.float("cfg_interval_start", 0.0),
+                cfg_interval_end=p.float("cfg_interval_end", 1.0),
+                infer_method=p.str("infer_method", "ode"),
+                shift=p.float("shift", 3.0),
+                audio_format=p.str("audio_format", "mp3"),
+                use_tiled_decode=p.bool("use_tiled_decode", True),
+                lm_model_path=p.str("lm_model_path") or None,
+                lm_backend=p.str("lm_backend", "vllm"),
+                lm_temperature=p.float("lm_temperature", LM_DEFAULT_TEMPERATURE),
+                lm_cfg_scale=p.float("lm_cfg_scale", LM_DEFAULT_CFG_SCALE),
+                lm_top_k=p.int("lm_top_k"),
+                lm_top_p=p.float("lm_top_p", LM_DEFAULT_TOP_P),
+                lm_repetition_penalty=p.float("lm_repetition_penalty", 1.0),
+                lm_negative_prompt=p.str("lm_negative_prompt", "NO USER INPUT"),
+                constrained_decoding=p.bool("constrained_decoding", True),
+                constrained_decoding_debug=p.bool("constrained_decoding_debug"),
+                use_cot_caption=p.bool("use_cot_caption", True),
+                use_cot_language=p.bool("use_cot_language", True),
+                is_format_caption=p.bool("is_format_caption"),
+                **kwargs,
+            )
+        if content_type.startswith("application/json"):
+            body = await request.json()
+            if not isinstance(body, dict):
+                raise HTTPException(status_code=400, detail="JSON payload must be an object")
+            req = _build_request(RequestParser(body))
+        elif content_type.endswith("+json"):
+            body = await request.json()
+            if not isinstance(body, dict):
+                raise HTTPException(status_code=400, detail="JSON payload must be an object")
+            req = _build_request(RequestParser(body))
+        elif content_type.startswith("multipart/form-data"):
+            form = await request.form()
+            ref_up = form.get("reference_audio")
+            src_up = form.get("src_audio")
+            reference_audio_path = None
+            src_audio_path = None
+            if isinstance(ref_up, StarletteUploadFile):
+                reference_audio_path = await _save_upload_to_temp(ref_up, prefix="reference_audio")
+                temp_files.append(reference_audio_path)
+            else:
+                reference_audio_path = str(form.get("reference_audio_path") or "").strip() or None
+            if isinstance(src_up, StarletteUploadFile):
+                src_audio_path = await _save_upload_to_temp(src_up, prefix="src_audio")
+                temp_files.append(src_audio_path)
+            else:
+                src_audio_path = str(form.get("src_audio_path") or "").strip() or None
+            req = _build_request(
+                RequestParser(dict(form)),
+                reference_audio_path=reference_audio_path,
+                src_audio_path=src_audio_path,
+            )
+        elif content_type.startswith("application/x-www-form-urlencoded"):
+            form = await request.form()
+            reference_audio_path = str(form.get("reference_audio_path") or "").strip() or None
+            src_audio_path = str(form.get("src_audio_path") or "").strip() or None
+            req = _build_request(
+                RequestParser(dict(form)),
+                reference_audio_path=reference_audio_path,
+                src_audio_path=src_audio_path,
+            )
+        else:
+            raw = await request.body()
+            raw_stripped = raw.lstrip()
+            # Best-effort: accept missing/incorrect Content-Type if payload is valid JSON.
+            if raw_stripped.startswith(b"{") or raw_stripped.startswith(b"["):
+                try:
+                    body = json.loads(raw.decode("utf-8"))
+                    if isinstance(body, dict):
+                        req = _build_request(RequestParser(body))
+                    else:
+                        raise HTTPException(status_code=400, detail="JSON payload must be an object")
+                except HTTPException:
+                    raise
+                except Exception:
+                    raise HTTPException(
+                        status_code=400,
+                        detail="Invalid JSON body (hint: set 'Content-Type: application/json')",
+                    )
+            # Best-effort: parse key=value bodies even if Content-Type is missing.
+            elif raw_stripped and b"=" in raw:
+                parsed = urllib.parse.parse_qs(raw.decode("utf-8"), keep_blank_values=True)
+                flat = {k: (v[0] if isinstance(v, list) and v else v) for k, v in parsed.items()}
+                reference_audio_path = str(flat.get("reference_audio_path") or "").strip() or None
+                src_audio_path = str(flat.get("src_audio_path") or "").strip() or None
+                req = _build_request(
+                    RequestParser(flat),
+                    reference_audio_path=reference_audio_path,
+                    src_audio_path=src_audio_path,
+                )
+            else:
+                raise HTTPException(
+                    status_code=415,
+                    detail=(
+                        f"Unsupported Content-Type: {content_type or '(missing)'}; "
+                        "use application/json, application/x-www-form-urlencoded, or multipart/form-data"
+                    ),
+                )
+        rec = store.create()
+        q: asyncio.Queue = app.state.job_queue
+        if q.full():
+            for p in temp_files:
+                try:
+                    os.remove(p)
+                except Exception:
+                    pass
+            raise HTTPException(status_code=429, detail="Server busy: queue is full")
+        if temp_files:
+            async with app.state.job_temp_files_lock:
+                app.state.job_temp_files[rec.job_id] = temp_files
+        async with app.state.pending_lock:
+            app.state.pending_ids.append(rec.job_id)
+            position = len(app.state.pending_ids)
+        await q.put((rec.job_id, req))
+        return CreateJobResponse(task_id=rec.job_id, status="queued", queue_position=position)
+    @app.post("/v1/music/random", response_model=CreateJobResponse)
+    async def create_random_sample_job(request: Request) -> CreateJobResponse:
+        """Create a sample-mode job that auto-generates caption/lyrics via LM."""
+        thinking_value: Any = None
+        content_type = (request.headers.get("content-type") or "").lower()
+        body_dict: Dict[str, Any] = {}
+        if "json" in content_type:
+            try:
+                payload = await request.json()
+                if isinstance(payload, dict):
+                    body_dict = payload
+            except Exception:
+                body_dict = {}
+        if not body_dict and request.query_params:
+            body_dict = dict(request.query_params)
+        thinking_value = body_dict.get("thinking")
+        if thinking_value is None:
+            thinking_value = body_dict.get("Thinking")
+        thinking_flag = _to_bool(thinking_value, True)
+        req = GenerateMusicRequest(
+            caption="",
+            lyrics="",
+            thinking=thinking_flag,
+            sample_mode=True,
+        )
+        rec = store.create()
+        q: asyncio.Queue = app.state.job_queue
+        if q.full():
+            raise HTTPException(status_code=429, detail="Server busy: queue is full")
+        async with app.state.pending_lock:
+            app.state.pending_ids.append(rec.job_id)
+            position = len(app.state.pending_ids)
+        await q.put((rec.job_id, req))
+        return CreateJobResponse(task_id=rec.job_id, status="queued", queue_position=position)
+    @app.post("/query_result")
+    async def query_result(request: Request) -> List[Dict[str, Any]]:
+        """Batch query job results"""
+        content_type = (request.headers.get("content-type") or "").lower()
+        if "json" in content_type:
+            body = await request.json()
+        else:
+            form = await request.form()
+            body = {k: v for k, v in form.items()}
+        task_id_list_str = body.get("task_id_list", "[]")
+        # Parse task ID list
+        if isinstance(task_id_list_str, list):
+            task_id_list = task_id_list_str
+        else:
+            try:
+                task_id_list = json.loads(task_id_list_str)
+            except Exception:
+                task_id_list = []
+        local_cache = getattr(app.state, 'local_cache', None)
+        data_list = []
+        current_time = time.time()
+        for task_id in task_id_list:
+            result_key = f"{RESULT_KEY_PREFIX}{task_id}"
+            # Read from local cache first
+            if local_cache:
+                data = local_cache.get(result_key)
+                if data:
+                    try:
+                        data_json = json.loads(data)
+                    except Exception:
+                        data_json = []
+                    if len(data_json) <= 0:
+                        data_list.append({"task_id": task_id, "result": data, "status": 2})
+                    else:
+                        status = data_json[0].get("status")
+                        create_time = data_json[0].get("create_time", 0)
+                        if status == 0 and (current_time - create_time) > TASK_TIMEOUT_SECONDS:
+                            data_list.append({"task_id": task_id, "result": data, "status": 2})
+                        else:
+                            data_list.append({
+                                "task_id": task_id,
+                                "result": data,
+                                "status": int(status) if status is not None else 1,
+                            })
+                    continue
+            # Fallback to job_store query
+            rec = store.get(task_id)
+            if rec:
+                env = getattr(rec, 'env', 'development')
+                create_time = rec.created_at
+                status_int = _map_status(rec.status)
+                if rec.result and rec.status == "succeeded":
+                    audio_paths = rec.result.get("audio_paths", [])
+                    metas = rec.result.get("metas", {}) or {}
+                    result_data = [
+                        {
+                            "file": p, "wave": "", "status": status_int,
+                            "create_time": int(create_time), "env": env,
+                            "prompt": metas.get("caption", ""),
+                            "lyrics": metas.get("lyrics", ""),
+                            "metas": {
+                                "bpm": metas.get("bpm"),
+                                "duration": metas.get("duration"),
+                                "genres": metas.get("genres", ""),
+                                "keyscale": metas.get("keyscale", ""),
+                                "timesignature": metas.get("timesignature", ""),
+                            }
+                        }
+                        for p in audio_paths
+                    ] if audio_paths else [{
+                        "file": "", "wave": "", "status": status_int,
+                        "create_time": int(create_time), "env": env,
+                        "prompt": metas.get("caption", ""),
+                        "lyrics": metas.get("lyrics", ""),
+                        "metas": {
+                            "bpm": metas.get("bpm"),
+                            "duration": metas.get("duration"),
+                            "genres": metas.get("genres", ""),
+                            "keyscale": metas.get("keyscale", ""),
+                            "timesignature": metas.get("timesignature", ""),
+                        }
+                    }]
+                else:
+                    result_data = [{
+                        "file": "", "wave": "", "status": status_int,
+                        "create_time": int(create_time), "env": env,
+                        "prompt": "", "lyrics": "",
+                        "metas": {}
+                    }]
+                data_list.append({
+                    "task_id": task_id,
+                    "result": json.dumps(result_data, ensure_ascii=False),
+                    "status": status_int,
+                })
+            else:
+                data_list.append({"task_id": task_id, "result": "[]", "status": 0})
+        return data_list
+    @app.get("/health")
+    async def health_check():
+        """Health check endpoint for service status."""
+        return {
+            "status": "ok",
+            "service": "ACE-Step API",
+            "version": "1.0",
+        }
+    @app.get("/v1/models")
+    async def list_models():
+        """List available DiT models."""
+        models = []
+        # Primary model (always available if initialized)
+        if getattr(app.state, "_initialized", False):
+            primary_model = _get_model_name(app.state._config_path)
+            if primary_model:
+                models.append({
+                    "name": primary_model,
+                    "is_default": True,
+                })
+        # Secondary model
+        if getattr(app.state, "_initialized2", False) and app.state._config_path2:
+            secondary_model = _get_model_name(app.state._config_path2)
+            if secondary_model:
+                models.append({
+                    "name": secondary_model,
+                    "is_default": False,
+                })
+        # Third model
+        if getattr(app.state, "_initialized3", False) and app.state._config_path3:
+            third_model = _get_model_name(app.state._config_path3)
+            if third_model:
+                models.append({
+                    "name": third_model,
+                    "is_default": False,
+                })
+        return {
+            "models": models,
+            "default_model": models[0]["name"] if models else None,
+        }
+    @app.get("/v1/audio")
+    async def get_audio(path: str):
+        """Serve audio file by path."""
+        from fastapi.responses import FileResponse
+        if not os.path.exists(path):
+            raise HTTPException(status_code=404, detail=f"Audio file not found: {path}")
+        ext = os.path.splitext(path)[1].lower()
+        media_types = {
+            ".mp3": "audio/mpeg",
+            ".wav": "audio/wav",
+            ".flac": "audio/flac",
+            ".ogg": "audio/ogg",
+        }
+        media_type = media_types.get(ext, "audio/mpeg")
+        return FileResponse(path, media_type=media_type)
+    return app
+app = create_app()
+def main() -> None:
+    import argparse
+    import uvicorn
+    parser = argparse.ArgumentParser(description="ACE-Step API server")
+    parser.add_argument(
+        "--host",
+        default=os.getenv("ACESTEP_API_HOST", "127.0.0.1"),
+        help="Bind host (default from ACESTEP_API_HOST or 127.0.0.1)",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=int(os.getenv("ACESTEP_API_PORT", "8001")),
+        help="Bind port (default from ACESTEP_API_PORT or 8001)",
+    )
+    args = parser.parse_args()
+    # IMPORTANT: in-memory queue/store -> workers MUST be 1
+    uvicorn.run(
+        "acestep.api_server:app",
+        host=str(args.host),
+        port=int(args.port),
+        reload=False,
+        workers=1,
+    )
+if __name__ == "__main__":
+    main()

acestep/audio_utils.py ADDED Viewed

	@@ -0,0 +1,378 @@

+"""
+Audio saving and transcoding utility module
+Independent audio file operations outside of handler, supporting:
+- Save audio tensor/numpy to files (default FLAC format, fast)
+- Format conversion (FLAC/WAV/MP3)
+- Batch processing
+"""
+import os
+# Disable torchcodec backend to avoid CUDA dependency issues on HuggingFace Space
+# This forces torchaudio to use ffmpeg/sox/soundfile backends instead
+os.environ["TORCHAUDIO_USE_TORCHCODEC"] = "0"
+import hashlib
+import json
+from pathlib import Path
+from typing import Union, Optional, List, Tuple
+import torch
+import numpy as np
+import torchaudio
+from loguru import logger
+class AudioSaver:
+    """Audio saving and transcoding utility class"""
+    def __init__(self, default_format: str = "flac"):
+        """
+        Initialize audio saver
+        Args:
+            default_format: Default save format ('flac', 'wav', 'mp3')
+        """
+        self.default_format = default_format.lower()
+        if self.default_format not in ["flac", "wav", "mp3"]:
+            logger.warning(f"Unsupported format {default_format}, using 'flac'")
+            self.default_format = "flac"
+    def save_audio(
+        self,
+        audio_data: Union[torch.Tensor, np.ndarray],
+        output_path: Union[str, Path],
+        sample_rate: int = 48000,
+        format: Optional[str] = None,
+        channels_first: bool = True,
+    ) -> str:
+        """
+        Save audio data to file
+        Args:
+            audio_data: Audio data, torch.Tensor [channels, samples] or numpy.ndarray
+            output_path: Output file path (extension can be omitted)
+            sample_rate: Sample rate
+            format: Audio format ('flac', 'wav', 'mp3'), defaults to default_format
+            channels_first: If True, tensor format is [channels, samples], else [samples, channels]
+        Returns:
+            Actual saved file path
+        """
+        format = (format or self.default_format).lower()
+        if format not in ["flac", "wav", "mp3"]:
+            logger.warning(f"Unsupported format {format}, using {self.default_format}")
+            format = self.default_format
+        # Ensure output path has correct extension
+        output_path = Path(output_path)
+        if output_path.suffix.lower() not in ['.flac', '.wav', '.mp3']:
+            output_path = output_path.with_suffix(f'.{format}')
+        # Convert to torch tensor
+        if isinstance(audio_data, np.ndarray):
+            if channels_first:
+                # numpy [samples, channels] -> tensor [channels, samples]
+                audio_tensor = torch.from_numpy(audio_data.T).float()
+            else:
+                # numpy [samples, channels] -> tensor [samples, channels] -> [channels, samples]
+                audio_tensor = torch.from_numpy(audio_data).float()
+                if audio_tensor.dim() == 2 and audio_tensor.shape[0] < audio_tensor.shape[1]:
+                    audio_tensor = audio_tensor.T
+        else:
+            # torch tensor
+            audio_tensor = audio_data.cpu().float()
+            if not channels_first and audio_tensor.dim() == 2:
+                # [samples, channels] -> [channels, samples]
+                if audio_tensor.shape[0] > audio_tensor.shape[1]:
+                    audio_tensor = audio_tensor.T
+        # Ensure memory is contiguous
+        audio_tensor = audio_tensor.contiguous()
+        # Select backend and save
+        try:
+            if format == "mp3":
+                # MP3 uses ffmpeg backend
+                torchaudio.save(
+                    str(output_path),
+                    audio_tensor,
+                    sample_rate,
+                    channels_first=True,
+                    backend='ffmpeg',
+                )
+            elif format in ["flac", "wav"]:
+                # FLAC and WAV use soundfile backend (fastest)
+                torchaudio.save(
+                    str(output_path),
+                    audio_tensor,
+                    sample_rate,
+                    channels_first=True,
+                    backend='soundfile',
+                )
+            else:
+                # Other formats use default backend
+                torchaudio.save(
+                    str(output_path),
+                    audio_tensor,
+                    sample_rate,
+                    channels_first=True,
+                )
+            logger.debug(f"[AudioSaver] Saved audio to {output_path} ({format}, {sample_rate}Hz)")
+            return str(output_path)
+        except Exception as e:
+            try:
+                import soundfile as sf
+                audio_np = audio_tensor.transpose(0, 1).numpy()  # -> [samples, channels]
+                sf.write(str(output_path), audio_np, sample_rate, format=format.upper())
+                logger.debug(f"[AudioSaver] Fallback soundfile Saved audio to {output_path} ({format}, {sample_rate}Hz)")
+                return str(output_path)
+            except Exception as e:
+                logger.error(f"[AudioSaver] Failed to save audio: {e}")
+                raise
+    def _load_audio_file(self, audio_file: Union[str, Path]) -> Tuple[torch.Tensor, int]:
+        """
+        Load audio file with ffmpeg backend, fallback to soundfile if failed.
+        This handles CUDA dependency issues with torchcodec on HuggingFace Space.
+        Args:
+            audio_file: Path to the audio file
+        Returns:
+            Tuple of (audio_tensor, sample_rate)
+        Raises:
+            FileNotFoundError: If the audio file doesn't exist
+            Exception: If all methods fail to load the audio
+        """
+        audio_file = str(audio_file)
+        # Check if file exists first
+        if not Path(audio_file).exists():
+            raise FileNotFoundError(f"Audio file not found: {audio_file}")
+        # Try torchaudio with explicit ffmpeg backend first
+        try:
+            audio, sr = torchaudio.load(audio_file, backend="ffmpeg")
+            return audio, sr
+        except Exception as e:
+            logger.debug(f"[AudioSaver._load_audio_file] ffmpeg backend failed: {e}, trying soundfile fallback")
+        # Fallback: use soundfile directly (most compatible)
+        try:
+            import soundfile as sf
+            audio_np, sr = sf.read(audio_file)
+            # soundfile returns [samples, channels] or [samples], convert to [channels, samples]
+            audio = torch.from_numpy(audio_np).float()
+            if audio.dim() == 1:
+                # Mono: [samples] -> [1, samples]
+                audio = audio.unsqueeze(0)
+            else:
+                # Stereo: [samples, channels] -> [channels, samples]
+                audio = audio.T
+            return audio, sr
+        except Exception as e:
+            logger.error(f"[AudioSaver._load_audio_file] All methods failed to load audio: {audio_file}, error: {e}")
+            raise
+    def convert_audio(
+        self,
+        input_path: Union[str, Path],
+        output_path: Union[str, Path],
+        output_format: str,
+        remove_input: bool = False,
+    ) -> str:
+        """
+        Convert audio format
+        Args:
+            input_path: Input audio file path
+            output_path: Output audio file path
+            output_format: Target format ('flac', 'wav', 'mp3')
+            remove_input: Whether to delete input file
+        Returns:
+            Output file path
+        """
+        input_path = Path(input_path)
+        output_path = Path(output_path)
+        if not input_path.exists():
+            raise FileNotFoundError(f"Input file not found: {input_path}")
+        # Load audio with fallback backends
+        audio_tensor, sample_rate = self._load_audio_file(input_path)
+        # Save as new format
+        output_path = self.save_audio(
+            audio_tensor,
+            output_path,
+            sample_rate=sample_rate,
+            format=output_format,
+            channels_first=True
+        )
+        # Delete input file if needed
+        if remove_input:
+            input_path.unlink()
+            logger.debug(f"[AudioSaver] Removed input file: {input_path}")
+        return output_path
+    def save_batch(
+        self,
+        audio_batch: Union[List[torch.Tensor], torch.Tensor],
+        output_dir: Union[str, Path],
+        file_prefix: str = "audio",
+        sample_rate: int = 48000,
+        format: Optional[str] = None,
+        channels_first: bool = True,
+    ) -> List[str]:
+        """
+        Save audio batch
+        Args:
+            audio_batch: Audio batch, List[tensor] or tensor [batch, channels, samples]
+            output_dir: Output directory
+            file_prefix: File prefix
+            sample_rate: Sample rate
+            format: Audio format
+            channels_first: Tensor format flag
+        Returns:
+            List of saved file paths
+        """
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        # Process batch
+        if isinstance(audio_batch, torch.Tensor) and audio_batch.dim() == 3:
+            # [batch, channels, samples]
+            audio_list = [audio_batch[i] for i in range(audio_batch.shape[0])]
+        elif isinstance(audio_batch, list):
+            audio_list = audio_batch
+        else:
+            audio_list = [audio_batch]
+        saved_paths = []
+        for i, audio in enumerate(audio_list):
+            output_path = output_dir / f"{file_prefix}_{i:04d}"
+            saved_path = self.save_audio(
+                audio,
+                output_path,
+                sample_rate=sample_rate,
+                format=format,
+                channels_first=channels_first
+            )
+            saved_paths.append(saved_path)
+        return saved_paths
+def get_audio_file_hash(audio_file) -> str:
+    """
+    Get hash identifier for an audio file.
+    Args:
+        audio_file: Path to audio file (str) or file-like object
+    Returns:
+        Hash string or empty string
+    """
+    if audio_file is None:
+        return ""
+    try:
+        if isinstance(audio_file, str):
+            if os.path.exists(audio_file):
+                with open(audio_file, 'rb') as f:
+                    return hashlib.md5(f.read()).hexdigest()
+            return hashlib.md5(audio_file.encode('utf-8')).hexdigest()
+        elif hasattr(audio_file, 'name'):
+            return hashlib.md5(str(audio_file.name).encode('utf-8')).hexdigest()
+        return hashlib.md5(str(audio_file).encode('utf-8')).hexdigest()
+    except Exception:
+        return hashlib.md5(str(audio_file).encode('utf-8')).hexdigest()
+def generate_uuid_from_params(params_dict) -> str:
+    """
+    Generate deterministic UUID from generation parameters.
+    Same parameters will always generate the same UUID.
+    Args:
+        params_dict: Dictionary of parameters
+    Returns:
+        UUID string
+    """
+    params_json = json.dumps(params_dict, sort_keys=True, ensure_ascii=False)
+    hash_obj = hashlib.sha256(params_json.encode('utf-8'))
+    hash_hex = hash_obj.hexdigest()
+    uuid_str = f"{hash_hex[0:8]}-{hash_hex[8:12]}-{hash_hex[12:16]}-{hash_hex[16:20]}-{hash_hex[20:32]}"
+    return uuid_str
+def generate_uuid_from_audio_data(
+    audio_data: Union[torch.Tensor, np.ndarray],
+    seed: Optional[int] = None
+) -> str:
+    """
+    Generate UUID from audio data (for caching/deduplication)
+    Args:
+        audio_data: Audio data
+        seed: Optional seed value
+    Returns:
+        UUID string
+    """
+    if isinstance(audio_data, torch.Tensor):
+        # Convert to numpy and calculate hash
+        audio_np = audio_data.cpu().numpy()
+    else:
+        audio_np = audio_data
+    # Calculate data hash
+    data_hash = hashlib.md5(audio_np.tobytes()).hexdigest()
+    if seed is not None:
+        combined = f"{data_hash}_{seed}"
+        return hashlib.md5(combined.encode()).hexdigest()
+    return data_hash
+# Global default instance
+# Module-level AudioSaver (FLAC as default format) that backs the module-level
+# save_audio() convenience function.
+_default_saver = AudioSaver(default_format="flac")
+def save_audio(
+    audio_data: Union[torch.Tensor, np.ndarray],
+    output_path: Union[str, Path],
+    sample_rate: int = 48000,
+    format: Optional[str] = None,
+    channels_first: bool = True,
+) -> str:
+    """
+    Convenience function: save audio (using default configuration)
+    Args:
+        audio_data: Audio data
+        output_path: Output path
+        sample_rate: Sample rate
+        format: Format (default flac)
+        channels_first: Tensor format flag
+    Returns:
+        Saved file path
+    """
+    return _default_saver.save_audio(
+        audio_data, output_path, sample_rate, format, channels_first
+    )

acestep/constants.py ADDED Viewed

	@@ -0,0 +1,109 @@

+"""
+Constants for ACE-Step
+Centralized constants used across the codebase
+"""
+# ==============================================================================
+# Language Constants
+# ==============================================================================
+# Accepted language codes: 50 codes (two letters each, except the three-letter
+# 'yue') followed by the 'unknown' placeholder.
+VALID_LANGUAGES = [
+    'ar', 'az', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en',
+    'es', 'fa', 'fi', 'fr', 'he', 'hi', 'hr', 'ht', 'hu', 'id',
+    'is', 'it', 'ja', 'ko', 'la', 'lt', 'ms', 'ne', 'nl', 'no',
+    'pa', 'pl', 'pt', 'ro', 'ru', 'sa', 'sk', 'sr', 'sv', 'sw',
+    'ta', 'te', 'th', 'tl', 'tr', 'uk', 'ur', 'vi', 'yue', 'zh',
+    'unknown'
+]
+# ==============================================================================
+# Keyscale Constants
+# ==============================================================================
+KEYSCALE_NOTES = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
+KEYSCALE_ACCIDENTALS = ['', '#', 'b', '♯', '♭']  # empty + ASCII sharp/flat + Unicode sharp/flat
+KEYSCALE_MODES = ['major', 'minor']
+# Generate all valid keyscales: 7 notes × 5 accidentals × 2 modes = 70 combinations
+VALID_KEYSCALES = set()
+for note in KEYSCALE_NOTES:
+    for acc in KEYSCALE_ACCIDENTALS:
+        for mode in KEYSCALE_MODES:
+            VALID_KEYSCALES.add(f"{note}{acc} {mode}")
+# ==============================================================================
+# Metadata Range Constants
+# ==============================================================================
+# BPM (Beats Per Minute) range
+BPM_MIN = 30
+BPM_MAX = 300
+# Duration range (in seconds)
+DURATION_MIN = 10
+DURATION_MAX = 600
+# Valid time signatures
+# NOTE(review): presumably the number of beats per bar — confirm against the consumers
+VALID_TIME_SIGNATURES = [2, 3, 4, 6]
+# ==============================================================================
+# Task Type Constants
+# ==============================================================================
+TASK_TYPES = ["text2music", "repaint", "cover", "extract", "lego", "complete"]
+# Task types available for turbo models (subset)
+TASK_TYPES_TURBO = ["text2music", "repaint", "cover"]
+# Task types available for base models (full set)
+# Same entries as TASK_TYPES, spelled out as a separate list object
+TASK_TYPES_BASE = ["text2music", "repaint", "cover", "extract", "lego", "complete"]
+# ==============================================================================
+# Instruction Constants
+# ==============================================================================
+# Default instructions
+# DEFAULT_DIT_INSTRUCTION repeats TASK_INSTRUCTIONS["text2music"] and
+# DEFAULT_LM_INSTRUCTION repeats TASK_INSTRUCTIONS["cover"]; keep them in sync.
+DEFAULT_DIT_INSTRUCTION = "Fill the audio semantic mask based on the given conditions:"
+DEFAULT_LM_INSTRUCTION = "Generate audio semantic tokens based on the given conditions:"
+DEFAULT_LM_UNDERSTAND_INSTRUCTION = "Understand the given musical conditions and describe the audio semantics accordingly:"
+DEFAULT_LM_INSPIRED_INSTRUCTION = "Expand the user's input into a more detailed and specific musical description:"
+DEFAULT_LM_REWRITE_INSTRUCTION = "Format the user's input into a more detailed and specific musical description:"
+# Instruction templates for each task type
+# Note: Some instructions use placeholders like {TRACK_NAME} or {TRACK_CLASSES}
+# These should be formatted using .format() or f-strings when used
+# The "<task>_default" entries are the placeholder-free variants of the
+# templated "extract", "lego" and "complete" instructions.
+TASK_INSTRUCTIONS = {
+    "text2music": "Fill the audio semantic mask based on the given conditions:",
+    "repaint": "Repaint the mask area based on the given conditions:",
+    "cover": "Generate audio semantic tokens based on the given conditions:",
+    "extract": "Extract the {TRACK_NAME} track from the audio:",
+    "extract_default": "Extract the track from the audio:",
+    "lego": "Generate the {TRACK_NAME} track based on the audio context:",
+    "lego_default": "Generate the track based on the audio context:",
+    "complete": "Complete the input track with {TRACK_CLASSES}:",
+    "complete_default": "Complete the input track:",
+}
+# ==============================================================================
+# Track/Instrument Constants
+# ==============================================================================
+# Track names
+# NOTE(review): presumably the values substituted for {TRACK_NAME} / {TRACK_CLASSES} — confirm
+TRACK_NAMES = [
+    "woodwinds", "brass", "fx", "synth", "strings", "percussion",
+    "keyboard", "guitar", "bass", "drums", "backing_vocals", "vocals"
+]
+# Prompt template with three positional {} slots, one under each of the
+# "# Instruction", "# Caption" and "# Metas" headings; the last slot is
+# followed directly by the <|endoftext|> marker.
+SFT_GEN_PROMPT = """# Instruction
+{}
+# Caption
+{}
+# Metas
+{}<|endoftext|>
+"""

acestep/constrained_logits_processor.py ADDED Viewed

The diff for this file is too large to render. See raw diff

acestep/dataset_handler.py ADDED Viewed

	@@ -0,0 +1,37 @@

+"""
+Dataset Handler
+Handles dataset import and exploration functionality
+"""
+from typing import Optional, Tuple, Any, Dict
+class DatasetHandler:
+    """Dataset Handler for Dataset Explorer functionality"""
+    def __init__(self):
+        """Initialize dataset handler"""
+        self.dataset = None
+        self.dataset_imported = False
+    def import_dataset(self, dataset_type: str) -> str:
+        """
+        Import dataset (temporarily disabled)
+        Args:
+            dataset_type: Type of dataset to import (e.g., "train", "test")
+        Returns:
+            Status message string
+        """
+        self.dataset_imported = False
+        return f"⚠️ Dataset import is currently disabled. Text2MusicDataset dependency not available."
+    def get_item_data(self, *args, **kwargs) -> Tuple:
+        """
+        Get dataset item (temporarily disabled)
+        Returns:
+            Tuple of placeholder values matching the expected return format
+        """
+        return "", "", "", "", "", None, None, None, "❌ Dataset not available", "", 0, "", None, None, None, {}, "text2music"

acestep/dit_alignment_score.py ADDED Viewed

	@@ -0,0 +1,870 @@

+"""
+DiT Alignment Score Module
+This module provides lyrics-to-audio alignment using cross-attention matrices
+from DiT model for generating LRC timestamps.
+Refactored from lyrics_alignment_infos.py for integration with ACE-Step.
+"""
+import numba
+import torch
+import numpy as np
+import torch.nn.functional as F
+from dataclasses import dataclass, asdict
+from typing import List, Dict, Any, Optional, Tuple, Union
+# ================= Data Classes =================
+@dataclass
+class TokenTimestamp:
+    """Stores per-token timing information."""
+    token_id: int  # token ID as passed in lyrics_tokens
+    text: str  # decoded text attributed to this token (may be an empty string)
+    start: float  # start time in seconds
+    end: float  # end time in seconds
+    probability: float  # confidence score; MusicStampsAligner.token_timestamps currently always stores 0.0
+@dataclass
+class SentenceTimestamp:
+    """Stores per-sentence timing information with token list."""
+    text: str  # decoded sentence text with surrounding whitespace stripped
+    start: float  # start time in seconds (start of the first token)
+    end: float  # end time in seconds (end of the last token)
+    tokens: List[TokenTimestamp]  # the tokens that make up this sentence, in order
+    confidence: float  # mean of the positive token probabilities, min-max normalised across sentences
+# ================= DTW Algorithm (Numba Optimized) =================
+@numba.jit(nopython=True)
+def dtw_cpu(x: np.ndarray):
+    """
+    Dynamic Time Warping algorithm optimized with Numba.
+    Fills an accumulated-cost table of shape (N+1, M+1), remembering for each
+    cell which predecessor it was reached from, then hands the trace table to
+    _backtrace to recover the path.
+    Args:
+        x: Cost matrix of shape [N, M]
+    Returns:
+        Path array of shape (2, path_len): row 0 holds text (row) indices and
+        row 1 holds time (column) indices, so callers can unpack it as
+        (text_indices, time_indices)
+    """
+    N, M = x.shape
+    # Use float32 for memory efficiency
+    cost = np.ones((N + 1, M + 1), dtype=np.float32) * np.inf
+    # Step codes: 0 = diagonal (i-1, j-1), 1 = (i-1, j), 2 = (i, j-1); -1 = not set
+    trace = -np.ones((N + 1, M + 1), dtype=np.float32)
+    cost[0, 0] = 0
+    for j in range(1, M + 1):
+        for i in range(1, N + 1):
+            c0 = cost[i - 1, j - 1]
+            c1 = cost[i - 1, j]
+            c2 = cost[i, j - 1]
+            # Comparisons are strict: unless c0 or c1 is strictly the smallest,
+            # the (i, j-1) predecessor (code 2) is taken
+            if c0 < c1 and c0 < c2:
+                c, t = c0, 0
+            elif c1 < c0 and c1 < c2:
+                c, t = c1, 1
+            else:
+                c, t = c2, 2
+            cost[i, j] = x[i - 1, j - 1] + c
+            trace[i, j] = t
+    return _backtrace(trace, N, M)
+@numba.jit(nopython=True)
+def _backtrace(trace: np.ndarray, N: int, M: int):
+    """
+    Optimized backtrace function for DTW.
+    Args:
+        trace: Trace matrix of shape (N+1, M+1)
+        N, M: Original matrix dimensions
+    Returns:
+        Path array of shape (2, path_len) - first row is text indices, second is time indices
+    """
+    # Boundary handling
+    trace[0, :] = 2
+    trace[:, 0] = 1
+    # Pre-allocate array, max path length is N+M
+    max_path_len = N + M
+    path = np.zeros((2, max_path_len), dtype=np.int32)
+    i, j = N, M
+    path_idx = max_path_len - 1
+    while i > 0 or j > 0:
+        path[0, path_idx] = i - 1  # text index
+        path[1, path_idx] = j - 1  # time index
+        path_idx -= 1
+        t = trace[i, j]
+        if t == 0:
+            i -= 1
+            j -= 1
+        elif t == 1:
+            i -= 1
+        elif t == 2:
+            j -= 1
+        else:
+            break
+    actual_len = max_path_len - path_idx - 1
+    return path[:, path_idx + 1:max_path_len]
+# ================= Utility Functions =================
+def median_filter(x: torch.Tensor, filter_width: int) -> torch.Tensor:
+    """
+    Apply median filter to tensor.
+    Args:
+        x: Input tensor
+        filter_width: Width of median filter
+    Returns:
+        Filtered tensor
+    """
+    pad_width = filter_width // 2
+    if x.shape[-1] <= pad_width:
+        return x
+    if x.ndim == 2:
+        x = x[None, :]
+    x = F.pad(x, (filter_width // 2, filter_width // 2, 0, 0), mode="reflect")
+    result = x.unfold(-1, filter_width, 1).sort()[0][..., filter_width // 2]
+    if result.ndim > 2:
+        result = result.squeeze(0)
+    return result
+# ================= Main Aligner Class =================
+class MusicStampsAligner:
+    """
+    Aligner class for generating lyrics timestamps from cross-attention matrices.
+    Uses bidirectional consensus denoising and DTW for alignment.
+    """
+    def __init__(self, tokenizer):
+        """
+        Initialize the aligner.
+        Args:
+            tokenizer: Text tokenizer for decoding tokens; its
+                decode(token_ids, skip_special_tokens=False) method is used
+        """
+        self.tokenizer = tokenizer
+    def _apply_bidirectional_consensus(
+        self,
+        weights_stack: torch.Tensor,
+        violence_level: float,
+        medfilt_width: int
+    ) -> tuple:
+        """
+        Core denoising logic using bidirectional consensus.
+        Steps: multiply the softmax over frames by the softmax over tokens,
+        subtract scaled row and column medians (clamping at zero), square,
+        z-score normalise, median-filter, then average over heads.
+        Args:
+            weights_stack: Attention weights [Heads, Tokens, Frames]
+            violence_level: Denoising strength coefficient (multiplies the medians that are subtracted)
+            medfilt_width: Median filter width
+        Returns:
+            Tuple of (calc_matrix, energy_matrix) as numpy arrays
+        """
+        # A. Bidirectional Consensus
+        row_prob = F.softmax(weights_stack, dim=-1)  # Token -> Frame
+        col_prob = F.softmax(weights_stack, dim=-2)  # Frame -> Token
+        processed = row_prob * col_prob
+        # B. Median suppression
+        # 1. Row suppression (kill horizontal crossing lines)
+        row_medians = torch.quantile(processed, 0.5, dim=-1, keepdim=True)
+        processed = processed - (violence_level * row_medians)
+        processed = torch.relu(processed)
+        # 2. Column suppression (kill vertical crossing lines)
+        col_medians = torch.quantile(processed, 0.5, dim=-2, keepdim=True)
+        processed = processed - (violence_level * col_medians)
+        processed = torch.relu(processed)
+        # C. Power sharpening
+        processed = processed ** 2
+        # Energy matrix for confidence (head average taken before normalisation)
+        energy_matrix = processed.mean(dim=0).cpu().numpy()
+        # D. Z-Score normalization (one mean/std over the whole tensor, not per head)
+        std, mean = torch.std_mean(processed, unbiased=False)
+        weights_processed = (processed - mean) / (std + 1e-9)
+        # E. Median filtering
+        weights_processed = median_filter(weights_processed, filter_width=medfilt_width)
+        # NOTE(review): .numpy() without .cpu() assumes a CPU tensor; the
+        # visible caller (_preprocess_attention) moves the weights to CPU first
+        calc_matrix = weights_processed.mean(dim=0).numpy()
+        return calc_matrix, energy_matrix
+    def _preprocess_attention(
+        self,
+        attention_matrix: torch.Tensor,
+        custom_config: Dict[int, List[int]],
+        violence_level: float,
+        medfilt_width: int = 7
+    ) -> tuple:
+        """
+        Preprocess attention matrix for alignment.
+        Args:
+            attention_matrix: Attention tensor [Layers, Heads, Tokens, Frames]
+            custom_config: Dict mapping layer indices to head indices
+            violence_level: Denoising strength
+            medfilt_width: Median filter width
+        Returns:
+            Tuple of (calc_matrix, energy_matrix, visual_matrix)
+        """
+        if not isinstance(attention_matrix, torch.Tensor):
+            weights = torch.tensor(attention_matrix)
+        else:
+            weights = attention_matrix.clone()
+        weights = weights.cpu().float()
+        selected_tensors = []
+        for layer_idx, head_indices in custom_config.items():
+            for head_idx in head_indices:
+                if layer_idx < weights.shape[0] and head_idx < weights.shape[1]:
+                    head_matrix = weights[layer_idx, head_idx]
+                    selected_tensors.append(head_matrix)
+        if not selected_tensors:
+            return None, None, None
+        # Stack selected heads: [Heads, Tokens, Frames]
+        weights_stack = torch.stack(selected_tensors, dim=0)
+        visual_matrix = weights_stack.mean(dim=0).numpy()
+        calc_matrix, energy_matrix = self._apply_bidirectional_consensus(
+            weights_stack, violence_level, medfilt_width
+        )
+        return calc_matrix, energy_matrix, visual_matrix
+    def stamps_align_info(
+        self,
+        attention_matrix: torch.Tensor,
+        lyrics_tokens: List[int],
+        total_duration_seconds: float,
+        custom_config: Dict[int, List[int]],
+        return_matrices: bool = False,
+        violence_level: float = 2.0,
+        medfilt_width: int = 1
+    ) -> Dict[str, Any]:
+        """
+        Get alignment information from attention matrix.
+        Args:
+            attention_matrix: Cross-attention tensor [Layers, Heads, Tokens, Frames]
+            lyrics_tokens: List of lyrics token IDs
+            total_duration_seconds: Total audio duration in seconds
+            custom_config: Dict mapping layer indices to head indices
+            return_matrices: Whether to return intermediate matrices
+            violence_level: Denoising strength
+            medfilt_width: Median filter width
+        Returns:
+            Dict containing calc_matrix, lyrics_tokens, total_duration_seconds,
+            and optionally energy_matrix and vis_matrix
+        """
+        calc_matrix, energy_matrix, visual_matrix = self._preprocess_attention(
+            attention_matrix, custom_config, violence_level, medfilt_width
+        )
+        if calc_matrix is None:
+            return {
+                "calc_matrix": None,
+                "lyrics_tokens": lyrics_tokens,
+                "total_duration_seconds": total_duration_seconds,
+                "error": "No valid attention heads found"
+            }
+        return_dict = {
+            "calc_matrix": calc_matrix,
+            "lyrics_tokens": lyrics_tokens,
+            "total_duration_seconds": total_duration_seconds
+        }
+        if return_matrices:
+            return_dict['energy_matrix'] = energy_matrix
+            return_dict['vis_matrix'] = visual_matrix
+        return return_dict
+    def _decode_tokens_incrementally(self, token_ids: List[int]) -> List[str]:
+        """
+        Decode tokens incrementally to properly handle multi-byte UTF-8 characters.
+        For Chinese and other multi-byte characters, the tokenizer may split them
+        into multiple byte-level tokens. Decoding each token individually produces
+        invalid UTF-8 sequences (showing as �). This method uses byte-level comparison
+        to correctly track which characters each token contributes.
+        Args:
+            token_ids: List of token IDs
+        Returns:
+            List of decoded text for each token position
+        """
+        decoded_tokens = []
+        prev_bytes = b""
+        for i in range(len(token_ids)):
+            # Decode tokens from start to current position
+            current_text = self.tokenizer.decode(token_ids[:i+1], skip_special_tokens=False)
+            current_bytes = current_text.encode('utf-8', errors='surrogatepass')
+            # The contribution of current token is the new bytes added
+            if len(current_bytes) >= len(prev_bytes):
+                new_bytes = current_bytes[len(prev_bytes):]
+                # Try to decode the new bytes; if incomplete, use empty string
+                try:
+                    token_text = new_bytes.decode('utf-8')
+                except UnicodeDecodeError:
+                    # Incomplete UTF-8 sequence, this token doesn't complete a character
+                    token_text = ""
+            else:
+                # Edge case: current decode is shorter (shouldn't happen normally)
+                token_text = ""
+            decoded_tokens.append(token_text)
+            prev_bytes = current_bytes
+        return decoded_tokens
+    def token_timestamps(
+        self,
+        calc_matrix: np.ndarray,
+        lyrics_tokens: List[int],
+        total_duration_seconds: float
+    ) -> List[TokenTimestamp]:
+        """
+        Generate per-token timestamps using DTW.
+        Args:
+            calc_matrix: Processed attention matrix [Tokens, Frames]
+            lyrics_tokens: List of token IDs
+            total_duration_seconds: Total audio duration
+        Returns:
+            List of TokenTimestamp objects
+        """
+        n_frames = calc_matrix.shape[-1]
+        text_indices, time_indices = dtw_cpu(-calc_matrix.astype(np.float64))
+        seconds_per_frame = total_duration_seconds / n_frames
+        alignment_results = []
+        # Use incremental decoding to properly handle multi-byte UTF-8 characters
+        decoded_tokens = self._decode_tokens_incrementally(lyrics_tokens)
+        for i in range(len(lyrics_tokens)):
+            mask = (text_indices == i)
+            if not np.any(mask):
+                start = alignment_results[-1].end if alignment_results else 0.0
+                end = start
+                token_conf = 0.0
+            else:
+                times = time_indices[mask] * seconds_per_frame
+                start = times[0]
+                end = times[-1]
+                token_conf = 0.0
+            if end < start:
+                end = start
+            alignment_results.append(TokenTimestamp(
+                token_id=lyrics_tokens[i],
+                text=decoded_tokens[i],
+                start=float(start),
+                end=float(end),
+                probability=token_conf
+            ))
+        return alignment_results
+    def _decode_sentence_from_tokens(self, tokens: List[TokenTimestamp]) -> str:
+        """
+        Decode a sentence by decoding all token IDs together.
+        This avoids UTF-8 encoding issues from joining individual token texts.
+        Args:
+            tokens: List of TokenTimestamp objects
+        Returns:
+            Properly decoded sentence text
+        """
+        token_ids = [t.token_id for t in tokens]
+        return self.tokenizer.decode(token_ids, skip_special_tokens=False)
+    def sentence_timestamps(
+        self,
+        token_alignment: List[TokenTimestamp]
+    ) -> List[SentenceTimestamp]:
+        """
+        Group token timestamps into sentence timestamps.
+        Args:
+            token_alignment: List of TokenTimestamp objects
+        Returns:
+            List of SentenceTimestamp objects
+        """
+        results = []
+        current_tokens = []
+        for token in token_alignment:
+            current_tokens.append(token)
+            if '\n' in token.text:
+                # Decode all token IDs together to avoid UTF-8 issues
+                full_text = self._decode_sentence_from_tokens(current_tokens)
+                if full_text.strip():
+                    valid_scores = [t.probability for t in current_tokens if t.probability > 0]
+                    sent_conf = sum(valid_scores) / len(valid_scores) if valid_scores else 0.0
+                    results.append(SentenceTimestamp(
+                        text=full_text.strip(),
+                        start=round(current_tokens[0].start, 3),
+                        end=round(current_tokens[-1].end, 3),
+                        tokens=list(current_tokens),
+                        confidence=sent_conf
+                    ))
+                current_tokens = []
+        # Handle last sentence
+        if current_tokens:
+            # Decode all token IDs together to avoid UTF-8 issues
+            full_text = self._decode_sentence_from_tokens(current_tokens)
+            if full_text.strip():
+                valid_scores = [t.probability for t in current_tokens if t.probability > 0]
+                sent_conf = sum(valid_scores) / len(valid_scores) if valid_scores else 0.0
+                results.append(SentenceTimestamp(
+                    text=full_text.strip(),
+                    start=round(current_tokens[0].start, 3),
+                    end=round(current_tokens[-1].end, 3),
+                    tokens=list(current_tokens),
+                    confidence=sent_conf
+                ))
+        # Normalize confidence scores
+        if results:
+            all_scores = [s.confidence for s in results]
+            min_score = min(all_scores)
+            max_score = max(all_scores)
+            score_range = max_score - min_score
+            if score_range > 1e-9:
+                for s in results:
+                    normalized_score = (s.confidence - min_score) / score_range
+                    s.confidence = round(normalized_score, 2)
+            else:
+                for s in results:
+                    s.confidence = round(s.confidence, 2)
+        return results
+    def format_lrc(
+        self,
+        sentence_timestamps: List[SentenceTimestamp],
+        include_end_time: bool = False
+    ) -> str:
+        """
+        Format sentence timestamps as LRC lyrics format.
+        Args:
+            sentence_timestamps: List of SentenceTimestamp objects
+            include_end_time: Whether to include end time (enhanced LRC format)
+        Returns:
+            LRC formatted string
+        """
+        lines = []
+        for sentence in sentence_timestamps:
+            # Convert seconds to mm:ss.xx format
+            start_minutes = int(sentence.start // 60)
+            start_seconds = sentence.start % 60
+            if include_end_time:
+                end_minutes = int(sentence.end // 60)
+                end_seconds = sentence.end % 60
+                timestamp = f"[{start_minutes:02d}:{start_seconds:05.2f}][{end_minutes:02d}:{end_seconds:05.2f}]"
+            else:
+                timestamp = f"[{start_minutes:02d}:{start_seconds:05.2f}]"
+            # Clean the text (remove structural tags like [verse], [chorus])
+            text = sentence.text
+            lines.append(f"{timestamp}{text}")
+        return "\n".join(lines)
+    def get_timestamps_and_lrc(
+        self,
+        calc_matrix: np.ndarray,
+        lyrics_tokens: List[int],
+        total_duration_seconds: float
+    ) -> Dict[str, Any]:
+        """
+        Convenience method to get both timestamps and LRC in one call.
+        Args:
+            calc_matrix: Processed attention matrix
+            lyrics_tokens: List of token IDs
+            total_duration_seconds: Total audio duration
+        Returns:
+            Dict containing token_timestamps, sentence_timestamps, and lrc_text
+        """
+        token_stamps = self.token_timestamps(
+            calc_matrix=calc_matrix,
+            lyrics_tokens=lyrics_tokens,
+            total_duration_seconds=total_duration_seconds
+        )
+        sentence_stamps = self.sentence_timestamps(token_stamps)
+        lrc_text = self.format_lrc(sentence_stamps)
+        return {
+            "token_timestamps": token_stamps,
+            "sentence_timestamps": sentence_stamps,
+            "lrc_text": lrc_text
+        }
+class MusicLyricScorer:
+    """
+    Scorer class for evaluating lyrics-to-audio alignment quality.
+    Focuses on calculating alignment quality metrics (Coverage, Monotonicity, Confidence)
+    using tensor operations for potential differentiability or GPU acceleration.
+    """
+    def __init__(self, tokenizer: Any):
+        """
+        Initialize the aligner.
+        Args:
+            tokenizer: Tokenizer instance (must implement .decode()).
+        """
+        self.tokenizer = tokenizer
+    def _generate_token_type_mask(self, token_ids: List[int]) -> np.ndarray:
+        """
+        Generate a mask distinguishing lyrics (1) from structural tags (0).
+        Uses self.tokenizer to decode tokens.
+        Args:
+            token_ids: List of token IDs.
+        Returns:
+            Numpy array of shape [len(token_ids)] with 1 or 0.
+        """
+        decoded_tokens = [self.tokenizer.decode([tid]) for tid in token_ids]
+        mask = np.ones(len(token_ids), dtype=np.int32)
+        in_bracket = False
+        for i, token_str in enumerate(decoded_tokens):
+            if '[' in token_str:
+                in_bracket = True
+            if in_bracket:
+                mask[i] = 0
+            if ']' in token_str:
+                in_bracket = False
+                mask[i] = 0
+        return mask
+    def _preprocess_attention(
+            self,
+            attention_matrix: Union[torch.Tensor, np.ndarray],
+            custom_config: Dict[int, List[int]],
+            medfilt_width: int = 1
+    ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[torch.Tensor]]:
+        """
+        Extracts and normalizes the attention matrix.
+        Logic V4: Uses Min-Max normalization to highlight energy differences.
+        Args:
+            attention_matrix: Raw attention tensor [Layers, Heads, Tokens, Frames].
+            custom_config: Config mapping layers to heads.
+            medfilt_width: Width for median filtering.
+        Returns:
+            Tuple of (calc_matrix, energy_matrix, avg_weights_tensor).
+        """
+        # 1. Prepare Tensor
+        if not isinstance(attention_matrix, torch.Tensor):
+            weights = torch.tensor(attention_matrix)
+        else:
+            weights = attention_matrix.clone()
+        weights = weights.cpu().float()
+        # 2. Select Heads based on config
+        selected_tensors = []
+        for layer_idx, head_indices in custom_config.items():
+            for head_idx in head_indices:
+                if layer_idx < weights.shape[0] and head_idx < weights.shape[1]:
+                    selected_tensors.append(weights[layer_idx, head_idx])
+        if not selected_tensors:
+            return None, None, None
+        weights_stack = torch.stack(selected_tensors, dim=0)
+        # 3. Average Heads
+        avg_weights = weights_stack.mean(dim=0)  # [Tokens, Frames]
+        # 4. Preprocessing Logic
+        # Min-Max normalization preserving energy distribution
+        # Median filter is applied to the energy matrix
+        energy_tensor = median_filter(avg_weights, filter_width=medfilt_width)
+        energy_matrix = energy_tensor.numpy()
+        e_min, e_max = energy_matrix.min(), energy_matrix.max()
+        if e_max - e_min > 1e-9:
+            energy_matrix = (energy_matrix - e_min) / (e_max - e_min)
+        else:
+            energy_matrix = np.zeros_like(energy_matrix)
+        # Contrast enhancement for DTW pathfinding
+        # calc_matrix is used for pathfinding, energy_matrix for scoring
+        calc_matrix = energy_matrix ** 2
+        return calc_matrix, energy_matrix, avg_weights
+    def _compute_alignment_metrics(
+            self,
+            energy_matrix: torch.Tensor,
+            path_coords: torch.Tensor,
+            type_mask: torch.Tensor,
+            time_weight: float = 0.01,
+            overlap_frames: float = 9.0,
+            instrumental_weight: float = 1.0
+    ) -> Tuple[float, float, float]:
+        """
+        Core metric calculation logic using high-precision Tensor operations.
+        Args:
+            energy_matrix: Normalized energy [Rows, Cols].
+            path_coords: DTW path coordinates [Steps, 2].
+            type_mask: Token type mask [Rows] (1=Lyrics, 0=Tags).
+            time_weight: Minimum energy threshold for monotonicity.
+            overlap_frames: Allowed overlap for monotonicity check.
+            instrumental_weight: Weight for non-lyric tokens in confidence calc.
+        Returns:
+            Tuple of (coverage, monotonicity, confidence).
+        """
+        # Ensure high precision for internal calculation
+        energy_matrix = energy_matrix.to(dtype=torch.float64)
+        path_coords = path_coords.long()
+        type_mask = type_mask.long()
+        device = energy_matrix.device
+        rows, cols = energy_matrix.shape
+        is_lyrics_row = (type_mask == 1)
+        # ================= A. Coverage Score =================
+        # Ratio of lyric lines that have significant energy peak
+        row_max_energies = energy_matrix.max(dim=1).values
+        total_sung_rows = is_lyrics_row.sum().double()
+        coverage_threshold = 0.1
+        valid_sung_mask = is_lyrics_row & (row_max_energies > coverage_threshold)
+        valid_sung_rows = valid_sung_mask.sum().double()
+        if total_sung_rows > 0:
+            coverage_score = valid_sung_rows / total_sung_rows
+        else:
+            coverage_score = torch.tensor(1.0, device=device, dtype=torch.float64)
+        # ================= B. Monotonicity Score =================
+        # Check if the "center of mass" of lyric lines moves forward in time
+        col_indices = torch.arange(cols, device=device, dtype=torch.float64)
+        # Zero out low energy noise
+        weights = torch.where(
+            energy_matrix > time_weight,
+            energy_matrix,
+            torch.zeros_like(energy_matrix)
+        )
+        sum_w = weights.sum(dim=1)
+        sum_t = (weights * col_indices).sum(dim=1)
+        # Calculate centroids
+        centroids = torch.full((rows,), -1.0, device=device, dtype=torch.float64)
+        valid_w_mask = sum_w > 1e-9
+        centroids[valid_w_mask] = sum_t[valid_w_mask] / sum_w[valid_w_mask]
+        # Extract sequence of valid lyrics centroids
+        valid_sequence_mask = is_lyrics_row & (centroids >= 0)
+        sung_centroids = centroids[valid_sequence_mask]
+        cnt = sung_centroids.shape[0]
+        if cnt > 1:
+            curr_c = sung_centroids[:-1]
+            next_c = sung_centroids[1:]
+            # Check non-decreasing order with overlap tolerance
+            non_decreasing = (next_c >= (curr_c - overlap_frames)).double().sum()
+            pairs = torch.tensor(cnt - 1, device=device, dtype=torch.float64)
+            monotonicity_score = non_decreasing / pairs
+        else:
+            monotonicity_score = torch.tensor(1.0, device=device, dtype=torch.float64)
+        # ================= C. Path Confidence =================
+        # Average energy along the optimal path
+        if path_coords.shape[0] > 0:
+            p_rows = path_coords[:, 0]
+            p_cols = path_coords[:, 1]
+            path_energies = energy_matrix[p_rows, p_cols]
+            step_weights = torch.ones_like(path_energies)
+            # Lower weight for instrumental/tag steps
+            is_inst_step = (type_mask[p_rows] == 0)
+            step_weights[is_inst_step] = instrumental_weight
+            total_energy = (path_energies * step_weights).sum()
+            total_steps = step_weights.sum()
+            if total_steps > 0:
+                path_confidence = total_energy / total_steps
+            else:
+                path_confidence = torch.tensor(0.0, device=device, dtype=torch.float64)
+        else:
+            path_confidence = torch.tensor(0.0, device=device, dtype=torch.float64)
+        return coverage_score.item(), monotonicity_score.item(), path_confidence.item()
+    def lyrics_alignment_info(
+            self,
+            attention_matrix: Union[torch.Tensor, np.ndarray],
+            token_ids: List[int],
+            custom_config: Dict[int, List[int]],
+            return_matrices: bool = False,
+            medfilt_width: int = 1
+    ) -> Dict[str, Any]:
+        """
+        Generates alignment path and processed matrices.
+        Args:
+            attention_matrix: Input attention tensor.
+            token_ids: Corresponding token IDs.
+            custom_config: Layer/Head configuration.
+            return_matrices: If True, returns matrices in the output.
+            medfilt_width: Median filter width.
+        Returns:
+            Dict or AlignmentInfo object containing path and masks.
+        """
+        calc_matrix, energy_matrix, vis_matrix = self._preprocess_attention(
+            attention_matrix, custom_config, medfilt_width
+        )
+        if calc_matrix is None:
+            return {
+                "calc_matrix": None,
+                "error": "No valid attention heads found"
+            }
+        # 1. Generate Semantic Mask (1=Lyrics, 0=Tags)
+        # Uses self.tokenizer internally
+        type_mask = self._generate_token_type_mask(token_ids)
+        # Safety check for shape mismatch
+        if len(type_mask) != energy_matrix.shape[0]:
+            # Fallback to all lyrics if shapes don't align
+            type_mask = np.ones(energy_matrix.shape[0], dtype=np.int32)
+        # 2. DTW Pathfinding
+        # Using negative calc_matrix because DTW minimizes cost
+        text_indices, time_indices = dtw_cpu(-calc_matrix.astype(np.float32))
+        path_coords = np.stack([text_indices, time_indices], axis=1)
+        return_dict = {
+            "path_coords": path_coords,
+            "type_mask": type_mask,
+            "energy_matrix": energy_matrix
+        }
+        if return_matrices:
+            return_dict['calc_matrix'] = calc_matrix
+            return_dict['vis_matrix'] = vis_matrix
+        return return_dict
+    def calculate_score(
+            self,
+            energy_matrix: Union[torch.Tensor, np.ndarray],
+            type_mask: Union[torch.Tensor, np.ndarray],
+            path_coords: Union[torch.Tensor, np.ndarray],
+            time_weight: float = 0.01,
+            overlap_frames: float = 9.0,
+            instrumental_weight: float = 1.0
+    ) -> Dict[str, Any]:
+        """
+        Calculates the final alignment score based on pre-computed components.
+        Args:
+            energy_matrix: Processed energy matrix.
+            type_mask: Token type mask.
+            path_coords: DTW path coordinates.
+            time_weight: Minimum energy threshold for monotonicity.
+            overlap_frames: Allowed backward movement frames.
+            instrumental_weight: Weight for non-lyric path steps.
+        Returns:
+            AlignmentScore object containing individual metrics and final score.
+        """
+        # Ensure Inputs are Tensors on the correct device
+        if not isinstance(energy_matrix, torch.Tensor):
+            energy_matrix = torch.tensor(energy_matrix, device='cuda', dtype=torch.float32)
+        device = energy_matrix.device
+        if not isinstance(type_mask, torch.Tensor):
+            type_mask = torch.tensor(type_mask, device=device, dtype=torch.long)
+        else:
+            type_mask = type_mask.to(device=device, dtype=torch.long)
+        if not isinstance(path_coords, torch.Tensor):
+            path_coords = torch.tensor(path_coords, device=device, dtype=torch.long)
+        else:
+            path_coords = path_coords.to(device=device, dtype=torch.long)
+        # Compute Metrics
+        coverage, monotonicity, confidence = self._compute_alignment_metrics(
+            energy_matrix=energy_matrix,
+            path_coords=path_coords,
+            type_mask=type_mask,
+            time_weight=time_weight,
+            overlap_frames=overlap_frames,
+            instrumental_weight=instrumental_weight
+        )
+        # Final Score Calculation
+        # (Cov^2 * Mono^2 * Conf)
+        final_score = (coverage ** 2) * (monotonicity ** 2) * confidence
+        final_score = float(np.clip(final_score, 0.0, 1.0))
+        return {
+            "lyrics_score": round(final_score, 4)
+        }

acestep/genres_vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

acestep/gradio_ui/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from acestep.gradio_ui.interfaces import create_gradio_interface

acestep/gradio_ui/events/__init__.py ADDED Viewed

	@@ -0,0 +1,1355 @@

+"""
+Gradio UI Event Handlers Module
+Main entry point for setting up all event handlers
+"""
+import os
+import gradio as gr
+from typing import Optional
+from loguru import logger
+# Import handler modules
+from . import generation_handlers as gen_h
+from . import results_handlers as res_h
+from . import training_handlers as train_h
+from acestep.gradio_ui.i18n import t
+# HuggingFace Space environment detection for ZeroGPU support
+IS_HUGGINGFACE_SPACE = os.environ.get("SPACE_ID") is not None
+def _get_spaces_gpu_decorator(duration=120):
+    """
+    Get the @spaces.GPU decorator if running in HuggingFace Space environment.
+    Returns identity decorator if not in Space environment.
+    """
+    if IS_HUGGINGFACE_SPACE:
+        try:
+            import spaces
+            return spaces.GPU(duration=duration)
+        except ImportError:
+            logger.warning("spaces package not found, GPU decorator disabled")
+            return lambda func: func
+    return lambda func: func
+def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, dataset_section, generation_section, results_section, init_params=None):
+    """Setup event handlers connecting UI components and business logic
+    Args:
+        init_params: Dictionary containing initialization parameters including:
+            - dit_handler_2: Optional second DiT handler for multi-model setup
+            - available_dit_models: List of available DiT model names
+            - config_path: Primary model config path
+            - config_path_2: Secondary model config path (if available)
+    """
+    # Get secondary DiT handler from init_params (for multi-model support)
+    dit_handler_2 = init_params.get('dit_handler_2') if init_params else None
+    config_path_1 = init_params.get('config_path', '') if init_params else ''
+    config_path_2 = init_params.get('config_path_2', '') if init_params else ''
+    # ========== Dataset Handlers ==========
+    dataset_section["import_dataset_btn"].click(
+        fn=dataset_handler.import_dataset,
+        inputs=[dataset_section["dataset_type"]],
+        outputs=[dataset_section["data_status"]]
+    )
+    # ========== Service Initialization ==========
+    generation_section["refresh_btn"].click(
+        fn=lambda: gen_h.refresh_checkpoints(dit_handler),
+        outputs=[generation_section["checkpoint_dropdown"]]
+    )
+    generation_section["config_path"].change(
+        fn=gen_h.update_model_type_settings,
+        inputs=[generation_section["config_path"]],
+        outputs=[
+            generation_section["inference_steps"],
+            generation_section["guidance_scale"],
+            generation_section["use_adg"],
+            generation_section["shift"],
+            generation_section["cfg_interval_start"],
+            generation_section["cfg_interval_end"],
+            generation_section["task_type"],
+        ]
+    )
+    generation_section["init_btn"].click(
+        fn=lambda *args: gen_h.init_service_wrapper(dit_handler, llm_handler, *args),
+        inputs=[
+            generation_section["checkpoint_dropdown"],
+            generation_section["config_path"],
+            generation_section["device"],
+            generation_section["init_llm_checkbox"],
+            generation_section["lm_model_path"],
+            generation_section["backend_dropdown"],
+            generation_section["use_flash_attention_checkbox"],
+            generation_section["offload_to_cpu_checkbox"],
+            generation_section["offload_dit_to_cpu_checkbox"],
+        ],
+        outputs=[
+            generation_section["init_status"],
+            generation_section["generate_btn"],
+            generation_section["service_config_accordion"],
+            # Model type settings (updated based on actual loaded model)
+            generation_section["inference_steps"],
+            generation_section["guidance_scale"],
+            generation_section["use_adg"],
+            generation_section["shift"],
+            generation_section["cfg_interval_start"],
+            generation_section["cfg_interval_end"],
+            generation_section["task_type"],
+        ]
+    )
+    # ========== LoRA Handlers ==========
+    generation_section["load_lora_btn"].click(
+        fn=dit_handler.load_lora,
+        inputs=[generation_section["lora_path"]],
+        outputs=[generation_section["lora_status"]]
+    ).then(
+        # Update checkbox to enabled state after loading
+        fn=lambda: gr.update(value=True),
+        outputs=[generation_section["use_lora_checkbox"]]
+    )
+    generation_section["unload_lora_btn"].click(
+        fn=dit_handler.unload_lora,
+        outputs=[generation_section["lora_status"]]
+    ).then(
+        # Update checkbox to disabled state after unloading
+        fn=lambda: gr.update(value=False),
+        outputs=[generation_section["use_lora_checkbox"]]
+    )
+    generation_section["use_lora_checkbox"].change(
+        fn=dit_handler.set_use_lora,
+        inputs=[generation_section["use_lora_checkbox"]],
+        outputs=[generation_section["lora_status"]]
+    )
+    # ========== UI Visibility Updates ==========
+    generation_section["init_llm_checkbox"].change(
+        fn=gen_h.update_negative_prompt_visibility,
+        inputs=[generation_section["init_llm_checkbox"]],
+        outputs=[generation_section["lm_negative_prompt"]]
+    )
+    generation_section["init_llm_checkbox"].change(
+        fn=gen_h.update_audio_cover_strength_visibility,
+        inputs=[generation_section["task_type"], generation_section["init_llm_checkbox"]],
+        outputs=[generation_section["audio_cover_strength"]]
+    )
+    generation_section["task_type"].change(
+        fn=gen_h.update_audio_cover_strength_visibility,
+        inputs=[generation_section["task_type"], generation_section["init_llm_checkbox"]],
+        outputs=[generation_section["audio_cover_strength"]]
+    )
+    generation_section["batch_size_input"].change(
+        fn=gen_h.update_audio_components_visibility,
+        inputs=[generation_section["batch_size_input"]],
+        outputs=[
+            results_section["audio_col_1"],
+            results_section["audio_col_2"],
+            results_section["audio_col_3"],
+            results_section["audio_col_4"],
+            results_section["audio_row_5_8"],
+            results_section["audio_col_5"],
+            results_section["audio_col_6"],
+            results_section["audio_col_7"],
+            results_section["audio_col_8"],
+        ]
+    )
+    # ========== Audio Conversion ==========
+    generation_section["convert_src_to_codes_btn"].click(
+        fn=lambda src: gen_h.convert_src_audio_to_codes_wrapper(dit_handler, src),
+        inputs=[generation_section["src_audio"]],
+        outputs=[generation_section["text2music_audio_code_string"]]
+    )
+    # ========== Instruction UI Updates ==========
+    for trigger in [generation_section["task_type"], generation_section["track_name"], generation_section["complete_track_classes"]]:
+        trigger.change(
+            fn=lambda *args: gen_h.update_instruction_ui(dit_handler, *args),
+            inputs=[
+                generation_section["task_type"],
+                generation_section["track_name"],
+                generation_section["complete_track_classes"],
+                generation_section["text2music_audio_code_string"],
+                generation_section["init_llm_checkbox"]
+            ],
+            outputs=[
+                generation_section["instruction_display_gen"],
+                generation_section["track_name"],
+                generation_section["complete_track_classes"],
+                generation_section["audio_cover_strength"],
+                generation_section["repainting_group"],
+                generation_section["text2music_audio_codes_group"],
+            ]
+        )
+    # ========== Sample/Transcribe Handlers ==========
+    # Load random example from ./examples/text2music directory
+    generation_section["sample_btn"].click(
+        fn=lambda task: gen_h.load_random_example(task) + (True,),
+        inputs=[
+            generation_section["task_type"],
+        ],
+        outputs=[
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["think_checkbox"],
+            generation_section["bpm"],
+            generation_section["audio_duration"],
+            generation_section["key_scale"],
+            generation_section["vocal_language"],
+            generation_section["time_signature"],
+            results_section["is_format_caption_state"]
+        ]
+    )
+    generation_section["text2music_audio_code_string"].change(
+        fn=gen_h.update_transcribe_button_text,
+        inputs=[generation_section["text2music_audio_code_string"]],
+        outputs=[generation_section["transcribe_btn"]]
+    )
+    generation_section["transcribe_btn"].click(
+        fn=lambda codes, debug: gen_h.transcribe_audio_codes(llm_handler, codes, debug),
+        inputs=[
+            generation_section["text2music_audio_code_string"],
+            generation_section["constrained_decoding_debug"]
+        ],
+        outputs=[
+            results_section["status_output"],
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["audio_duration"],
+            generation_section["key_scale"],
+            generation_section["vocal_language"],
+            generation_section["time_signature"],
+            results_section["is_format_caption_state"]
+        ]
+    )
+    # ========== Reset Format Caption Flag ==========
+    for trigger in [generation_section["captions"], generation_section["lyrics"], generation_section["bpm"],
+                    generation_section["key_scale"], generation_section["time_signature"],
+                    generation_section["vocal_language"], generation_section["audio_duration"]]:
+        trigger.change(
+            fn=gen_h.reset_format_caption_flag,
+            inputs=[],
+            outputs=[results_section["is_format_caption_state"]]
+        )
+    # ========== Audio Uploads Accordion ==========
+    for trigger in [generation_section["reference_audio"], generation_section["src_audio"]]:
+        trigger.change(
+            fn=gen_h.update_audio_uploads_accordion,
+            inputs=[generation_section["reference_audio"], generation_section["src_audio"]],
+            outputs=[generation_section["audio_uploads_accordion"]]
+        )
+    # ========== Instrumental Checkbox ==========
+    generation_section["instrumental_checkbox"].change(
+        fn=gen_h.handle_instrumental_checkbox,
+        inputs=[generation_section["instrumental_checkbox"], generation_section["lyrics"]],
+        outputs=[generation_section["lyrics"]]
+    )
+    # ========== Format Button ==========
+    # Note: cfg_scale and negative_prompt are not supported in format mode
+    @_get_spaces_gpu_decorator(duration=120)
+    def handle_format_sample_wrapper(caption, lyrics, bpm, duration, key_scale, time_sig, temp, top_k, top_p, debug):
+        return gen_h.handle_format_sample(
+            llm_handler, caption, lyrics, bpm, duration, key_scale, time_sig, temp, top_k, top_p, debug
+        )
+    generation_section["format_btn"].click(
+        fn=handle_format_sample_wrapper,
+        inputs=[
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["audio_duration"],
+            generation_section["key_scale"],
+            generation_section["time_signature"],
+            generation_section["lm_temperature"],
+            generation_section["lm_top_k"],
+            generation_section["lm_top_p"],
+            generation_section["constrained_decoding_debug"],
+        ],
+        outputs=[
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["audio_duration"],
+            generation_section["key_scale"],
+            generation_section["vocal_language"],
+            generation_section["time_signature"],
+            results_section["is_format_caption_state"],
+            results_section["status_output"],
+        ]
+    )
+    # ========== Generation Mode Toggle (Simple/Custom/Cover/Repaint) ==========
+    generation_section["generation_mode"].change(
+        fn=gen_h.handle_generation_mode_change,
+        inputs=[generation_section["generation_mode"]],
+        outputs=[
+            generation_section["simple_mode_group"],
+            generation_section["custom_mode_content"],
+            generation_section["cover_mode_group"],
+            generation_section["repainting_group"],
+            generation_section["task_type"],
+            generation_section["generate_btn"],
+            generation_section["simple_sample_created"],
+            generation_section["src_audio_group"],
+            generation_section["audio_cover_strength"],
+            generation_section["think_checkbox"],  # Disable thinking for cover/repaint modes
+        ]
+    )
+    # ========== Process Source Audio Button ==========
+    # Combines Convert to Codes + Transcribe in one step
+    # Note: @spaces.GPU decorator must be on the function passed directly to fn=,
+    # not on a module-level function wrapped in a lambda. Lambdas capturing handler
+    # objects cause pickling errors on ZeroGPU because the model contains unpicklable
+    # local objects (e.g. AceStepDiTModel.__init__ lambdas).
+    @_get_spaces_gpu_decorator(duration=120)
+    def process_source_audio_wrapper(src, debug):
+        return gen_h.process_source_audio(dit_handler, llm_handler, src, debug)
+    generation_section["process_src_btn"].click(
+        fn=process_source_audio_wrapper,
+        inputs=[
+            generation_section["src_audio"],
+            generation_section["constrained_decoding_debug"]
+        ],
+        outputs=[
+            generation_section["text2music_audio_code_string"],
+            results_section["status_output"],
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["audio_duration"],
+            generation_section["key_scale"],
+            generation_section["vocal_language"],
+            generation_section["time_signature"],
+            results_section["is_format_caption_state"],
+        ]
+    )
+    # ========== Simple Mode Instrumental Checkbox ==========
+    # When instrumental is checked, disable vocal language and set to ["unknown"]
+    generation_section["simple_instrumental_checkbox"].change(
+        fn=gen_h.handle_simple_instrumental_change,
+        inputs=[generation_section["simple_instrumental_checkbox"]],
+        outputs=[generation_section["simple_vocal_language"]]
+    )
+    # ========== Random Description Button ==========
+    generation_section["random_desc_btn"].click(
+        fn=gen_h.load_random_simple_description,
+        inputs=[],
+        outputs=[
+            generation_section["simple_query_input"],
+            generation_section["simple_instrumental_checkbox"],
+            generation_section["simple_vocal_language"],
+        ]
+    )
+    # ========== Create Sample Button (Simple Mode) ==========
+    # Note: cfg_scale and negative_prompt are not supported in create_sample mode
+    @_get_spaces_gpu_decorator(duration=120)
+    def handle_create_sample_wrapper(query, instrumental, vocal_lang, temp, top_k, top_p, debug):
+        return gen_h.handle_create_sample(
+            llm_handler, query, instrumental, vocal_lang, temp, top_k, top_p, debug
+        )
+    generation_section["create_sample_btn"].click(
+        fn=handle_create_sample_wrapper,
+        inputs=[
+            generation_section["simple_query_input"],
+            generation_section["simple_instrumental_checkbox"],
+            generation_section["simple_vocal_language"],
+            generation_section["lm_temperature"],
+            generation_section["lm_top_k"],
+            generation_section["lm_top_p"],
+            generation_section["constrained_decoding_debug"],
+        ],
+        outputs=[
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["audio_duration"],
+            generation_section["key_scale"],
+            generation_section["vocal_language"],
+            generation_section["simple_vocal_language"],
+            generation_section["time_signature"],
+            generation_section["instrumental_checkbox"],
+            generation_section["caption_accordion"],
+            generation_section["lyrics_accordion"],
+            generation_section["generate_btn"],
+            generation_section["simple_sample_created"],
+            generation_section["think_checkbox"],
+            results_section["is_format_caption_state"],
+            results_section["status_output"],
+        ]
+    )
+    # ========== Load/Save Metadata ==========
+    generation_section["load_file"].upload(
+        fn=gen_h.load_metadata,
+        inputs=[generation_section["load_file"]],
+        outputs=[
+            generation_section["task_type"],
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["vocal_language"],
+            generation_section["bpm"],
+            generation_section["key_scale"],
+            generation_section["time_signature"],
+            generation_section["audio_duration"],
+            generation_section["batch_size_input"],
+            generation_section["inference_steps"],
+            generation_section["guidance_scale"],
+            generation_section["seed"],
+            generation_section["random_seed_checkbox"],
+            generation_section["use_adg"],
+            generation_section["cfg_interval_start"],
+            generation_section["cfg_interval_end"],
+            generation_section["shift"],
+            generation_section["infer_method"],
+            generation_section["custom_timesteps"],
+            generation_section["audio_format"],
+            generation_section["lm_temperature"],
+            generation_section["lm_cfg_scale"],
+            generation_section["lm_top_k"],
+            generation_section["lm_top_p"],
+            generation_section["lm_negative_prompt"],
+            generation_section["use_cot_metas"],  # Added: use_cot_metas
+            generation_section["use_cot_caption"],
+            generation_section["use_cot_language"],
+            generation_section["audio_cover_strength"],
+            generation_section["think_checkbox"],
+            generation_section["text2music_audio_code_string"],
+            generation_section["repainting_start"],
+            generation_section["repainting_end"],
+            generation_section["track_name"],
+            generation_section["complete_track_classes"],
+            generation_section["instrumental_checkbox"],  # Added: instrumental_checkbox
+            results_section["is_format_caption_state"]
+        ]
+    )
+    # Save buttons for all 8 audio outputs
+    download_existing_js = """(current_audio, batch_files) => {
+    // Debug: print what the input actually is
+    console.log("👉 [Debug] Current Audio Input:", current_audio);
+    // 1. Safety check
+    if (!current_audio) {
+        console.warn("⚠️ No audio selected or audio is empty.");
+        return;
+    }
+    if (!batch_files || !Array.isArray(batch_files)) {
+        console.warn("⚠️ Batch file list is empty/not ready.");
+        return;
+    }
+    // 2. Smartly extract path string
+    let pathString = "";
+    if (typeof current_audio === "string") {
+        // Case A: direct path string received
+        pathString = current_audio;
+    } else if (typeof current_audio === "object") {
+        // Case B: an object is received, try common properties
+        // Gradio file objects usually have path, url, or name
+        pathString = current_audio.path || current_audio.name || current_audio.url || "";
+    }
+    if (!pathString) {
+        console.error("❌ Error: Could not extract a valid path string from input.", current_audio);
+        return;
+    }
+    // 3. Extract Key (UUID)
+    // Path could be /tmp/.../uuid.mp3 or url like /file=.../uuid.mp3
+    let filename = pathString.split(/[\\\\/]/).pop(); // get the filename
+    let key = filename.split('.')[0]; // get UUID without extension
+    console.log(`🔑 Key extracted: ${key}`);
+    // 4. Find matching file(s) in the list
+    let targets = batch_files.filter(f => {
+        // Also extract names from batch_files objects
+        // f usually contains name (backend path) and orig_name (download name)
+        const fPath = f.name || f.path || "";
+        return fPath.includes(key);
+    });
+    if (targets.length === 0) {
+        console.warn("❌ No matching files found in batch list for key:", key);
+        alert("Batch list does not contain this file yet. Please wait for generation to finish.");
+        return;
+    }
+    // 5. Trigger download(s)
+    console.log(`🎯 Found ${targets.length} files to download.`);
+    targets.forEach((f, index) => {
+        setTimeout(() => {
+            const a = document.createElement('a');
+            // Prefer url (frontend-accessible link), otherwise try data
+            a.href = f.url || f.data;
+            a.download = f.orig_name || "download";
+            a.style.display = 'none';
+            document.body.appendChild(a);
+            a.click();
+            document.body.removeChild(a);
+        }, index * 1000); // 300ms interval to avoid browser blocking
+    });
+}
+"""
+    for btn_idx in range(1, 9):
+        results_section[f"save_btn_{btn_idx}"].click(
+            fn=None,
+            inputs=[
+                results_section[f"generated_audio_{btn_idx}"],
+                results_section["generated_audio_batch"],
+            ],
+        js=download_existing_js  # Run the above JS
+    )
+    # ========== Send to Cover Handlers ==========
+    def send_to_cover_handler(audio_file, lm_metadata):
+        """Send audio to cover mode and switch to cover"""
+        if audio_file is None:
+            return (gr.skip(),) * 11
+        return (
+            audio_file,      # src_audio
+            gr.skip(),       # bpm
+            gr.skip(),       # captions
+            gr.skip(),       # lyrics
+            gr.skip(),       # audio_duration
+            gr.skip(),       # key_scale
+            gr.skip(),       # vocal_language
+            gr.skip(),       # time_signature
+            gr.skip(),       # is_format_caption_state
+            "cover",         # generation_mode - switch to cover
+            "cover",         # task_type - set to cover
+        )
+    for btn_idx in range(1, 9):
+        results_section[f"send_to_cover_btn_{btn_idx}"].click(
+            fn=send_to_cover_handler,
+            inputs=[
+                results_section[f"generated_audio_{btn_idx}"],
+                results_section["lm_metadata_state"]
+            ],
+            outputs=[
+                generation_section["src_audio"],
+                generation_section["bpm"],
+                generation_section["captions"],
+                generation_section["lyrics"],
+                generation_section["audio_duration"],
+                generation_section["key_scale"],
+                generation_section["vocal_language"],
+                generation_section["time_signature"],
+                results_section["is_format_caption_state"],
+                generation_section["generation_mode"],
+                generation_section["task_type"],
+            ]
+        )
+    # ========== Send to Repaint Handlers ==========
+    def send_to_repaint_handler(audio_file, lm_metadata):
+        """Send audio to repaint mode and switch to repaint"""
+        if audio_file is None:
+            return (gr.skip(),) * 11
+        return (
+            audio_file,      # src_audio
+            gr.skip(),       # bpm
+            gr.skip(),       # captions
+            gr.skip(),       # lyrics
+            gr.skip(),       # audio_duration
+            gr.skip(),       # key_scale
+            gr.skip(),       # vocal_language
+            gr.skip(),       # time_signature
+            gr.skip(),       # is_format_caption_state
+            "repaint",       # generation_mode - switch to repaint
+            "repaint",       # task_type - set to repaint
+        )
+    for btn_idx in range(1, 9):
+        results_section[f"send_to_repaint_btn_{btn_idx}"].click(
+            fn=send_to_repaint_handler,
+            inputs=[
+                results_section[f"generated_audio_{btn_idx}"],
+                results_section["lm_metadata_state"]
+            ],
+            outputs=[
+                generation_section["src_audio"],
+                generation_section["bpm"],
+                generation_section["captions"],
+                generation_section["lyrics"],
+                generation_section["audio_duration"],
+                generation_section["key_scale"],
+                generation_section["vocal_language"],
+                generation_section["time_signature"],
+                results_section["is_format_caption_state"],
+                generation_section["generation_mode"],
+                generation_section["task_type"],
+            ]
+        )
+    # ========== Score Calculation Handlers ==========
+    # Use a factory function so each handler binds its own btn_idx when it is created (Python closure fix)
+    # Note: @spaces.GPU decorator applied here (not on module-level function) to avoid
+    # pickling issues on ZeroGPU when handler objects are captured in closures.
+    def make_score_handler(idx):
+        """Build the click handler for the score button of result slot ``idx``.
+
+        A factory is used so that every returned handler keeps its own ``idx``
+        instead of sharing the wiring loop's variable.
+        """
+        @_get_spaces_gpu_decorator(duration=120)
+        def score_handler(scale, batch_idx, queue):
+            # Forward the UI values together with the enclosing DiT/LLM handlers
+            # and this slot's index.
+            return res_h.calculate_score_handler_with_selection(
+                dit_handler, llm_handler, idx, scale, batch_idx, queue
+            )
+        return score_handler
+    for btn_idx in range(1, 9):
+        results_section[f"score_btn_{btn_idx}"].click(
+            fn=make_score_handler(btn_idx),
+            inputs=[
+                generation_section["score_scale"],
+                results_section["current_batch_index"],
+                results_section["batch_queue"],
+            ],
+            outputs=[
+                results_section[f"score_display_{btn_idx}"],
+                results_section[f"details_accordion_{btn_idx}"],
+                results_section["batch_queue"]
+            ]
+        )
+    # ========== LRC Timestamp Handlers ==========
+    # Use a factory function so each handler binds its own btn_idx when it is created (Python closure fix)
+    def make_lrc_handler(idx):
+        """Build the click handler for the LRC button of result slot ``idx``.
+
+        A factory is used so that every returned handler keeps its own ``idx``
+        instead of sharing the wiring loop's variable.
+        """
+        @_get_spaces_gpu_decorator(duration=120)
+        def lrc_handler(batch_idx, queue, vocal_lang, infer_steps):
+            # Forward the UI values together with the enclosing DiT handler and
+            # this slot's index.
+            return res_h.generate_lrc_handler(
+                dit_handler, idx, batch_idx, queue, vocal_lang, infer_steps
+            )
+        return lrc_handler
+    for btn_idx in range(1, 9):
+        results_section[f"lrc_btn_{btn_idx}"].click(
+            fn=make_lrc_handler(btn_idx),
+            inputs=[
+                results_section["current_batch_index"],
+                results_section["batch_queue"],
+                generation_section["vocal_language"],
+                generation_section["inference_steps"],
+            ],
+            outputs=[
+                results_section[f"lrc_display_{btn_idx}"],
+                results_section[f"details_accordion_{btn_idx}"],
+                # NOTE: Removed generated_audio output!
+                # Audio subtitles are now updated via lrc_display.change() event.
+                results_section["batch_queue"]
+            ]
+        )
+    @_get_spaces_gpu_decorator(duration=120)
+    def generation_wrapper(selected_model, generation_mode, simple_query_input, simple_vocal_language, *args):
+        """Wrapper that selects the appropriate DiT handler based on model selection"""
+        # Convert args to list for modification
+        args_list = list(args)
+        # args order (after simple mode params):
+        # captions (0), lyrics (1), bpm (2), key_scale (3), time_signature (4), vocal_language (5),
+        # inference_steps (6), guidance_scale (7), random_seed_checkbox (8), seed (9),
+        # reference_audio (10), audio_duration (11), batch_size_input (12), src_audio (13),
+        # text2music_audio_code_string (14), repainting_start (15), repainting_end (16),
+        # instruction_display_gen (17), audio_cover_strength (18), task_type (19), ...
+        # ... lm_temperature (27), think_checkbox (28), ...
+        # ... instrumental_checkbox (at position after all regular params)
+        src_audio = args_list[13] if len(args_list) > 13 else None
+        task_type = args_list[19] if len(args_list) > 19 else "text2music"
+        # Validate: Cover and Repaint modes require source audio
+        if task_type in ["cover", "repaint"] and src_audio is None:
+            raise gr.Error(f"Source Audio is required for {task_type.capitalize()} mode. Please upload an audio file.")
+        # Handle Simple mode: first create sample, then generate
+        if generation_mode == "simple":
+            # Get instrumental from the main checkbox (args[-6] based on input order)
+            # The instrumental_checkbox is passed after all the regular generation params
+            instrumental = args_list[-6] if len(args_list) > 6 else False  # instrumental_checkbox position
+            lm_temperature = args_list[27] if len(args_list) > 27 else 0.85
+            lm_top_k = args_list[30] if len(args_list) > 30 else 0
+            lm_top_p = args_list[31] if len(args_list) > 31 else 0.9
+            constrained_decoding_debug = args_list[38] if len(args_list) > 38 else False
+            # Call create_sample to generate caption/lyrics/metadata
+            from acestep.inference import create_sample
+            top_k_value = None if not lm_top_k or lm_top_k == 0 else int(lm_top_k)
+            top_p_value = None if not lm_top_p or lm_top_p >= 1.0 else lm_top_p
+            result = create_sample(
+                llm_handler=llm_handler,
+                query=simple_query_input,
+                instrumental=instrumental,
+                vocal_language=simple_vocal_language,
+                temperature=lm_temperature,
+                top_k=top_k_value,
+                top_p=top_p_value,
+                use_constrained_decoding=True,
+                constrained_decoding_debug=constrained_decoding_debug,
+            )
+            if not result.success:
+                raise gr.Error(f"Failed to create sample: {result.status_message}")
+            # Update args with generated data
+            args_list[0] = result.caption  # captions
+            args_list[1] = result.lyrics  # lyrics
+            args_list[2] = result.bpm  # bpm
+            args_list[3] = result.keyscale  # key_scale
+            args_list[4] = result.timesignature  # time_signature
+            args_list[5] = result.language  # vocal_language
+            if result.duration and result.duration > 0:
+                args_list[11] = result.duration  # audio_duration
+            # Enable thinking for Simple mode
+            args_list[28] = True  # think_checkbox
+            # Mark as formatted caption (LM-generated sample)
+            args_list[36] = True  # is_format_caption_state
+        # Determine which handler to use based on model selection
+        active_handler = dit_handler  # Default to primary handler
+        if dit_handler_2 is not None and selected_model == config_path_2:
+            active_handler = dit_handler_2
+        yield from res_h.generate_with_batch_management(active_handler, llm_handler, *args_list)
+    # ========== Generation Handler ==========
+    generation_section["generate_btn"].click(
+        fn=generation_wrapper,
+        inputs=[
+            generation_section["dit_model_selector"],  # Model selection input
+            generation_section["generation_mode"],  # For Simple mode detection
+            generation_section["simple_query_input"],  # Simple mode query
+            generation_section["simple_vocal_language"],  # Simple mode vocal language
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["key_scale"],
+            generation_section["time_signature"],
+            generation_section["vocal_language"],
+            generation_section["inference_steps"],
+            generation_section["guidance_scale"],
+            generation_section["random_seed_checkbox"],
+            generation_section["seed"],
+            generation_section["reference_audio"],
+            generation_section["audio_duration"],
+            generation_section["batch_size_input"],
+            generation_section["src_audio"],
+            generation_section["text2music_audio_code_string"],
+            generation_section["repainting_start"],
+            generation_section["repainting_end"],
+            generation_section["instruction_display_gen"],
+            generation_section["audio_cover_strength"],
+            generation_section["task_type"],
+            generation_section["use_adg"],
+            generation_section["cfg_interval_start"],
+            generation_section["cfg_interval_end"],
+            generation_section["shift"],
+            generation_section["infer_method"],
+            generation_section["custom_timesteps"],
+            generation_section["audio_format"],
+            generation_section["lm_temperature"],
+            generation_section["think_checkbox"],
+            generation_section["lm_cfg_scale"],
+            generation_section["lm_top_k"],
+            generation_section["lm_top_p"],
+            generation_section["lm_negative_prompt"],
+            generation_section["use_cot_metas"],
+            generation_section["use_cot_caption"],
+            generation_section["use_cot_language"],
+            results_section["is_format_caption_state"],
+            generation_section["constrained_decoding_debug"],
+            generation_section["allow_lm_batch"],
+            generation_section["auto_score"],
+            generation_section["auto_lrc"],
+            generation_section["score_scale"],
+            generation_section["lm_batch_chunk_size"],
+            generation_section["track_name"],
+            generation_section["complete_track_classes"],
+            generation_section["autogen_checkbox"],
+            results_section["current_batch_index"],
+            results_section["total_batches"],
+            results_section["batch_queue"],
+            results_section["generation_params_state"],
+        ],
+        outputs=[
+            results_section["generated_audio_1"],
+            results_section["generated_audio_2"],
+            results_section["generated_audio_3"],
+            results_section["generated_audio_4"],
+            results_section["generated_audio_5"],
+            results_section["generated_audio_6"],
+            results_section["generated_audio_7"],
+            results_section["generated_audio_8"],
+            results_section["generated_audio_batch"],
+            results_section["generation_info"],
+            results_section["status_output"],
+            generation_section["seed"],
+            results_section["score_display_1"],
+            results_section["score_display_2"],
+            results_section["score_display_3"],
+            results_section["score_display_4"],
+            results_section["score_display_5"],
+            results_section["score_display_6"],
+            results_section["score_display_7"],
+            results_section["score_display_8"],
+            results_section["codes_display_1"],
+            results_section["codes_display_2"],
+            results_section["codes_display_3"],
+            results_section["codes_display_4"],
+            results_section["codes_display_5"],
+            results_section["codes_display_6"],
+            results_section["codes_display_7"],
+            results_section["codes_display_8"],
+            results_section["details_accordion_1"],
+            results_section["details_accordion_2"],
+            results_section["details_accordion_3"],
+            results_section["details_accordion_4"],
+            results_section["details_accordion_5"],
+            results_section["details_accordion_6"],
+            results_section["details_accordion_7"],
+            results_section["details_accordion_8"],
+            results_section["lrc_display_1"],
+            results_section["lrc_display_2"],
+            results_section["lrc_display_3"],
+            results_section["lrc_display_4"],
+            results_section["lrc_display_5"],
+            results_section["lrc_display_6"],
+            results_section["lrc_display_7"],
+            results_section["lrc_display_8"],
+            results_section["lm_metadata_state"],
+            results_section["is_format_caption_state"],
+            results_section["current_batch_index"],
+            results_section["total_batches"],
+            results_section["batch_queue"],
+            results_section["generation_params_state"],
+            results_section["batch_indicator"],
+            results_section["prev_batch_btn"],
+            results_section["next_batch_btn"],
+            results_section["next_batch_status"],
+            results_section["restore_params_btn"],
+        ]
+    ).then(
+        fn=lambda selected_model, *args: res_h.generate_next_batch_background(
+            dit_handler_2 if (dit_handler_2 is not None and selected_model == config_path_2) else dit_handler,
+            llm_handler, *args
+        ),
+        inputs=[
+            generation_section["dit_model_selector"],  # Model selection input
+            generation_section["autogen_checkbox"],
+            results_section["generation_params_state"],
+            results_section["current_batch_index"],
+            results_section["total_batches"],
+            results_section["batch_queue"],
+            results_section["is_format_caption_state"],
+        ],
+        outputs=[
+            results_section["batch_queue"],
+            results_section["total_batches"],
+            results_section["next_batch_status"],
+            results_section["next_batch_btn"],
+        ]
+    )
+    # ========== Batch Navigation Handlers ==========
+    results_section["prev_batch_btn"].click(
+        fn=res_h.navigate_to_previous_batch,
+        inputs=[
+            results_section["current_batch_index"],
+            results_section["batch_queue"],
+        ],
+        outputs=[
+            results_section["generated_audio_1"],
+            results_section["generated_audio_2"],
+            results_section["generated_audio_3"],
+            results_section["generated_audio_4"],
+            results_section["generated_audio_5"],
+            results_section["generated_audio_6"],
+            results_section["generated_audio_7"],
+            results_section["generated_audio_8"],
+            results_section["generated_audio_batch"],
+            results_section["generation_info"],
+            results_section["current_batch_index"],
+            results_section["batch_indicator"],
+            results_section["prev_batch_btn"],
+            results_section["next_batch_btn"],
+            results_section["status_output"],
+            results_section["score_display_1"],
+            results_section["score_display_2"],
+            results_section["score_display_3"],
+            results_section["score_display_4"],
+            results_section["score_display_5"],
+            results_section["score_display_6"],
+            results_section["score_display_7"],
+            results_section["score_display_8"],
+            results_section["codes_display_1"],
+            results_section["codes_display_2"],
+            results_section["codes_display_3"],
+            results_section["codes_display_4"],
+            results_section["codes_display_5"],
+            results_section["codes_display_6"],
+            results_section["codes_display_7"],
+            results_section["codes_display_8"],
+            results_section["lrc_display_1"],
+            results_section["lrc_display_2"],
+            results_section["lrc_display_3"],
+            results_section["lrc_display_4"],
+            results_section["lrc_display_5"],
+            results_section["lrc_display_6"],
+            results_section["lrc_display_7"],
+            results_section["lrc_display_8"],
+            results_section["details_accordion_1"],
+            results_section["details_accordion_2"],
+            results_section["details_accordion_3"],
+            results_section["details_accordion_4"],
+            results_section["details_accordion_5"],
+            results_section["details_accordion_6"],
+            results_section["details_accordion_7"],
+            results_section["details_accordion_8"],
+            results_section["restore_params_btn"],
+        ]
+    )
+    results_section["next_batch_btn"].click(
+        fn=res_h.capture_current_params,
+        inputs=[
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["key_scale"],
+            generation_section["time_signature"],
+            generation_section["vocal_language"],
+            generation_section["inference_steps"],
+            generation_section["guidance_scale"],
+            generation_section["random_seed_checkbox"],
+            generation_section["seed"],
+            generation_section["reference_audio"],
+            generation_section["audio_duration"],
+            generation_section["batch_size_input"],
+            generation_section["src_audio"],
+            generation_section["text2music_audio_code_string"],
+            generation_section["repainting_start"],
+            generation_section["repainting_end"],
+            generation_section["instruction_display_gen"],
+            generation_section["audio_cover_strength"],
+            generation_section["task_type"],
+            generation_section["use_adg"],
+            generation_section["cfg_interval_start"],
+            generation_section["cfg_interval_end"],
+            generation_section["shift"],
+            generation_section["infer_method"],
+            generation_section["custom_timesteps"],
+            generation_section["audio_format"],
+            generation_section["lm_temperature"],
+            generation_section["think_checkbox"],
+            generation_section["lm_cfg_scale"],
+            generation_section["lm_top_k"],
+            generation_section["lm_top_p"],
+            generation_section["lm_negative_prompt"],
+            generation_section["use_cot_metas"],
+            generation_section["use_cot_caption"],
+            generation_section["use_cot_language"],
+            generation_section["constrained_decoding_debug"],
+            generation_section["allow_lm_batch"],
+            generation_section["auto_score"],
+            generation_section["auto_lrc"],
+            generation_section["score_scale"],
+            generation_section["lm_batch_chunk_size"],
+            generation_section["track_name"],
+            generation_section["complete_track_classes"],
+        ],
+        outputs=[results_section["generation_params_state"]]
+    ).then(
+        fn=res_h.navigate_to_next_batch,
+        inputs=[
+            generation_section["autogen_checkbox"],
+            results_section["current_batch_index"],
+            results_section["total_batches"],
+            results_section["batch_queue"],
+        ],
+        outputs=[
+            results_section["generated_audio_1"],
+            results_section["generated_audio_2"],
+            results_section["generated_audio_3"],
+            results_section["generated_audio_4"],
+            results_section["generated_audio_5"],
+            results_section["generated_audio_6"],
+            results_section["generated_audio_7"],
+            results_section["generated_audio_8"],
+            results_section["generated_audio_batch"],
+            results_section["generation_info"],
+            results_section["current_batch_index"],
+            results_section["batch_indicator"],
+            results_section["prev_batch_btn"],
+            results_section["next_batch_btn"],
+            results_section["status_output"],
+            results_section["next_batch_status"],
+            results_section["score_display_1"],
+            results_section["score_display_2"],
+            results_section["score_display_3"],
+            results_section["score_display_4"],
+            results_section["score_display_5"],
+            results_section["score_display_6"],
+            results_section["score_display_7"],
+            results_section["score_display_8"],
+            results_section["codes_display_1"],
+            results_section["codes_display_2"],
+            results_section["codes_display_3"],
+            results_section["codes_display_4"],
+            results_section["codes_display_5"],
+            results_section["codes_display_6"],
+            results_section["codes_display_7"],
+            results_section["codes_display_8"],
+            results_section["lrc_display_1"],
+            results_section["lrc_display_2"],
+            results_section["lrc_display_3"],
+            results_section["lrc_display_4"],
+            results_section["lrc_display_5"],
+            results_section["lrc_display_6"],
+            results_section["lrc_display_7"],
+            results_section["lrc_display_8"],
+            results_section["details_accordion_1"],
+            results_section["details_accordion_2"],
+            results_section["details_accordion_3"],
+            results_section["details_accordion_4"],
+            results_section["details_accordion_5"],
+            results_section["details_accordion_6"],
+            results_section["details_accordion_7"],
+            results_section["details_accordion_8"],
+            results_section["restore_params_btn"],
+        ]
+    ).then(
+        fn=lambda selected_model, *args: res_h.generate_next_batch_background(
+            dit_handler_2 if (dit_handler_2 is not None and selected_model == config_path_2) else dit_handler,
+            llm_handler, *args
+        ),
+        inputs=[
+            generation_section["dit_model_selector"],  # Model selection input
+            generation_section["autogen_checkbox"],
+            results_section["generation_params_state"],
+            results_section["current_batch_index"],
+            results_section["total_batches"],
+            results_section["batch_queue"],
+            results_section["is_format_caption_state"],
+        ],
+        outputs=[
+            results_section["batch_queue"],
+            results_section["total_batches"],
+            results_section["next_batch_status"],
+            results_section["next_batch_btn"],
+        ]
+    )
+    # ========== Restore Parameters Handler ==========
+    results_section["restore_params_btn"].click(
+        fn=res_h.restore_batch_parameters,
+        inputs=[
+            results_section["current_batch_index"],
+            results_section["batch_queue"]
+        ],
+        outputs=[
+            generation_section["text2music_audio_code_string"],
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["key_scale"],
+            generation_section["time_signature"],
+            generation_section["vocal_language"],
+            generation_section["audio_duration"],
+            generation_section["batch_size_input"],
+            generation_section["inference_steps"],
+            generation_section["lm_temperature"],
+            generation_section["lm_cfg_scale"],
+            generation_section["lm_top_k"],
+            generation_section["lm_top_p"],
+            generation_section["think_checkbox"],
+            generation_section["use_cot_caption"],
+            generation_section["use_cot_language"],
+            generation_section["allow_lm_batch"],
+            generation_section["track_name"],
+            generation_section["complete_track_classes"],
+        ]
+    )
+    # ========== LRC Display Change Handlers ==========
+    # NEW APPROACH: Use lrc_display.change() to update audio subtitles
+    # This decouples audio value updates from subtitle updates, avoiding flickering.
+    #
+    # When lrc_display text changes (from generate, LRC button, or manual edit):
+    # 1. lrc_display.change() is triggered
+    # 2. update_audio_subtitles_from_lrc() parses LRC and updates audio subtitles
+    # 3. Audio value is NEVER updated here - only subtitles
+    for lrc_idx in range(1, 9):
+        results_section[f"lrc_display_{lrc_idx}"].change(
+            fn=res_h.update_audio_subtitles_from_lrc,
+            inputs=[
+                results_section[f"lrc_display_{lrc_idx}"],
+                # audio_duration not needed - parse_lrc_to_subtitles calculates end time from timestamps
+            ],
+            outputs=[
+                results_section[f"generated_audio_{lrc_idx}"],  # Only updates subtitles, not value
+            ]
+        )
+def setup_training_event_handlers(demo, dit_handler, llm_handler, training_section):
+    """Setup event handlers for the training tab (dataset builder and LoRA training)"""
+    # ========== Load Existing Dataset (Top Section) ==========
+    # Load existing dataset JSON at the top of Dataset Builder
+    training_section["load_json_btn"].click(
+        fn=train_h.load_existing_dataset_for_preprocess,
+        inputs=[
+            training_section["load_json_path"],
+            training_section["dataset_builder_state"],
+        ],
+        outputs=[
+            training_section["load_json_status"],
+            training_section["audio_files_table"],
+            training_section["sample_selector"],
+            training_section["dataset_builder_state"],
+            # Also update preview fields with first sample
+            training_section["preview_audio"],
+            training_section["preview_filename"],
+            training_section["edit_caption"],
+            training_section["edit_lyrics"],
+            training_section["edit_bpm"],
+            training_section["edit_keyscale"],
+            training_section["edit_timesig"],
+            training_section["edit_duration"],
+            training_section["edit_language"],
+            training_section["edit_instrumental"],
+        ]
+    )
+    # ========== Dataset Builder Handlers ==========
+    # Scan directory for audio files
+    training_section["scan_btn"].click(
+        fn=lambda dir, name, tag, pos, instr, state: train_h.scan_directory(
+            dir, name, tag, pos, instr, state
+        ),
+        inputs=[
+            training_section["audio_directory"],
+            training_section["dataset_name"],
+            training_section["custom_tag"],
+            training_section["tag_position"],
+            training_section["all_instrumental"],
+            training_section["dataset_builder_state"],
+        ],
+        outputs=[
+            training_section["audio_files_table"],
+            training_section["scan_status"],
+            training_section["sample_selector"],
+            training_section["dataset_builder_state"],
+        ]
+    )
+    # Auto-label all samples
+    training_section["auto_label_btn"].click(
+        fn=lambda state, skip: train_h.auto_label_all(dit_handler, llm_handler, state, skip),
+        inputs=[
+            training_section["dataset_builder_state"],
+            training_section["skip_metas"],
+        ],
+        outputs=[
+            training_section["audio_files_table"],
+            training_section["label_progress"],
+            training_section["dataset_builder_state"],
+        ]
+    )
+    # Sample selector change - update preview
+    training_section["sample_selector"].change(
+        fn=train_h.get_sample_preview,
+        inputs=[
+            training_section["sample_selector"],
+            training_section["dataset_builder_state"],
+        ],
+        outputs=[
+            training_section["preview_audio"],
+            training_section["preview_filename"],
+            training_section["edit_caption"],
+            training_section["edit_lyrics"],
+            training_section["edit_bpm"],
+            training_section["edit_keyscale"],
+            training_section["edit_timesig"],
+            training_section["edit_duration"],
+            training_section["edit_language"],
+            training_section["edit_instrumental"],
+        ]
+    )
+    # Save sample edit
+    training_section["save_edit_btn"].click(
+        fn=train_h.save_sample_edit,
+        inputs=[
+            training_section["sample_selector"],
+            training_section["edit_caption"],
+            training_section["edit_lyrics"],
+            training_section["edit_bpm"],
+            training_section["edit_keyscale"],
+            training_section["edit_timesig"],
+            training_section["edit_language"],
+            training_section["edit_instrumental"],
+            training_section["dataset_builder_state"],
+        ],
+        outputs=[
+            training_section["audio_files_table"],
+            training_section["edit_status"],
+            training_section["dataset_builder_state"],
+        ]
+    )
+    # Update settings when changed
+    for trigger in [training_section["custom_tag"], training_section["tag_position"], training_section["all_instrumental"]]:
+        trigger.change(
+            fn=train_h.update_settings,
+            inputs=[
+                training_section["custom_tag"],
+                training_section["tag_position"],
+                training_section["all_instrumental"],
+                training_section["dataset_builder_state"],
+            ],
+            outputs=[training_section["dataset_builder_state"]]
+        )
+    # Save dataset
+    training_section["save_dataset_btn"].click(
+        fn=train_h.save_dataset,
+        inputs=[
+            training_section["save_path"],
+            training_section["dataset_name"],
+            training_section["dataset_builder_state"],
+        ],
+        outputs=[training_section["save_status"]]
+    )
+    # ========== Preprocess Handlers ==========
+    # Load existing dataset JSON for preprocessing
+    # This also updates the preview section so users can view/edit samples
+    training_section["load_existing_dataset_btn"].click(
+        fn=train_h.load_existing_dataset_for_preprocess,
+        inputs=[
+            training_section["load_existing_dataset_path"],
+            training_section["dataset_builder_state"],
+        ],
+        outputs=[
+            training_section["load_existing_status"],
+            training_section["audio_files_table"],
+            training_section["sample_selector"],
+            training_section["dataset_builder_state"],
+            # Also update preview fields with first sample
+            training_section["preview_audio"],
+            training_section["preview_filename"],
+            training_section["edit_caption"],
+            training_section["edit_lyrics"],
+            training_section["edit_bpm"],
+            training_section["edit_keyscale"],
+            training_section["edit_timesig"],
+            training_section["edit_duration"],
+            training_section["edit_language"],
+            training_section["edit_instrumental"],
+        ]
+    )
+    # Preprocess dataset to tensor files
+    training_section["preprocess_btn"].click(
+        fn=lambda output_dir, state: train_h.preprocess_dataset(
+            output_dir, dit_handler, state
+        ),
+        inputs=[
+            training_section["preprocess_output_dir"],
+            training_section["dataset_builder_state"],
+        ],
+        outputs=[training_section["preprocess_progress"]]
+    )
+    # ========== Training Tab Handlers ==========
+    # Load preprocessed tensor dataset
+    training_section["load_dataset_btn"].click(
+        fn=train_h.load_training_dataset,
+        inputs=[training_section["training_tensor_dir"]],
+        outputs=[training_section["training_dataset_info"]]
+    )
+    # Start training from preprocessed tensors
+    def training_wrapper(tensor_dir, r, a, d, lr, ep, bs, ga, se, sh, sd, od, ts):
+        try:
+            for progress, log, plot, state in train_h.start_training(
+                tensor_dir, dit_handler, r, a, d, lr, ep, bs, ga, se, sh, sd, od, ts
+            ):
+                yield progress, log, plot, state
+        except Exception as e:
+            logger.exception("Training wrapper error")
+            yield f"❌ Error: {str(e)}", str(e), None, ts
+    training_section["start_training_btn"].click(
+        fn=training_wrapper,
+        inputs=[
+            training_section["training_tensor_dir"],
+            training_section["lora_rank"],
+            training_section["lora_alpha"],
+            training_section["lora_dropout"],
+            training_section["learning_rate"],
+            training_section["train_epochs"],
+            training_section["train_batch_size"],
+            training_section["gradient_accumulation"],
+            training_section["save_every_n_epochs"],
+            training_section["training_shift"],
+            training_section["training_seed"],
+            training_section["lora_output_dir"],
+            training_section["training_state"],
+        ],
+        outputs=[
+            training_section["training_progress"],
+            training_section["training_log"],
+            training_section["training_loss_plot"],
+            training_section["training_state"],
+        ]
+    )
+    # Stop training
+    training_section["stop_training_btn"].click(
+        fn=train_h.stop_training,
+        inputs=[training_section["training_state"]],
+        outputs=[
+            training_section["training_progress"],
+            training_section["training_state"],
+        ]
+    )
+    # Export LoRA
+    training_section["export_lora_btn"].click(
+        fn=train_h.export_lora,
+        inputs=[
+            training_section["export_path"],
+            training_section["lora_output_dir"],
+        ],
+        outputs=[training_section["export_status"]]
+    )

acestep/gradio_ui/events/generation_handlers.py ADDED Viewed

	@@ -0,0 +1,1071 @@

+"""
+Generation Input Handlers Module
+Contains event handlers and helper functions related to generation inputs
+"""
+import os
+import json
+import random
+import glob
+import gradio as gr
+from typing import Optional, List, Tuple
+from loguru import logger
+from acestep.constants import (
+    TASK_TYPES_TURBO,
+    TASK_TYPES_BASE,
+)
+from acestep.gradio_ui.i18n import t
+from acestep.inference import understand_music, create_sample, format_sample
+# HuggingFace Space environment detection for ZeroGPU support
+IS_HUGGINGFACE_SPACE = os.environ.get("SPACE_ID") is not None
+def _get_spaces_gpu_decorator(duration=120):
+    """
+    Get the @spaces.GPU decorator if running in HuggingFace Space environment.
+    Returns identity decorator if not in Space environment.
+    """
+    if IS_HUGGINGFACE_SPACE:
+        try:
+            import spaces
+            return spaces.GPU(duration=duration)
+        except ImportError:
+            logger.warning("spaces package not found, GPU decorator disabled")
+            return lambda func: func
+    return lambda func: func
+def parse_and_validate_timesteps(
+    timesteps_str: str,
+    inference_steps: int
+) -> Tuple[Optional[List[float]], bool, str]:
+    """
+    Parse timesteps string and validate.
+    Args:
+        timesteps_str: Comma-separated timesteps string (e.g., "0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0")
+        inference_steps: Expected number of inference steps
+    Returns:
+        Tuple of (parsed_timesteps, has_warning, warning_message)
+        - parsed_timesteps: List of float timesteps, or None if invalid/empty
+        - has_warning: Whether a warning was shown
+        - warning_message: Description of the warning
+    """
+    if not timesteps_str or not timesteps_str.strip():
+        return None, False, ""
+    # Parse comma-separated values
+    values = [v.strip() for v in timesteps_str.split(",") if v.strip()]
+    if not values:
+        return None, False, ""
+    # Handle optional trailing 0
+    if values[-1] != "0":
+        values.append("0")
+    try:
+        timesteps = [float(v) for v in values]
+    except ValueError:
+        gr.Warning(t("messages.invalid_timesteps_format"))
+        return None, True, "Invalid format"
+    # Validate range [0, 1]
+    if any(ts < 0 or ts > 1 for ts in timesteps):
+        gr.Warning(t("messages.timesteps_out_of_range"))
+        return None, True, "Out of range"
+    # Check if count matches inference_steps
+    actual_steps = len(timesteps) - 1
+    if actual_steps != inference_steps:
+        gr.Warning(t("messages.timesteps_count_mismatch", actual=actual_steps, expected=inference_steps))
+        return timesteps, True, f"Using {actual_steps} steps from timesteps"
+    return timesteps, False, ""
+def load_metadata(file_obj):
+    """Load generation parameters from a JSON file"""
+    if file_obj is None:
+        gr.Warning(t("messages.no_file_selected"))
+        return [None] * 36 + [False]  # Return None for all fields, False for is_format_caption
+    try:
+        # Read the uploaded file
+        if hasattr(file_obj, 'name'):
+            filepath = file_obj.name
+        else:
+            filepath = file_obj
+        with open(filepath, 'r', encoding='utf-8') as f:
+            metadata = json.load(f)
+        # Extract all fields
+        task_type = metadata.get('task_type', 'text2music')
+        captions = metadata.get('caption', '')
+        lyrics = metadata.get('lyrics', '')
+        vocal_language = metadata.get('vocal_language', 'unknown')
+        # Convert bpm
+        bpm_value = metadata.get('bpm')
+        if bpm_value is not None and bpm_value != "N/A":
+            try:
+                bpm = int(bpm_value) if bpm_value else None
+            except:
+                bpm = None
+        else:
+            bpm = None
+        key_scale = metadata.get('keyscale', '')
+        time_signature = metadata.get('timesignature', '')
+        # Convert duration
+        duration_value = metadata.get('duration', -1)
+        if duration_value is not None and duration_value != "N/A":
+            try:
+                audio_duration = float(duration_value)
+            except:
+                audio_duration = -1
+        else:
+            audio_duration = -1
+        batch_size = metadata.get('batch_size', 2)
+        inference_steps = metadata.get('inference_steps', 8)
+        guidance_scale = metadata.get('guidance_scale', 7.0)
+        seed = metadata.get('seed', '-1')
+        random_seed = False  # Always set to False when loading to enable reproducibility with saved seed
+        use_adg = metadata.get('use_adg', False)
+        cfg_interval_start = metadata.get('cfg_interval_start', 0.0)
+        cfg_interval_end = metadata.get('cfg_interval_end', 1.0)
+        audio_format = metadata.get('audio_format', 'mp3')
+        lm_temperature = metadata.get('lm_temperature', 0.85)
+        lm_cfg_scale = metadata.get('lm_cfg_scale', 2.0)
+        lm_top_k = metadata.get('lm_top_k', 0)
+        lm_top_p = metadata.get('lm_top_p', 0.9)
+        lm_negative_prompt = metadata.get('lm_negative_prompt', 'NO USER INPUT')
+        use_cot_metas = metadata.get('use_cot_metas', True)  # Added: read use_cot_metas
+        use_cot_caption = metadata.get('use_cot_caption', True)
+        use_cot_language = metadata.get('use_cot_language', True)
+        audio_cover_strength = metadata.get('audio_cover_strength', 1.0)
+        think = metadata.get('thinking', True)  # Fixed: read 'thinking' not 'think'
+        audio_codes = metadata.get('audio_codes', '')
+        repainting_start = metadata.get('repainting_start', 0.0)
+        repainting_end = metadata.get('repainting_end', -1)
+        track_name = metadata.get('track_name')
+        complete_track_classes = metadata.get('complete_track_classes', [])
+        shift = metadata.get('shift', 3.0)  # Default 3.0 for base models
+        infer_method = metadata.get('infer_method', 'ode')  # Default 'ode' for diffusion inference
+        custom_timesteps = metadata.get('timesteps', '')  # Custom timesteps (stored as 'timesteps' in JSON)
+        if custom_timesteps is None:
+            custom_timesteps = ''
+        instrumental = metadata.get('instrumental', False)  # Added: read instrumental
+        gr.Info(t("messages.params_loaded", filename=os.path.basename(filepath)))
+        return (
+            task_type, captions, lyrics, vocal_language, bpm, key_scale, time_signature,
+            audio_duration, batch_size, inference_steps, guidance_scale, seed, random_seed,
+            use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method,
+            custom_timesteps,  # Added: custom_timesteps (between infer_method and audio_format)
+            audio_format, lm_temperature, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
+            use_cot_metas, use_cot_caption, use_cot_language, audio_cover_strength,
+            think, audio_codes, repainting_start, repainting_end,
+            track_name, complete_track_classes, instrumental,
+            True  # Set is_format_caption to True when loading from file
+        )
+    except json.JSONDecodeError as e:
+        gr.Warning(t("messages.invalid_json", error=str(e)))
+        return [None] * 36 + [False]
+    except Exception as e:
+        gr.Warning(t("messages.load_error", error=str(e)))
+        return [None] * 36 + [False]
+def load_random_example(task_type: str):
+    """Load a random example from the task-specific examples directory
+    Args:
+        task_type: The task type (e.g., "text2music")
+    Returns:
+        Tuple of (caption, lyrics, think, bpm, duration, keyscale, language, timesignature) for updating UI components
+    """
+    try:
+        # Get the project root directory
+        current_file = os.path.abspath(__file__)
+        # This file is in acestep/gradio_ui/events/, need 4 levels up to reach project root
+        project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file))))
+        # Construct the examples directory path
+        examples_dir = os.path.join(project_root, "examples", task_type)
+        # Check if directory exists
+        if not os.path.exists(examples_dir):
+            gr.Warning(f"Examples directory not found: examples/{task_type}/")
+            return "", "", True, None, None, "", "", ""
+        # Find all JSON files in the directory
+        json_files = glob.glob(os.path.join(examples_dir, "*.json"))
+        if not json_files:
+            gr.Warning(f"No JSON files found in examples/{task_type}/")
+            return "", "", True, None, None, "", "", ""
+        # Randomly select one file
+        selected_file = random.choice(json_files)
+        # Read and parse JSON
+        try:
+            with open(selected_file, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            # Extract caption (prefer 'caption', fallback to 'prompt')
+            caption_value = data.get('caption', data.get('prompt', ''))
+            if not isinstance(caption_value, str):
+                caption_value = str(caption_value) if caption_value else ''
+            # Extract lyrics
+            lyrics_value = data.get('lyrics', '')
+            if not isinstance(lyrics_value, str):
+                lyrics_value = str(lyrics_value) if lyrics_value else ''
+            # Extract think (default to True if not present)
+            think_value = data.get('think', True)
+            if not isinstance(think_value, bool):
+                think_value = True
+            # Extract optional metadata fields
+            bpm_value = None
+            if 'bpm' in data and data['bpm'] not in [None, "N/A", ""]:
+                try:
+                    bpm_value = int(data['bpm'])
+                except (ValueError, TypeError):
+                    pass
+            duration_value = None
+            if 'duration' in data and data['duration'] not in [None, "N/A", ""]:
+                try:
+                    duration_value = float(data['duration'])
+                except (ValueError, TypeError):
+                    pass
+            keyscale_value = data.get('keyscale', '')
+            if keyscale_value in [None, "N/A"]:
+                keyscale_value = ''
+            language_value = data.get('language', '')
+            if language_value in [None, "N/A"]:
+                language_value = ''
+            timesignature_value = data.get('timesignature', '')
+            if timesignature_value in [None, "N/A"]:
+                timesignature_value = ''
+            gr.Info(t("messages.example_loaded", filename=os.path.basename(selected_file)))
+            return caption_value, lyrics_value, think_value, bpm_value, duration_value, keyscale_value, language_value, timesignature_value
+        except json.JSONDecodeError as e:
+            gr.Warning(t("messages.example_failed", filename=os.path.basename(selected_file), error=str(e)))
+            return "", "", True, None, None, "", "", ""
+        except Exception as e:
+            gr.Warning(t("messages.example_error", error=str(e)))
+            return "", "", True, None, None, "", "", ""
+    except Exception as e:
+        gr.Warning(t("messages.example_error", error=str(e)))
+        return "", "", True, None, None, "", "", ""
+def sample_example_smart(llm_handler, task_type: str, constrained_decoding_debug: bool = False):
+    """Smart sample function that uses LM if initialized, otherwise falls back to examples
+    This is a Gradio wrapper that uses the understand_music API from acestep.inference
+    to generate examples when LM is available.
+    Args:
+        llm_handler: LLM handler instance
+        task_type: The task type (e.g., "text2music")
+        constrained_decoding_debug: Whether to enable debug logging for constrained decoding
+    Returns:
+        Tuple of (caption, lyrics, think, bpm, duration, keyscale, language, timesignature) for updating UI components
+    """
+    # Check if LM is initialized
+    if llm_handler.llm_initialized:
+        # Use LM to generate example via understand_music API
+        try:
+            result = understand_music(
+                llm_handler=llm_handler,
+                audio_codes="NO USER INPUT",  # Empty input triggers example generation
+                temperature=0.85,
+                use_constrained_decoding=True,
+                constrained_decoding_debug=constrained_decoding_debug,
+            )
+            if result.success:
+                gr.Info(t("messages.lm_generated"))
+                return (
+                    result.caption,
+                    result.lyrics,
+                    True,  # Always enable think when using LM-generated examples
+                    result.bpm,
+                    result.duration,
+                    result.keyscale,
+                    result.language,
+                    result.timesignature,
+                )
+            else:
+                gr.Warning(t("messages.lm_fallback"))
+                return load_random_example(task_type)
+        except Exception as e:
+            gr.Warning(t("messages.lm_fallback"))
+            return load_random_example(task_type)
+    else:
+        # LM not initialized, use examples directory
+        return load_random_example(task_type)
+def load_random_simple_description():
+    """Load a random description from the simple_mode examples directory.
+    Returns:
+        Tuple of (description, instrumental, vocal_language) for updating UI components
+    """
+    try:
+        # Get the project root directory
+        current_file = os.path.abspath(__file__)
+        # This file is in acestep/gradio_ui/events/, need 4 levels up to reach project root
+        project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file))))
+        # Construct the examples directory path
+        examples_dir = os.path.join(project_root, "examples", "simple_mode")
+        # Check if directory exists
+        if not os.path.exists(examples_dir):
+            gr.Warning(t("messages.simple_examples_not_found"))
+            return gr.update(), gr.update(), gr.update()
+        # Find all JSON files in the directory
+        json_files = glob.glob(os.path.join(examples_dir, "*.json"))
+        if not json_files:
+            gr.Warning(t("messages.simple_examples_empty"))
+            return gr.update(), gr.update(), gr.update()
+        # Randomly select one file
+        selected_file = random.choice(json_files)
+        # Read and parse JSON
+        try:
+            with open(selected_file, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            # Extract fields
+            description = data.get('description', '')
+            instrumental = data.get('instrumental', False)
+            vocal_language = data.get('vocal_language', 'unknown')
+            # Ensure vocal_language is a string
+            if isinstance(vocal_language, list):
+                vocal_language = vocal_language[0] if vocal_language else 'unknown'
+            gr.Info(t("messages.simple_example_loaded", filename=os.path.basename(selected_file)))
+            return description, instrumental, vocal_language
+        except json.JSONDecodeError as e:
+            gr.Warning(t("messages.example_failed", filename=os.path.basename(selected_file), error=str(e)))
+            return gr.update(), gr.update(), gr.update()
+        except Exception as e:
+            gr.Warning(t("messages.example_error", error=str(e)))
+            return gr.update(), gr.update(), gr.update()
+    except Exception as e:
+        gr.Warning(t("messages.example_error", error=str(e)))
+        return gr.update(), gr.update(), gr.update()
+def refresh_checkpoints(dit_handler):
+    """Refresh available checkpoints"""
+    choices = dit_handler.get_available_checkpoints()
+    return gr.update(choices=choices)
+def update_model_type_settings(config_path):
+    """Update UI settings based on model type (fallback when handler not initialized yet)
+    Note: This is used as a fallback when the user changes config_path dropdown
+    before initializing the model. The actual settings are determined by the
+    handler's is_turbo_model() method after initialization.
+    """
+    if config_path is None:
+        config_path = ""
+    config_path_lower = config_path.lower()
+    # Determine is_turbo based on config_path string
+    # This is a heuristic fallback - actual model type is determined after loading
+    if "turbo" in config_path_lower:
+        is_turbo = True
+    elif "base" in config_path_lower:
+        is_turbo = False
+    else:
+        # Default to turbo settings for unknown model types
+        is_turbo = True
+    return get_model_type_ui_settings(is_turbo)
+def init_service_wrapper(dit_handler, llm_handler, checkpoint, config_path, device, init_llm, lm_model_path, backend, use_flash_attention, offload_to_cpu, offload_dit_to_cpu):
+    """Wrapper for service initialization, returns status, button state, accordion state, and model type settings"""
+    # Initialize DiT handler
+    status, enable = dit_handler.initialize_service(
+        checkpoint, config_path, device,
+        use_flash_attention=use_flash_attention, compile_model=False,
+        offload_to_cpu=offload_to_cpu, offload_dit_to_cpu=offload_dit_to_cpu
+    )
+    # Initialize LM handler if requested
+    if init_llm:
+        # Get checkpoint directory
+        current_file = os.path.abspath(__file__)
+        # This file is in acestep/gradio_ui/events/, need 4 levels up to reach project root
+        project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file))))
+        checkpoint_dir = os.path.join(project_root, "checkpoints")
+        lm_status, lm_success = llm_handler.initialize(
+            checkpoint_dir=checkpoint_dir,
+            lm_model_path=lm_model_path,
+            backend=backend,
+            device=device,
+            offload_to_cpu=offload_to_cpu,
+            dtype=dit_handler.dtype
+        )
+        if lm_success:
+            status += f"\n{lm_status}"
+        else:
+            status += f"\n{lm_status}"
+            # Don't fail the entire initialization if LM fails, but log it
+            # Keep enable as is (DiT initialization result) even if LM fails
+    # Check if model is initialized - if so, collapse the accordion
+    is_model_initialized = dit_handler.model is not None
+    accordion_state = gr.Accordion(open=not is_model_initialized)
+    # Get model type settings based on actual loaded model
+    is_turbo = dit_handler.is_turbo_model()
+    model_type_settings = get_model_type_ui_settings(is_turbo)
+    return (
+        status,
+        gr.update(interactive=enable),
+        accordion_state,
+        *model_type_settings
+    )
+def get_model_type_ui_settings(is_turbo: bool):
+    """Get UI settings based on whether the model is turbo or base"""
+    if is_turbo:
+        # Turbo model: max 20 steps, default 8, show shift with default 3.0, only show text2music/repaint/cover
+        return (
+            gr.update(value=8, maximum=20, minimum=1),  # inference_steps
+            gr.update(visible=False),  # guidance_scale
+            gr.update(visible=False),  # use_adg
+            gr.update(value=3.0, visible=True),  # shift (show with default 3.0)
+            gr.update(visible=False),  # cfg_interval_start
+            gr.update(visible=False),  # cfg_interval_end
+            gr.update(choices=TASK_TYPES_TURBO),  # task_type
+        )
+    else:
+        # Base model: max 200 steps, default 32, show CFG/ADG/shift, show all task types
+        return (
+            gr.update(value=32, maximum=200, minimum=1),  # inference_steps
+            gr.update(visible=True),  # guidance_scale
+            gr.update(visible=True),  # use_adg
+            gr.update(value=3.0, visible=True),  # shift (effective for base, default 3.0)
+            gr.update(visible=True),  # cfg_interval_start
+            gr.update(visible=True),  # cfg_interval_end
+            gr.update(choices=TASK_TYPES_BASE),  # task_type
+        )
+def update_negative_prompt_visibility(init_llm_checked):
+    """Update negative prompt visibility: show if Initialize 5Hz LM checkbox is checked"""
+    return gr.update(visible=init_llm_checked)
+def update_audio_cover_strength_visibility(task_type_value, init_llm_checked):
+    """Update audio_cover_strength visibility and label"""
+    # Show if task is cover OR if LM is initialized (but NOT for repaint mode)
+    # Repaint mode never shows this control
+    is_repaint = task_type_value == "repaint"
+    is_cover = task_type_value == "cover"
+    is_visible = is_cover or (init_llm_checked and not is_repaint)
+    # Change label based on context
+    if init_llm_checked and not is_cover:
+        label = "LM codes strength"
+        info = "Control how many denoising steps use LM-generated codes"
+    else:
+        label = "Audio Cover Strength"
+        info = "Control how many denoising steps use cover mode"
+    return gr.update(visible=is_visible, label=label, info=info)
+def convert_src_audio_to_codes_wrapper(dit_handler, src_audio):
+    """Wrapper for converting src audio to codes"""
+    codes_string = dit_handler.convert_src_audio_to_codes(src_audio)
+    return codes_string
+def update_instruction_ui(
+    dit_handler,
+    task_type_value: str,
+    track_name_value: Optional[str],
+    complete_track_classes_value: list,
+    audio_codes_content: str = "",
+    init_llm_checked: bool = False
+) -> tuple:
+    """Update instruction and UI visibility based on task type."""
+    instruction = dit_handler.generate_instruction(
+        task_type=task_type_value,
+        track_name=track_name_value,
+        complete_track_classes=complete_track_classes_value
+    )
+    # Show track_name for lego and extract
+    track_name_visible = task_type_value in ["lego", "extract"]
+    # Show complete_track_classes for complete
+    complete_visible = task_type_value == "complete"
+    # Show audio_cover_strength for cover OR when LM is initialized (but NOT for repaint)
+    is_repaint = task_type_value == "repaint"
+    is_cover = task_type_value == "cover"
+    audio_cover_strength_visible = is_cover or (init_llm_checked and not is_repaint)
+    # Determine label and info based on context
+    if init_llm_checked and not is_cover:
+        audio_cover_strength_label = "LM codes strength"
+        audio_cover_strength_info = "Control how many denoising steps use LM-generated codes"
+    else:
+        audio_cover_strength_label = "Audio Cover Strength"
+        audio_cover_strength_info = "Control how many denoising steps use cover mode"
+    # Show repainting controls for repaint and lego
+    repainting_visible = task_type_value in ["repaint", "lego"]
+    # Show text2music_audio_codes if task is text2music OR if it has content
+    # This allows it to stay visible even if user switches task type but has codes
+    has_audio_codes = audio_codes_content and str(audio_codes_content).strip()
+    text2music_audio_codes_visible = task_type_value == "text2music" or has_audio_codes
+    return (
+        instruction,  # instruction_display_gen
+        gr.update(visible=track_name_visible),  # track_name
+        gr.update(visible=complete_visible),  # complete_track_classes
+        gr.update(visible=audio_cover_strength_visible, label=audio_cover_strength_label, info=audio_cover_strength_info),  # audio_cover_strength
+        gr.update(visible=repainting_visible),  # repainting_group
+        gr.update(visible=text2music_audio_codes_visible),  # text2music_audio_codes_group
+    )
+def transcribe_audio_codes(llm_handler, audio_code_string, constrained_decoding_debug):
+    """
+    Transcribe audio codes to metadata using LLM understanding.
+    If audio_code_string is empty, generate a sample example instead.
+    This is a Gradio wrapper around the understand_music API in acestep.inference.
+    Args:
+        llm_handler: LLM handler instance
+        audio_code_string: String containing audio codes (or empty for example generation)
+        constrained_decoding_debug: Whether to enable debug logging for constrained decoding
+    Returns:
+        Tuple of (status_message, caption, lyrics, bpm, duration, keyscale, language, timesignature, is_format_caption)
+    """
+    # Call the inference API
+    result = understand_music(
+        llm_handler=llm_handler,
+        audio_codes=audio_code_string,
+        use_constrained_decoding=True,
+        constrained_decoding_debug=constrained_decoding_debug,
+    )
+    # Handle error case with localized message
+    if not result.success:
+        # Use localized error message for LLM not initialized
+        if result.error == "LLM not initialized":
+            return t("messages.lm_not_initialized"), "", "", None, None, "", "", "", False
+        return result.status_message, "", "", None, None, "", "", "", False
+    return (
+        result.status_message,
+        result.caption,
+        result.lyrics,
+        result.bpm,
+        result.duration,
+        result.keyscale,
+        result.language,
+        result.timesignature,
+        True  # Set is_format_caption to True (from Transcribe/LM understanding)
+    )
+def update_transcribe_button_text(audio_code_string):
+    """
+    Update the transcribe button text based on input content.
+    If empty: "Generate Example"
+    If has content: "Transcribe"
+    """
+    if not audio_code_string or not audio_code_string.strip():
+        return gr.update(value="Generate Example")
+    else:
+        return gr.update(value="Transcribe")
+def reset_format_caption_flag():
+    """Reset is_format_caption to False when user manually edits caption/metadata"""
+    return False
+def update_audio_uploads_accordion(reference_audio, src_audio):
+    """Update Audio Uploads visibility based on whether audio files are present"""
+    has_audio = (reference_audio is not None) or (src_audio is not None)
+    return gr.update(visible=has_audio)
+def handle_instrumental_checkbox(instrumental_checked, current_lyrics):
+    """
+    Handle instrumental checkbox changes.
+    When checked: if no lyrics, fill with [Instrumental]
+    When unchecked: if lyrics is [Instrumental], clear it
+    """
+    if instrumental_checked:
+        # If checked and no lyrics, fill with [Instrumental]
+        if not current_lyrics or not current_lyrics.strip():
+            return "[Instrumental]"
+        else:
+            # Has lyrics, don't change
+            return current_lyrics
+    else:
+        # If unchecked and lyrics is exactly [Instrumental], clear it
+        if current_lyrics and current_lyrics.strip() == "[Instrumental]":
+            return ""
+        else:
+            # Has other lyrics, don't change
+            return current_lyrics
+def handle_simple_instrumental_change(is_instrumental: bool):
+    """
+    Handle simple mode instrumental checkbox changes.
+    When checked: set vocal_language to "unknown" and disable editing.
+    When unchecked: enable vocal_language editing.
+    Args:
+        is_instrumental: Whether instrumental checkbox is checked
+    Returns:
+        gr.update for simple_vocal_language dropdown
+    """
+    if is_instrumental:
+        return gr.update(value="unknown", interactive=False)
+    else:
+        return gr.update(interactive=True)
+def update_audio_components_visibility(batch_size):
+    """Show/hide individual audio components based on batch size (1-8)
+    Row 1: Components 1-4 (batch_size 1-4)
+    Row 2: Components 5-8 (batch_size 5-8)
+    """
+    # Clamp batch size to 1-8 range for UI
+    batch_size = min(max(int(batch_size), 1), 8)
+    # Row 1 columns (1-4)
+    updates_row1 = (
+        gr.update(visible=True),  # audio_col_1: always visible
+        gr.update(visible=batch_size >= 2),  # audio_col_2
+        gr.update(visible=batch_size >= 3),  # audio_col_3
+        gr.update(visible=batch_size >= 4),  # audio_col_4
+    )
+    # Row 2 container and columns (5-8)
+    show_row_5_8 = batch_size >= 5
+    updates_row2 = (
+        gr.update(visible=show_row_5_8),  # audio_row_5_8 (container)
+        gr.update(visible=batch_size >= 5),  # audio_col_5
+        gr.update(visible=batch_size >= 6),  # audio_col_6
+        gr.update(visible=batch_size >= 7),  # audio_col_7
+        gr.update(visible=batch_size >= 8),  # audio_col_8
+    )
+    return updates_row1 + updates_row2
+def handle_generation_mode_change(mode: str):
+    """
+    Handle generation mode change between Simple, Custom, Cover, and Repaint modes.
+    Modes:
+    - Simple: Show simple mode group, hide others
+    - Custom: Show custom content (prompt), hide others
+    - Cover: Show src_audio_group + custom content + LM codes strength
+    - Repaint: Show src_audio_group + custom content + repaint time controls (hide LM codes strength)
+    Args:
+        mode: "simple", "custom", "cover", or "repaint"
+    Returns:
+        Tuple of updates for:
+        - simple_mode_group (visibility)
+        - custom_mode_content (visibility)
+        - cover_mode_group (visibility) - legacy, always hidden
+        - repainting_group (visibility)
+        - task_type (value)
+        - generate_btn (interactive state)
+        - simple_sample_created (reset state)
+        - src_audio_group (visibility) - shown for cover and repaint
+        - audio_cover_strength (visibility) - shown only for cover mode
+        - think_checkbox (value and interactive) - disabled for cover/repaint modes
+    """
+    is_simple = mode == "simple"
+    is_custom = mode == "custom"
+    is_cover = mode == "cover"
+    is_repaint = mode == "repaint"
+    # Map mode to task_type
+    task_type_map = {
+        "simple": "text2music",
+        "custom": "text2music",
+        "cover": "cover",
+        "repaint": "repaint",
+    }
+    task_type_value = task_type_map.get(mode, "text2music")
+    # think_checkbox: disabled and set to False for cover/repaint modes
+    # (these modes don't use LM thinking, they use source audio codes)
+    if is_cover or is_repaint:
+        think_checkbox_update = gr.update(value=False, interactive=False)
+    else:
+        think_checkbox_update = gr.update(value=True, interactive=True)
+    return (
+        gr.update(visible=is_simple),  # simple_mode_group
+        gr.update(visible=not is_simple),  # custom_mode_content - visible for custom/cover/repaint
+        gr.update(visible=False),  # cover_mode_group - legacy, always hidden
+        gr.update(visible=is_repaint),  # repainting_group - time range controls
+        gr.update(value=task_type_value),  # task_type
+        gr.update(interactive=True),  # generate_btn - always enabled (Simple mode does create+generate in one step)
+        False,  # simple_sample_created - reset to False on mode change
+        gr.update(visible=is_cover or is_repaint),  # src_audio_group - shown for cover and repaint
+        gr.update(visible=is_cover),  # audio_cover_strength - only shown for cover mode
+        think_checkbox_update,  # think_checkbox - disabled for cover/repaint modes
+    )
+def process_source_audio(dit_handler, llm_handler, src_audio, constrained_decoding_debug):
+    """
+    Process source audio: convert to codes and then transcribe.
+    This combines convert_src_audio_to_codes_wrapper + transcribe_audio_codes.
+    Args:
+        dit_handler: DiT handler instance
+        llm_handler: LLM handler instance
+        src_audio: Path to source audio file
+        constrained_decoding_debug: Whether to enable debug logging
+    Returns:
+        Tuple of (audio_codes, status_message, caption, lyrics, bpm, duration, keyscale, language, timesignature, is_format_caption)
+    """
+    if src_audio is None:
+        return ("", "No audio file provided", "", "", None, None, "", "", "", False)
+    # Step 1: Convert audio to codes
+    try:
+        codes_string = dit_handler.convert_src_audio_to_codes(src_audio)
+        if not codes_string:
+            return ("", "Failed to convert audio to codes", "", "", None, None, "", "", "", False)
+    except Exception as e:
+        return ("", f"Error converting audio: {str(e)}", "", "", None, None, "", "", "", False)
+    # Step 2: Transcribe the codes
+    result = understand_music(
+        llm_handler=llm_handler,
+        audio_codes=codes_string,
+        use_constrained_decoding=True,
+        constrained_decoding_debug=constrained_decoding_debug,
+    )
+    # Handle error case
+    if not result.success:
+        if result.error == "LLM not initialized":
+            return (codes_string, t("messages.lm_not_initialized"), "", "", None, None, "", "", "", False)
+        return (codes_string, result.status_message, "", "", None, None, "", "", "", False)
+    return (
+        codes_string,
+        result.status_message,
+        result.caption,
+        result.lyrics,
+        result.bpm,
+        result.duration,
+        result.keyscale,
+        result.language,
+        result.timesignature,
+        True  # Set is_format_caption to True
+    )
+def handle_create_sample(
+    llm_handler,
+    query: str,
+    instrumental: bool,
+    vocal_language: str,
+    lm_temperature: float,
+    lm_top_k: int,
+    lm_top_p: float,
+    constrained_decoding_debug: bool = False,
+):
+    """
+    Handle the Create Sample button click in Simple mode.
+    Creates a sample from the user's query using the LLM, then populates
+    the caption, lyrics, and metadata fields.
+    Note: cfg_scale and negative_prompt are not supported in create_sample mode.
+    Args:
+        llm_handler: LLM handler instance (unused, fetched from registry)
+        query: User's natural language music description
+        instrumental: Whether to generate instrumental music
+        vocal_language: Preferred vocal language for constrained decoding
+        lm_temperature: LLM temperature for generation
+        lm_top_k: LLM top-k sampling
+        lm_top_p: LLM top-p sampling
+        constrained_decoding_debug: Whether to enable debug logging
+    Returns:
+        Tuple of updates for:
+        - captions
+        - lyrics
+        - bpm
+        - audio_duration
+        - key_scale
+        - vocal_language
+        - time_signature
+        - instrumental_checkbox
+        - caption_accordion (open)
+        - lyrics_accordion (open)
+        - generate_btn (interactive)
+        - simple_sample_created (True)
+        - think_checkbox (True)
+        - is_format_caption_state (True)
+        - status_output
+    """
+    # Check if LLM is initialized
+    if not llm_handler.llm_initialized:
+        gr.Warning(t("messages.lm_not_initialized"))
+        return (
+            gr.update(),  # captions - no change
+            gr.update(),  # lyrics - no change
+            gr.update(),  # bpm - no change
+            gr.update(),  # audio_duration - no change
+            gr.update(),  # key_scale - no change
+            gr.update(),  # vocal_language - no change
+            gr.update(),  # time_signature - no change
+            gr.update(),  # instrumental_checkbox - no change
+            gr.update(),  # caption_accordion - no change
+            gr.update(),  # lyrics_accordion - no change
+            gr.update(interactive=False),  # generate_btn - keep disabled
+            False,  # simple_sample_created - still False
+            gr.update(),  # think_checkbox - no change
+            gr.update(),  # is_format_caption_state - no change
+            t("messages.lm_not_initialized"),  # status_output
+        )
+    # Convert LM parameters
+    top_k_value = None if not lm_top_k or lm_top_k == 0 else int(lm_top_k)
+    top_p_value = None if not lm_top_p or lm_top_p >= 1.0 else lm_top_p
+    # Call create_sample API
+    # Note: cfg_scale and negative_prompt are not supported in create_sample mode
+    result = create_sample(
+        llm_handler=llm_handler,
+        query=query,
+        instrumental=instrumental,
+        vocal_language=vocal_language,
+        temperature=lm_temperature,
+        top_k=top_k_value,
+        top_p=top_p_value,
+        use_constrained_decoding=True,
+        constrained_decoding_debug=constrained_decoding_debug,
+    )
+    # Handle error
+    if not result.success:
+        gr.Warning(result.status_message or t("messages.sample_creation_failed"))
+        return (
+            gr.update(),  # captions - no change
+            gr.update(),  # lyrics - no change
+            gr.update(),  # bpm - no change
+            gr.update(),  # audio_duration - no change
+            gr.update(),  # key_scale - no change
+            gr.update(),  # vocal_language - no change
+            gr.update(),  # simple vocal_language - no change
+            gr.update(),  # time_signature - no change
+            gr.update(),  # instrumental_checkbox - no change
+            gr.update(),  # caption_accordion - no change
+            gr.update(),  # lyrics_accordion - no change
+            gr.update(interactive=False),  # generate_btn - keep disabled
+            False,  # simple_sample_created - still False
+            gr.update(),  # think_checkbox - no change
+            gr.update(),  # is_format_caption_state - no change
+            result.status_message or t("messages.sample_creation_failed"),  # status_output
+        )
+    # Success - populate fields
+    gr.Info(t("messages.sample_created"))
+    return (
+        result.caption,  # captions
+        result.lyrics,  # lyrics
+        result.bpm,  # bpm
+        result.duration if result.duration and result.duration > 0 else -1,  # audio_duration
+        result.keyscale,  # key_scale
+        result.language,  # vocal_language
+        result.language,  # simple vocal_language
+        result.timesignature,  # time_signature
+        result.instrumental,  # instrumental_checkbox
+        gr.Accordion(open=True),  # caption_accordion - expand
+        gr.Accordion(open=True),  # lyrics_accordion - expand
+        gr.update(interactive=True),  # generate_btn - enable
+        True,  # simple_sample_created - True
+        True,  # think_checkbox - enable thinking
+        True,  # is_format_caption_state - True (LM-generated)
+        result.status_message,  # status_output
+    )
+def handle_format_sample(
+    llm_handler,
+    caption: str,
+    lyrics: str,
+    bpm,
+    audio_duration,
+    key_scale: str,
+    time_signature: str,
+    lm_temperature: float,
+    lm_top_k: int,
+    lm_top_p: float,
+    constrained_decoding_debug: bool = False,
+):
+    """
+    Handle the Format button click to format caption and lyrics.
+    Takes user-provided caption and lyrics, and uses the LLM to generate
+    structured music metadata and an enhanced description.
+    Note: cfg_scale and negative_prompt are not supported in format mode.
+    Args:
+        llm_handler: LLM handler instance (unused, fetched from registry)
+        caption: User's caption/description
+        lyrics: User's lyrics
+        bpm: User-provided BPM (optional, for constrained decoding)
+        audio_duration: User-provided duration (optional, for constrained decoding)
+        key_scale: User-provided key scale (optional, for constrained decoding)
+        time_signature: User-provided time signature (optional, for constrained decoding)
+        lm_temperature: LLM temperature for generation
+        lm_top_k: LLM top-k sampling
+        lm_top_p: LLM top-p sampling
+        constrained_decoding_debug: Whether to enable debug logging
+    Returns:
+        Tuple of updates for:
+        - captions
+        - lyrics
+        - bpm
+        - audio_duration
+        - key_scale
+        - vocal_language
+        - time_signature
+        - is_format_caption_state
+        - status_output
+    """
+    # Check if LLM is initialized
+    if not llm_handler.llm_initialized:
+        gr.Warning(t("messages.lm_not_initialized"))
+        return (
+            gr.update(),  # captions - no change
+            gr.update(),  # lyrics - no change
+            gr.update(),  # bpm - no change
+            gr.update(),  # audio_duration - no change
+            gr.update(),  # key_scale - no change
+            gr.update(),  # vocal_language - no change
+            gr.update(),  # time_signature - no change
+            gr.update(),  # is_format_caption_state - no change
+            t("messages.lm_not_initialized"),  # status_output
+        )
+    # Build user_metadata from provided values for constrained decoding
+    user_metadata = {}
+    if bpm is not None and bpm > 0:
+        user_metadata['bpm'] = int(bpm)
+    if audio_duration is not None and audio_duration > 0:
+        user_metadata['duration'] = int(audio_duration)
+    if key_scale and key_scale.strip():
+        user_metadata['keyscale'] = key_scale.strip()
+    if time_signature and time_signature.strip():
+        user_metadata['timesignature'] = time_signature.strip()
+    # Only pass user_metadata if we have at least one field
+    user_metadata_to_pass = user_metadata if user_metadata else None
+    # Convert LM parameters
+    top_k_value = None if not lm_top_k or lm_top_k == 0 else int(lm_top_k)
+    top_p_value = None if not lm_top_p or lm_top_p >= 1.0 else lm_top_p
+    # Call format_sample API
+    result = format_sample(
+        llm_handler=llm_handler,
+        caption=caption,
+        lyrics=lyrics,
+        user_metadata=user_metadata_to_pass,
+        temperature=lm_temperature,
+        top_k=top_k_value,
+        top_p=top_p_value,
+        use_constrained_decoding=True,
+        constrained_decoding_debug=constrained_decoding_debug,
+    )
+    # Handle error
+    if not result.success:
+        gr.Warning(result.status_message or t("messages.format_failed"))
+        return (
+            gr.update(),  # captions - no change
+            gr.update(),  # lyrics - no change
+            gr.update(),  # bpm - no change
+            gr.update(),  # audio_duration - no change
+            gr.update(),  # key_scale - no change
+            gr.update(),  # vocal_language - no change
+            gr.update(),  # time_signature - no change
+            gr.update(),  # is_format_caption_state - no change
+            result.status_message or t("messages.format_failed"),  # status_output
+        )
+    # Success - populate fields
+    gr.Info(t("messages.format_success"))
+    return (
+        result.caption,  # captions
+        result.lyrics,  # lyrics
+        result.bpm,  # bpm
+        result.duration if result.duration and result.duration > 0 else -1,  # audio_duration
+        result.keyscale,  # key_scale
+        result.language,  # vocal_language
+        result.timesignature,  # time_signature
+        True,  # is_format_caption_state - True (LM-formatted)
+        result.status_message,  # status_output
+    )

acestep/gradio_ui/events/results_handlers.py ADDED Viewed

The diff for this file is too large to render. See raw diff

acestep/gradio_ui/events/training_handlers.py ADDED Viewed

	@@ -0,0 +1,644 @@

+"""
+Event Handlers for Training Tab
+Contains all event handler functions for the dataset builder and training UI.
+"""
+import os
+import json
+from typing import Any, Dict, List, Tuple, Optional
+from loguru import logger
+import gradio as gr
+from acestep.training.dataset_builder import DatasetBuilder, AudioSample
+def create_dataset_builder() -> DatasetBuilder:
+    """Create a new DatasetBuilder instance."""
+    return DatasetBuilder()
+def scan_directory(
+    audio_dir: str,
+    dataset_name: str,
+    custom_tag: str,
+    tag_position: str,
+    all_instrumental: bool,
+    builder_state: Optional[DatasetBuilder],
+) -> Tuple[Any, str, Any, DatasetBuilder]:
+    """Scan a directory for audio files.
+    Returns:
+        Tuple of (table_data, status, slider_update, builder_state)
+    """
+    if not audio_dir or not audio_dir.strip():
+        return [], "❌ Please enter a directory path", gr.Slider(maximum=0, value=0), builder_state
+    # Create or use existing builder
+    builder = builder_state if builder_state else DatasetBuilder()
+    # Set metadata before scanning
+    builder.metadata.name = dataset_name
+    builder.metadata.custom_tag = custom_tag
+    builder.metadata.tag_position = tag_position
+    builder.metadata.all_instrumental = all_instrumental
+    # Scan directory
+    samples, status = builder.scan_directory(audio_dir.strip())
+    if not samples:
+        return [], status, gr.Slider(maximum=0, value=0), builder
+    # Set instrumental and tag for all samples
+    builder.set_all_instrumental(all_instrumental)
+    if custom_tag:
+        builder.set_custom_tag(custom_tag, tag_position)
+    # Get table data
+    table_data = builder.get_samples_dataframe_data()
+    # Calculate slider max and return as Slider update
+    slider_max = max(0, len(samples) - 1)
+    return table_data, status, gr.Slider(maximum=slider_max, value=0), builder
+def auto_label_all(
+    dit_handler,
+    llm_handler,
+    builder_state: Optional[DatasetBuilder],
+    skip_metas: bool = False,
+    progress=None,
+) -> Tuple[List[List[Any]], str, DatasetBuilder]:
+    """Auto-label all samples in the dataset.
+    Args:
+        dit_handler: DiT handler for audio processing
+        llm_handler: LLM handler for caption generation
+        builder_state: Dataset builder state
+        skip_metas: If True, skip LLM labeling. BPM/Key/TimeSig = N/A, Language = unknown for instrumental
+        progress: Progress callback
+    Returns:
+        Tuple of (table_data, status, builder_state)
+    """
+    if builder_state is None:
+        return [], "❌ Please scan a directory first", builder_state
+    if not builder_state.samples:
+        return [], "❌ No samples to label. Please scan a directory first.", builder_state
+    # If skip_metas is True, just set default values without LLM
+    if skip_metas:
+        for sample in builder_state.samples:
+            sample.bpm = None  # Will display as N/A
+            sample.keyscale = "N/A"
+            sample.timesignature = "N/A"
+            # For instrumental, language should be "unknown"
+            if sample.is_instrumental:
+                sample.language = "unknown"
+            else:
+                sample.language = "unknown"
+            # Use custom tag as caption if set, otherwise use filename
+            if builder_state.metadata.custom_tag:
+                sample.caption = builder_state.metadata.custom_tag
+            else:
+                sample.caption = sample.filename
+        table_data = builder_state.get_samples_dataframe_data()
+        return table_data, f"✅ Skipped AI labeling. {len(builder_state.samples)} samples set with default values.", builder_state
+    # Check if handlers are initialized
+    if dit_handler is None or dit_handler.model is None:
+        return builder_state.get_samples_dataframe_data(), "❌ Model not initialized. Please initialize the service first.", builder_state
+    if llm_handler is None or not llm_handler.llm_initialized:
+        return builder_state.get_samples_dataframe_data(), "❌ LLM not initialized. Please initialize the service with LLM enabled.", builder_state
+    def progress_callback(msg):
+        if progress:
+            try:
+                progress(msg)
+            except:
+                pass
+    # Label all samples
+    samples, status = builder_state.label_all_samples(
+        dit_handler=dit_handler,
+        llm_handler=llm_handler,
+        progress_callback=progress_callback,
+    )
+    # Get updated table data
+    table_data = builder_state.get_samples_dataframe_data()
+    return table_data, status, builder_state
+def get_sample_preview(
+    sample_idx: int,
+    builder_state: Optional[DatasetBuilder],
+) -> Tuple[str, str, str, str, Optional[int], str, str, float, str, bool]:
+    """Get preview data for a specific sample.
+    Returns:
+        Tuple of (audio_path, filename, caption, lyrics, bpm, keyscale, timesig, duration, language, instrumental)
+    """
+    if builder_state is None or not builder_state.samples:
+        return None, "", "", "", None, "", "", 0.0, "instrumental", True
+    idx = int(sample_idx)
+    if idx < 0 or idx >= len(builder_state.samples):
+        return None, "", "", "", None, "", "", 0.0, "instrumental", True
+    sample = builder_state.samples[idx]
+    return (
+        sample.audio_path,
+        sample.filename,
+        sample.caption,
+        sample.lyrics,
+        sample.bpm,
+        sample.keyscale,
+        sample.timesignature,
+        sample.duration,
+        sample.language,
+        sample.is_instrumental,
+    )
+def save_sample_edit(
+    sample_idx: int,
+    caption: str,
+    lyrics: str,
+    bpm: Optional[int],
+    keyscale: str,
+    timesig: str,
+    language: str,
+    is_instrumental: bool,
+    builder_state: Optional[DatasetBuilder],
+) -> Tuple[List[List[Any]], str, DatasetBuilder]:
+    """Save edits to a sample.
+    Returns:
+        Tuple of (table_data, status, builder_state)
+    """
+    if builder_state is None:
+        return [], "❌ No dataset loaded", builder_state
+    idx = int(sample_idx)
+    # Update sample
+    sample, status = builder_state.update_sample(
+        idx,
+        caption=caption,
+        lyrics=lyrics if not is_instrumental else "[Instrumental]",
+        bpm=int(bpm) if bpm else None,
+        keyscale=keyscale,
+        timesignature=timesig,
+        language="instrumental" if is_instrumental else language,
+        is_instrumental=is_instrumental,
+        labeled=True,
+    )
+    # Get updated table data
+    table_data = builder_state.get_samples_dataframe_data()
+    return table_data, status, builder_state
+def update_settings(
+    custom_tag: str,
+    tag_position: str,
+    all_instrumental: bool,
+    builder_state: Optional[DatasetBuilder],
+) -> DatasetBuilder:
+    """Update dataset settings.
+    Returns:
+        Updated builder_state
+    """
+    if builder_state is None:
+        return builder_state
+    if custom_tag:
+        builder_state.set_custom_tag(custom_tag, tag_position)
+    builder_state.set_all_instrumental(all_instrumental)
+    return builder_state
+def save_dataset(
+    save_path: str,
+    dataset_name: str,
+    builder_state: Optional[DatasetBuilder],
+) -> str:
+    """Save the dataset to a JSON file.
+    Returns:
+        Status message
+    """
+    if builder_state is None:
+        return "❌ No dataset to save. Please scan a directory first."
+    if not builder_state.samples:
+        return "❌ No samples in dataset."
+    if not save_path or not save_path.strip():
+        return "❌ Please enter a save path."
+    # Check if any samples are labeled
+    labeled_count = builder_state.get_labeled_count()
+    if labeled_count == 0:
+        return "⚠️ Warning: No samples have been labeled. Consider auto-labeling first.\nSaving anyway..."
+    return builder_state.save_dataset(save_path.strip(), dataset_name)
+def load_existing_dataset_for_preprocess(
+    dataset_path: str,
+    builder_state: Optional[DatasetBuilder],
+) -> Tuple[str, Any, Any, DatasetBuilder, str, str, str, str, Optional[int], str, str, float, str, bool]:
+    """Load an existing dataset JSON file for preprocessing.
+    This allows users to load a previously saved dataset and proceed to preprocessing
+    without having to re-scan and re-label.
+    Returns:
+        Tuple of (status, table_data, slider_update, builder_state,
+                  audio_path, filename, caption, lyrics, bpm, keyscale, timesig, duration, language, instrumental)
+    """
+    empty_preview = (None, "", "", "", None, "", "", 0.0, "instrumental", True)
+    if not dataset_path or not dataset_path.strip():
+        return ("❌ Please enter a dataset path", [], gr.Slider(maximum=0, value=0), builder_state) + empty_preview
+    dataset_path = dataset_path.strip()
+    if not os.path.exists(dataset_path):
+        return (f"❌ Dataset not found: {dataset_path}", [], gr.Slider(maximum=0, value=0), builder_state) + empty_preview
+    # Create new builder (don't reuse old state when loading a file)
+    builder = DatasetBuilder()
+    # Load the dataset
+    samples, status = builder.load_dataset(dataset_path)
+    if not samples:
+        return (status, [], gr.Slider(maximum=0, value=0), builder) + empty_preview
+    # Get table data
+    table_data = builder.get_samples_dataframe_data()
+    # Calculate slider max
+    slider_max = max(0, len(samples) - 1)
+    # Create info text
+    labeled_count = builder.get_labeled_count()
+    info = f"✅ Loaded dataset: {builder.metadata.name}\n"
+    info += f"📊 Samples: {len(samples)} ({labeled_count} labeled)\n"
+    info += f"🏷️ Custom Tag: {builder.metadata.custom_tag or '(none)'}\n"
+    info += "📝 Ready for preprocessing! You can also edit samples below."
+    # Get first sample preview
+    first_sample = builder.samples[0]
+    preview = (
+        first_sample.audio_path,
+        first_sample.filename,
+        first_sample.caption,
+        first_sample.lyrics,
+        first_sample.bpm,
+        first_sample.keyscale,
+        first_sample.timesignature,
+        first_sample.duration,
+        first_sample.language,
+        first_sample.is_instrumental,
+    )
+    return (info, table_data, gr.Slider(maximum=slider_max, value=0), builder) + preview
+def preprocess_dataset(
+    output_dir: str,
+    dit_handler,
+    builder_state: Optional[DatasetBuilder],
+    progress=None,
+) -> str:
+    """Preprocess dataset to tensor files for fast training.
+    This converts audio files to VAE latents and text to embeddings.
+    Returns:
+        Status message
+    """
+    if builder_state is None:
+        return "❌ No dataset loaded. Please scan a directory first."
+    if not builder_state.samples:
+        return "❌ No samples in dataset."
+    labeled_count = builder_state.get_labeled_count()
+    if labeled_count == 0:
+        return "❌ No labeled samples. Please auto-label or manually label samples first."
+    if not output_dir or not output_dir.strip():
+        return "❌ Please enter an output directory."
+    if dit_handler is None or dit_handler.model is None:
+        return "❌ Model not initialized. Please initialize the service first."
+    def progress_callback(msg):
+        if progress:
+            try:
+                progress(msg)
+            except:
+                pass
+    # Run preprocessing
+    output_paths, status = builder_state.preprocess_to_tensors(
+        dit_handler=dit_handler,
+        output_dir=output_dir.strip(),
+        progress_callback=progress_callback,
+    )
+    return status
def load_training_dataset(
    tensor_dir: str,
) -> str:
    """Describe a preprocessed tensor dataset directory for training.

    If ``manifest.json`` is present and readable, the summary is built from
    its ``num_samples`` and ``metadata`` entries. Otherwise the ``.pt``
    files in the directory are counted instead.

    Args:
        tensor_dir: Path to the directory with preprocessed tensors.
            Surrounding whitespace is ignored.

    Returns:
        Info text about the dataset, or an error message.
    """
    if not tensor_dir or not tensor_dir.strip():
        return "❌ Please enter a tensor directory path"
    tensor_dir = tensor_dir.strip()
    if not os.path.exists(tensor_dir):
        return f"❌ Directory not found: {tensor_dir}"
    if not os.path.isdir(tensor_dir):
        return f"❌ Not a directory: {tensor_dir}"

    manifest_path = os.path.join(tensor_dir, "manifest.json")
    manifest_unreadable = False
    if os.path.exists(manifest_path):
        try:
            # Explicit UTF-8: dataset names/tags may be non-ASCII and must
            # not depend on the platform's default locale encoding.
            with open(manifest_path, "r", encoding="utf-8") as f:
                manifest = json.load(f)
            num_samples = manifest.get("num_samples", 0)
            # ``or {}`` also covers an explicit ``"metadata": null``.
            metadata = manifest.get("metadata") or {}
            name = metadata.get("name", "Unknown")
            custom_tag = metadata.get("custom_tag", "")
            info = f"✅ Loaded preprocessed dataset: {name}\n"
            info += f"📊 Samples: {num_samples} preprocessed tensors\n"
            info += f"🏷️ Custom Tag: {custom_tag or '(none)'}"
            return info
        except Exception as e:
            # A broken manifest is not fatal; fall back to counting files.
            manifest_unreadable = True
            logger.warning(f"Failed to read manifest: {e}")

    # Fallback: count .pt files
    pt_files = [f for f in os.listdir(tensor_dir) if f.endswith('.pt')]
    if not pt_files:
        return f"❌ No .pt tensor files found in {tensor_dir}"
    info = f"✅ Found {len(pt_files)} tensor files in {tensor_dir}\n"
    if manifest_unreadable:
        info += "⚠️ manifest.json could not be read - using all .pt files"
    else:
        info += "⚠️ No manifest.json found - using all .pt files"
    return info
+# Training handlers
+import time
+import re
def _format_duration(seconds):
    """Render a duration in seconds as ``Ns``, ``Nm Ns`` or ``Nh Nm``.

    The value is truncated to an int first; hour-scale durations drop the
    seconds component.
    """
    total = int(seconds)
    if total >= 3600:
        hours, remainder = divmod(total, 3600)
        return f"{hours}h {remainder // 60}m"
    if total >= 60:
        minutes, secs = divmod(total, 60)
        return f"{minutes}m {secs}s"
    return f"{total}s"
def start_training(
    tensor_dir: str,
    dit_handler,
    lora_rank: int,
    lora_alpha: int,
    lora_dropout: float,
    learning_rate: float,
    train_epochs: int,
    train_batch_size: int,
    gradient_accumulation: int,
    save_every_n_epochs: int,
    training_shift: float,
    training_seed: int,
    lora_output_dir: str,
    training_state: Dict,
    progress=None,
):
    """Start LoRA training from preprocessed tensors.

    This is a generator function. Every yielded item is a 4-tuple
    ``(status, log_text, loss_data, training_state)``. ``loss_data`` is
    ``None`` for validation errors and a pandas DataFrame with ``step`` and
    ``loss`` columns otherwise.

    ``training_state["is_training"]`` is set while the trainer runs and is
    reset when the generator finishes, fails, or is closed early.
    ``training_state["should_stop"]`` is polled after every update; once it
    is set the run ends with a "stopped" status instead of "completed".

    Args:
        tensor_dir: Directory with preprocessed tensors (whitespace-stripped).
        dit_handler: Initialized model handler with a non-``None`` ``model``.
        lora_rank, lora_alpha, lora_dropout: Forwarded to the LoRA config.
        learning_rate, train_epochs, train_batch_size, gradient_accumulation,
        save_every_n_epochs, training_shift, training_seed, lora_output_dir:
            Forwarded to the training config.
        training_state: Mutable dict shared with ``stop_training``.
        progress: Unused by this function; kept for interface compatibility.
    """
    if not tensor_dir or not tensor_dir.strip():
        yield "❌ Please enter a tensor directory path", "", None, training_state
        return
    tensor_dir = tensor_dir.strip()
    if not os.path.exists(tensor_dir):
        yield f"❌ Tensor directory not found: {tensor_dir}", "", None, training_state
        return
    if dit_handler is None or dit_handler.model is None:
        yield "❌ Model not initialized. Please initialize the service first.", "", None, training_state
        return

    # Probe for the optional training dependencies; the imported names are
    # not used here, only their availability matters.
    try:
        from lightning.fabric import Fabric  # noqa: F401
        from peft import get_peft_model, LoraConfig  # noqa: F401
    except ImportError as e:
        yield f"❌ Missing required packages: {e}\nPlease install: pip install peft lightning", "", None, training_state
        return

    training_state["is_training"] = True
    training_state["should_stop"] = False
    try:
        from acestep.training.trainer import LoRATrainer
        from acestep.training.configs import LoRAConfig as LoRAConfigClass, TrainingConfig

        # Create configs
        lora_config = LoRAConfigClass(
            r=lora_rank,
            alpha=lora_alpha,
            dropout=lora_dropout,
        )
        training_config = TrainingConfig(
            shift=training_shift,
            learning_rate=learning_rate,
            batch_size=train_batch_size,
            gradient_accumulation_steps=gradient_accumulation,
            max_epochs=train_epochs,
            save_every_n_epochs=save_every_n_epochs,
            seed=training_seed,
            output_dir=lora_output_dir,
        )

        import pandas as pd

        # Initialize training log and loss history
        log_lines = []
        loss_data = pd.DataFrame({"step": [0], "loss": [0.0]})

        # Start timer
        start_time = time.time()
        yield f"🚀 Starting training from {tensor_dir}...", "", loss_data, training_state

        # Create trainer
        trainer = LoRATrainer(
            dit_handler=dit_handler,
            lora_config=lora_config,
            training_config=training_config,
        )

        # Collect loss history
        step_list = []
        loss_list = []

        # Train with progress updates using preprocessed tensors
        for step, loss, status in trainer.train_from_preprocessed(tensor_dir, training_state):
            # Calculate elapsed time and ETA
            elapsed_seconds = time.time() - start_time
            time_info = f"⏱️ Elapsed: {_format_duration(elapsed_seconds)}"

            # Parse "Epoch x/y" from status to extrapolate the remaining time
            match = re.search(r"Epoch\s+(\d+)/(\d+)", str(status))
            if match:
                current_ep = int(match.group(1))
                total_ep = int(match.group(2))
                if current_ep > 0:
                    eta_seconds = (elapsed_seconds / current_ep) * (total_ep - current_ep)
                    time_info += f" | ETA: ~{_format_duration(eta_seconds)}"

            # Display status with time info
            display_status = f"{status}\n{time_info}"

            # Terminal log
            logger.info(f"[{_format_duration(elapsed_seconds)}] Step {step}: {status}")

            # UI log keeps only the 15 most recent lines
            log_lines.append(status)
            if len(log_lines) > 15:
                log_lines = log_lines[-15:]
            log_text = "\n".join(log_lines)

            # Track loss for plot (only valid values); NaN is the only value
            # that compares unequal to itself.
            if step > 0 and loss is not None and loss == loss:
                step_list.append(step)
                loss_list.append(float(loss))
                loss_data = pd.DataFrame({"step": step_list, "loss": loss_list})

            yield display_status, log_text, loss_data, training_state

            if training_state.get("should_stop", False):
                break

        total_time = time.time() - start_time
        # Read the flag after the loop as well, so a stop request is
        # reported as "stopped" even if the loop ended without this
        # function seeing the flag at a yield point.
        stopped = bool(training_state.get("should_stop", False))
        # Reset before the final yield so the yielded state is up to date.
        training_state["is_training"] = False

        if stopped:
            stop_msg = "⏹️ Training stopped by user"
            logger.info(stop_msg)
            log_lines.append(stop_msg)
            final_status = f"⏹️ Stopped (⏱️ Elapsed: {_format_duration(total_time)})"
        else:
            final_status = f"✅ Training completed! Total time: {_format_duration(total_time)}"
            logger.info(final_status)
            log_lines.append(final_status)
        yield final_status, "\n".join(log_lines[-15:]), loss_data, training_state
    except Exception as e:
        logger.exception("Training error")
        training_state["is_training"] = False
        import pandas as pd
        empty_df = pd.DataFrame({"step": [], "loss": []})
        yield f"❌ Error: {str(e)}", str(e), empty_df, training_state
    finally:
        # Also covers the generator being closed early by the consumer, which
        # would otherwise leave the state stuck at "training".
        training_state["is_training"] = False
def stop_training(training_state: Dict) -> Tuple[str, Dict]:
    """Request that the running training loop stop.

    When ``training_state`` reports an active run, its ``should_stop`` flag
    is raised in place; otherwise the state is left untouched.

    Returns:
        Tuple of (status, training_state)
    """
    if training_state.get("is_training", False):
        training_state["should_stop"] = True
        return "⏹️ Stopping training...", training_state
    return "⚠️ No training in progress", training_state
def export_lora(
    export_path: str,
    lora_output_dir: str,
) -> str:
    """Export the trained LoRA weights by copying them to ``export_path``.

    The source is ``<lora_output_dir>/final`` when it exists. Otherwise it
    is the highest-numbered ``epoch_<n>`` directory under
    ``<lora_output_dir>/checkpoints``. An existing ``export_path`` is
    replaced.

    Args:
        export_path: Destination directory (whitespace-stripped).
        lora_output_dir: Training output directory to export from.

    Returns:
        Status message
    """
    import shutil

    if not export_path or not export_path.strip():
        return "❌ Please enter an export path"
    export_path = export_path.strip()
    if not lora_output_dir:
        return f"❌ No trained model found in {lora_output_dir}"

    final_dir = os.path.join(lora_output_dir, "final")
    checkpoint_dir = os.path.join(lora_output_dir, "checkpoints")

    # Prefer final, fallback to checkpoints
    if os.path.exists(final_dir):
        source_path = final_dir
    elif os.path.exists(checkpoint_dir):
        # Keep only names whose second "_"-separated field is an integer;
        # entries such as "epoch_best" are skipped instead of crashing the sort.
        numbered = []
        for entry in os.listdir(checkpoint_dir):
            if not entry.startswith("epoch_"):
                continue
            try:
                numbered.append((int(entry.split("_")[1]), entry))
            except (IndexError, ValueError):
                continue
        if not numbered:
            return "❌ No checkpoints found"
        numbered.sort(key=lambda item: item[0])
        source_path = os.path.join(checkpoint_dir, numbered[-1][1])
    else:
        return f"❌ No trained model found in {lora_output_dir}"

    # The destination is deleted before copying. If it is the source, a
    # parent of the source, or inside the source, that would destroy the
    # trained weights or copy a tree into itself, so refuse up front.
    src_real = os.path.normcase(os.path.realpath(source_path))
    dst_real = os.path.normcase(os.path.realpath(export_path))
    try:
        common = os.path.commonpath([src_real, dst_real])
    except ValueError:
        # Raised e.g. for paths on different drives: they cannot overlap.
        common = None
    if common is not None and common in (src_real, dst_real):
        return f"❌ Export failed: export path overlaps the source directory {source_path}"

    try:
        os.makedirs(os.path.dirname(export_path) or ".", exist_ok=True)
        if os.path.isdir(export_path) and not os.path.islink(export_path):
            shutil.rmtree(export_path)
        elif os.path.lexists(export_path):
            # Plain file or symlink: rmtree cannot remove these.
            os.remove(export_path)
        shutil.copytree(source_path, export_path)
        return f"✅ LoRA exported to {export_path}"
    except Exception as e:
        logger.exception("Export error")
        return f"❌ Export failed: {str(e)}"

acestep/gradio_ui/i18n.py ADDED Viewed

	@@ -0,0 +1,152 @@

+"""
+Internationalization (i18n) module for Gradio UI
+Supports multiple languages with easy translation management
+"""
+import os
+import json
+from typing import Dict, Optional
class I18n:
    """Internationalization handler.

    Holds one translation tree per language code (loaded from the JSON files
    in the ``i18n`` directory next to this module) and resolves dot-separated
    keys against the current language, falling back to English and finally
    to the key itself.
    """

    def __init__(self, default_language: str = "en"):
        """
        Initialize i18n handler
        Args:
            default_language: Default language code (en, zh, ja, etc.)
        """
        self.current_language = default_language
        self.translations: Dict[str, Dict[str, str]] = {}
        self._load_all_translations()

    def _load_all_translations(self):
        """Load all translation files from i18n directory"""
        module_dir = os.path.dirname(os.path.abspath(__file__))
        i18n_dir = os.path.join(module_dir, "i18n")
        if not os.path.isdir(i18n_dir):
            # Create the directory on a best-effort basis only: a read-only
            # installation must not make constructing the handler fail.
            try:
                os.makedirs(i18n_dir, exist_ok=True)
            except OSError as e:
                print(f"Warning: could not create i18n directory {i18n_dir}: {e}")
            return
        # Sorted so the language order does not depend on the order in
        # which the filesystem enumerates the directory.
        for filename in sorted(os.listdir(i18n_dir)):
            if not filename.endswith(".json"):
                continue
            lang_code = filename[:-5]  # Remove .json extension
            filepath = os.path.join(i18n_dir, filename)
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    self.translations[lang_code] = json.load(f)
            except Exception as e:
                # One broken file must not prevent the others from loading.
                print(f"Error loading translation file {filename}: {e}")

    def set_language(self, language: str):
        """Set current language; unknown codes leave the current one unchanged"""
        if language in self.translations:
            self.current_language = language
        else:
            print(
                f"Warning: Language '{language}' not found, "
                f"keeping '{self.current_language}'"
            )

    def t(self, key: str, **kwargs) -> str:
        """
        Translate a key to current language
        Args:
            key: Translation key (dot-separated for nested keys)
            **kwargs: Optional format parameters
        Returns:
            Translated string. If formatting with ``kwargs`` fails, the
            unformatted translation is returned.
        """
        # Get translation from current language
        translation = self._get_nested_value(
            self.translations.get(self.current_language, {}),
            key
        )
        # Fallback to English if not found
        if translation is None:
            translation = self._get_nested_value(
                self.translations.get('en', {}),
                key
            )
        # Final fallback to key itself
        if translation is None:
            translation = key
        # Apply formatting if kwargs provided
        if kwargs:
            try:
                translation = translation.format(**kwargs)
            except (KeyError, IndexError, ValueError):
                # KeyError: placeholder not supplied; IndexError: positional
                # placeholder such as "{0}"; ValueError: stray/unbalanced
                # brace in the template. Show the raw template rather than
                # raising into the UI.
                pass
        return translation

    def _get_nested_value(self, data: dict, key: str) -> Optional[str]:
        """
        Get nested dictionary value using dot notation
        Args:
            data: Dictionary to search
            key: Dot-separated key (e.g., "section.subsection.key")
        Returns:
            Value if found and it is a string, None otherwise
        """
        current = data
        for part in key.split('.'):
            if isinstance(current, dict) and part in current:
                current = current[part]
            else:
                return None
        return current if isinstance(current, str) else None

    def get_available_languages(self) -> list:
        """Get list of available language codes"""
        return list(self.translations.keys())
# Process-wide I18n singleton, created lazily by get_i18n()
_i18n_instance: Optional[I18n] = None


def get_i18n(language: Optional[str] = None) -> I18n:
    """
    Return the shared I18n instance, creating it on first use
    Args:
        language: Optional language code. On first use it becomes the
            instance's default (``"en"`` when omitted); on later calls it is
            applied through ``set_language``.
    Returns:
        I18n instance
    """
    global _i18n_instance
    if _i18n_instance is not None:
        if language is not None:
            _i18n_instance.set_language(language)
        return _i18n_instance
    _i18n_instance = I18n(default_language=language or "en")
    return _i18n_instance
def t(key: str, **kwargs) -> str:
    """
    Translate ``key`` through the shared I18n instance
    Args:
        key: Translation key
        **kwargs: Optional format parameters
    Returns:
        Translated string
    """
    translator = get_i18n()
    return translator.t(key, **kwargs)

acestep/gradio_ui/i18n/en.json ADDED Viewed

	@@ -0,0 +1,245 @@

+{
+  "app": {
+    "title": "🎛️ ACE-Step V1.5 Playground💡",
+    "subtitle": "Pushing the Boundaries of Open-Source Music Generation"
+  },
+  "dataset": {
+    "title": "📊 Dataset Explorer",
+    "dataset_label": "Dataset",
+    "dataset_info": "Choose dataset to explore",
+    "import_btn": "📥 Import Dataset",
+    "search_type_label": "Search Type",
+    "search_type_info": "How to find items",
+    "search_value_label": "Search Value",
+    "search_value_placeholder": "Enter keys or index (leave empty for random)",
+    "search_value_info": "Keys: exact match, Index: 0 to dataset size-1",
+    "instruction_label": "📝 Instruction",
+    "instruction_placeholder": "No instruction available",
+    "metadata_title": "📋 Item Metadata (JSON)",
+    "metadata_label": "Complete Item Information",
+    "source_audio": "Source Audio",
+    "target_audio": "Target Audio",
+    "reference_audio": "Reference Audio",
+    "get_item_btn": "🔍 Get Item",
+    "use_src_checkbox": "Use Source Audio from Dataset",
+    "use_src_info": "Check to use the source audio from dataset",
+    "data_status_label": "📊 Data Status",
+    "data_status_default": "❌ No dataset imported",
+    "autofill_btn": "📋 Auto-fill Generation Form"
+  },
+  "service": {
+    "title": "🔧 Service Configuration",
+    "checkpoint_label": "Checkpoint File",
+    "checkpoint_info": "Select a trained model checkpoint file (full path or filename)",
+    "refresh_btn": "🔄 Refresh",
+    "model_path_label": "Main Model Path",
+    "model_path_info": "Select the model configuration directory (auto-scanned from checkpoints)",
+    "device_label": "Device",
+    "device_info": "Processing device (auto-detect recommended)",
+    "lm_model_path_label": "5Hz LM Model Path",
+    "lm_model_path_info": "Select the 5Hz LM model checkpoint (auto-scanned from checkpoints)",
+    "backend_label": "5Hz LM Backend",
+    "backend_info": "Select backend for 5Hz LM: vllm (faster) or pt (PyTorch, more compatible)",
+    "init_llm_label": "Initialize 5Hz LM",
+    "init_llm_info": "Check to initialize 5Hz LM during service initialization",
+    "flash_attention_label": "Use Flash Attention",
+    "flash_attention_info_enabled": "Enable flash attention for faster inference (requires flash_attn package)",
+    "flash_attention_info_disabled": "Flash attention not available (flash_attn package not installed)",
+    "offload_cpu_label": "Offload to CPU",
+    "offload_cpu_info": "Offload models to CPU when not in use to save GPU memory",
+    "offload_dit_cpu_label": "Offload DiT to CPU",
+    "offload_dit_cpu_info": "Offload DiT to CPU (needs Offload to CPU)",
+    "init_btn": "Initialize Service",
+    "status_label": "Status",
+    "language_label": "UI Language",
+    "language_info": "Select interface language"
+  },
+  "generation": {
+    "required_inputs": "📝 Required Inputs",
+    "task_type_label": "Task Type",
+    "task_type_info": "Select the task type for generation",
+    "instruction_label": "Instruction",
+    "instruction_info": "Instruction is automatically generated based on task type",
+    "load_btn": "Load",
+    "track_name_label": "Track Name",
+    "track_name_info": "Select track name for lego/extract tasks",
+    "track_classes_label": "Track Names",
+    "track_classes_info": "Select multiple track classes for complete task",
+    "audio_uploads": "🎵 Audio Uploads",
+    "reference_audio": "Reference Audio (optional)",
+    "source_audio": "Source Audio (optional)",
+    "convert_codes_btn": "Convert to Codes",
+    "lm_codes_hints": "🎼 LM Codes Hints",
+    "lm_codes_label": "LM Codes Hints",
+    "lm_codes_placeholder": "<|audio_code_10695|><|audio_code_54246|>...",
+    "lm_codes_info": "Paste LM codes hints for text2music generation",
+    "lm_codes_sample": "LM Codes Hints (Sample {n})",
+    "lm_codes_sample_info": "Codes for sample {n}",
+    "transcribe_btn": "Transcribe",
+    "repainting_controls": "🎨 Repainting Controls (seconds)",
+    "repainting_start": "Repainting Start",
+    "repainting_end": "Repainting End",
+    "mode_label": "Generation Mode",
+    "mode_info": "Simple: describe music in natural language. Custom: full control over caption and lyrics.",
+    "mode_simple": "Simple",
+    "mode_custom": "Custom",
+    "simple_query_label": "Song Description",
+    "simple_query_placeholder": "Describe the music you want to create, e.g., 'a soft Bengali love song for a quiet evening'. Leave empty for a random sample.",
+    "simple_query_info": "Enter a natural language description of the music you want to generate",
+    "simple_vocal_language_label": "Vocal Language (optional)",
+    "simple_vocal_language_info": "Select preferred language(s) for lyrics. Use 'unknown' for any language.",
+    "create_sample_btn": "Create Sample",
+    "caption_title": "📝 Music Caption",
+    "caption_label": "Music Caption (optional)",
+    "caption_placeholder": "A peaceful acoustic guitar melody with soft vocals...",
+    "caption_info": "Describe the style, genre, instruments, and mood",
+    "lyrics_title": "📝 Lyrics",
+    "lyrics_label": "Lyrics (optional)",
+    "lyrics_placeholder": "[Verse 1]\\nUnder the starry night\\nI feel so alive...",
+    "lyrics_info": "Song lyrics with structure",
+    "instrumental_label": "Instrumental",
+    "format_btn": "Format",
+    "optional_params": "⚙️ Optional Parameters",
+    "vocal_language_label": "Vocal Language (optional)",
+    "vocal_language_info": "use `unknown` for inst",
+    "bpm_label": "BPM (optional)",
+    "bpm_info": "leave empty for N/A",
+    "keyscale_label": "KeyScale (optional)",
+    "keyscale_placeholder": "Leave empty for N/A",
+    "keyscale_info": "A-G, #/♭, major/minor",
+    "timesig_label": "Time Signature (optional)",
+    "timesig_info": "2/4, 3/4, 4/4...",
+    "duration_label": "Audio Duration (seconds)",
+    "duration_info": "Use -1 for random",
+    "batch_size_label": "Batch Size",
+    "batch_size_info": "Number of audio to generate (max 8)",
+    "advanced_settings": "🔧 Advanced Settings",
+    "inference_steps_label": "DiT Inference Steps",
+    "inference_steps_info": "Turbo: max 8, Base: max 200",
+    "guidance_scale_label": "DiT Guidance Scale (Only supported for base models)",
+    "guidance_scale_info": "Higher values follow text more closely",
+    "seed_label": "Seed",
+    "seed_info": "Use comma-separated values for batches",
+    "random_seed_label": "Random Seed",
+    "random_seed_info": "Enable to auto-generate seeds",
+    "audio_format_label": "Audio Format",
+    "audio_format_info": "Audio format for saved files",
+    "use_adg_label": "Use ADG",
+    "use_adg_info": "Enable Angle Domain Guidance",
+    "shift_label": "Shift",
+    "shift_info": "Timestep shift factor for base models (range 1.0~5.0, default 3.0). Not effective for turbo models.",
+    "infer_method_label": "Inference Method",
+    "infer_method_info": "Diffusion inference method. ODE (Euler) is faster, SDE (stochastic) may produce different results.",
+    "custom_timesteps_label": "Custom Timesteps",
+    "custom_timesteps_info": "Optional: comma-separated values from 1.0 to 0.0 (e.g., '0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0'). Overrides inference steps and shift.",
+    "cfg_interval_start": "CFG Interval Start",
+    "cfg_interval_end": "CFG Interval End",
+    "lm_params_title": "🤖 LM Generation Parameters",
+    "lm_temperature_label": "LM Temperature",
+    "lm_temperature_info": "5Hz LM temperature (higher = more random)",
+    "lm_cfg_scale_label": "LM CFG Scale",
+    "lm_cfg_scale_info": "5Hz LM CFG (1.0 = no CFG)",
+    "lm_top_k_label": "LM Top-K",
+    "lm_top_k_info": "Top-K (0 = disabled)",
+    "lm_top_p_label": "LM Top-P",
+    "lm_top_p_info": "Top-P (1.0 = disabled)",
+    "lm_negative_prompt_label": "LM Negative Prompt",
+    "lm_negative_prompt_placeholder": "Enter negative prompt for CFG (default: NO USER INPUT)",
+    "lm_negative_prompt_info": "Negative prompt (use when LM CFG Scale > 1.0)",
+    "cot_metas_label": "CoT Metas",
+    "cot_metas_info": "Use LM to generate CoT metadata (uncheck to skip LM CoT generation)",
+    "cot_language_label": "CoT Language",
+    "cot_language_info": "Generate language in CoT (chain-of-thought)",
+    "constrained_debug_label": "Constrained Decoding Debug",
+    "constrained_debug_info": "Enable debug logging for constrained decoding (check to see detailed logs)",
+    "auto_score_label": "Auto Score",
+    "auto_score_info": "Automatically calculate quality scores for all generated audios",
+    "auto_lrc_label": "Auto LRC",
+    "auto_lrc_info": "Automatically generate LRC lyrics timestamps for all generated audios",
+    "lm_batch_chunk_label": "LM Batch Chunk Size",
+    "lm_batch_chunk_info": "Max items per LM batch chunk (default: 8, limited by GPU memory)",
+    "codes_strength_label": "LM Codes Strength",
+    "codes_strength_info": "Control how many denoising steps use LM-generated codes",
+    "cover_strength_label": "Audio Cover Strength",
+    "cover_strength_info": "Control how many denoising steps use cover mode",
+    "score_sensitivity_label": "Quality Score Sensitivity",
+    "score_sensitivity_info": "Lower = more sensitive (default: 1.0). Adjusts how PMI maps to [0,1]",
+    "think_label": "Think",
+    "parallel_thinking_label": "ParallelThinking",
+    "generate_btn": "🎵 Generate Music",
+    "autogen_label": "AutoGen",
+    "caption_rewrite_label": "CaptionRewrite"
+  },
+  "results": {
+    "title": "🎵 Results",
+    "generated_music": "🎵 Generated Music (Sample {n})",
+    "send_to_src_btn": "🔗 Send To Src Audio",
+    "send_to_cover_btn": "🔗 Send To Cover",
+    "send_to_repaint_btn": "🔗 Send To Repaint",
+    "save_btn": "💾 Save",
+    "score_btn": "📊 Score",
+    "lrc_btn": "🎵 LRC",
+    "quality_score_label": "Quality Score (Sample {n})",
+    "quality_score_placeholder": "Click 'Score' to calculate perplexity-based quality score",
+    "codes_label": "LM Codes (Sample {n})",
+    "lrc_label": "Lyrics Timestamps (Sample {n})",
+    "lrc_placeholder": "Click 'LRC' to generate timestamps",
+    "details_accordion": "📊 Score & LRC & LM Codes",
+    "generation_status": "Generation Status",
+    "current_batch": "Current Batch",
+    "batch_indicator": "Batch {current} / {total}",
+    "next_batch_status": "Next Batch Status",
+    "prev_btn": "◀ Previous",
+    "next_btn": "Next ▶",
+    "restore_params_btn": "↙️ Apply These Settings to UI (Restore Batch Parameters)",
+    "batch_results_title": "📁 Batch Results & Generation Details",
+    "all_files_label": "📁 All Generated Files (Download)",
+    "generation_details": "Generation Details"
+  },
+  "messages": {
+    "no_audio_to_save": "❌ No audio to save",
+    "save_success": "✅ Saved audio and metadata to {filename}",
+    "save_failed": "❌ Failed to save: {error}",
+    "no_file_selected": "⚠️ No file selected",
+    "params_loaded": "✅ Parameters loaded from {filename}",
+    "invalid_json": "❌ Invalid JSON file: {error}",
+    "load_error": "❌ Error loading file: {error}",
+    "example_loaded": "📁 Loaded example from {filename}",
+    "example_failed": "Failed to parse JSON file {filename}: {error}",
+    "example_error": "Error loading example: {error}",
+    "lm_generated": "🤖 Generated example using LM",
+    "lm_fallback": "Failed to generate example using LM, falling back to examples directory",
+    "lm_not_initialized": "❌ 5Hz LM not initialized. Please initialize it first.",
+    "autogen_enabled": "🔄 AutoGen enabled - next batch will generate after this",
+    "batch_ready": "✅ Batch {n} ready! Click 'Next' to view.",
+    "batch_generating": "🔄 Starting background generation for Batch {n}...",
+    "batch_failed": "❌ Background generation failed: {error}",
+    "viewing_batch": "✅ Viewing Batch {n}",
+    "at_first_batch": "Already at first batch",
+    "at_last_batch": "No next batch available",
+    "batch_not_found": "Batch {n} not found in queue",
+    "no_batch_data": "No batch data found to restore.",
+    "params_restored": "✅ UI Parameters restored from Batch {n}",
+    "scoring_failed": "❌ Error: Batch data not found",
+    "no_codes": "❌ No audio codes available. Please generate music first.",
+    "score_failed": "❌ Scoring failed: {error}",
+    "score_error": "❌ Error calculating score: {error}",
+    "lrc_no_batch_data": "❌ No batch data found. Please generate music first.",
+    "lrc_no_extra_outputs": "❌ No extra outputs found. Condition tensors not available.",
+    "lrc_missing_tensors": "❌ Missing required tensors for LRC generation.",
+    "lrc_sample_not_exist": "❌ Sample does not exist in current batch.",
+    "lrc_empty_result": "⚠️ LRC generation produced empty result.",
+    "empty_query": "⚠️ Please enter a music description.",
+    "sample_creation_failed": "❌ Failed to create sample. Please try again.",
+    "sample_created": "✅ Sample created! Review the caption and lyrics, then click Generate Music.",
+    "simple_examples_not_found": "⚠️ Simple mode examples directory not found.",
+    "simple_examples_empty": "⚠️ No example files found in simple mode examples.",
+    "simple_example_loaded": "🎲 Loaded random example from {filename}",
+    "format_success": "✅ Caption and lyrics formatted successfully",
+    "format_failed": "❌ Format failed: {error}",
+    "skipping_metas_cot": "⚡ Skipping Phase 1 metas COT (sample already formatted)",
+    "invalid_timesteps_format": "⚠️ Invalid timesteps format. Using default schedule.",
+    "timesteps_out_of_range": "⚠️ Timesteps must be in range [0, 1]. Using default schedule.",
+    "timesteps_count_mismatch": "⚠️ Timesteps count ({actual}) differs from inference_steps ({expected}). Using timesteps count."
+  }
+}

acestep/gradio_ui/i18n/ja.json ADDED Viewed

	@@ -0,0 +1,245 @@

+{
+  "app": {
+    "title": "🎛️ ACE-Step V1.5 プレイグラウンド💡",
+    "subtitle": "オープンソース音楽生成の限界を押し広げる"
+  },
+  "dataset": {
+    "title": "📊 データセットエクスプローラー",
+    "dataset_label": "データセット",
+    "dataset_info": "探索するデータセットを選択",
+    "import_btn": "📥 データセットをインポート",
+    "search_type_label": "検索タイプ",
+    "search_type_info": "アイテムの検索方法",
+    "search_value_label": "検索値",
+    "search_value_placeholder": "キーまたはインデックスを入力(空白の場合はランダム)",
+    "search_value_info": "キー: 完全一致、インデックス: 0からデータセットサイズ-1",
+    "instruction_label": "📝 指示",
+    "instruction_placeholder": "利用可能な指示がありません",
+    "metadata_title": "📋 アイテムメタデータ (JSON)",
+    "metadata_label": "完全なアイテム情報",
+    "source_audio": "ソースオーディオ",
+    "target_audio": "ターゲットオーディオ",
+    "reference_audio": "リファレンスオーディオ",
+    "get_item_btn": "🔍 アイテムを取得",
+    "use_src_checkbox": "データセットのソースオーディオを使用",
+    "use_src_info": "データセットのソースオーディオを使用する場合はチェック",
+    "data_status_label": "📊 データステータス",
+    "data_status_default": "❌ データセットがインポートされていません",
+    "autofill_btn": "📋 生成フォームを自動入力"
+  },
+  "service": {
+    "title": "🔧 サービス設定",
+    "checkpoint_label": "チェックポイントファイル",
+    "checkpoint_info": "訓練済みモデルのチェックポイントファイルを選択(フルパスまたはファイル名)",
+    "refresh_btn": "🔄 更新",
+    "model_path_label": "メインモデルパス",
+    "model_path_info": "モデル設定ディレクトリを選択(チェックポイントから自動スキャン)",
+    "device_label": "デバイス",
+    "device_info": "処理デバイス(自動検出を推奨)",
+    "lm_model_path_label": "5Hz LM モデルパス",
+    "lm_model_path_info": "5Hz LMモデルチェックポイントを選択(チェックポイントから自動スキャン)",
+    "backend_label": "5Hz LM バックエンド",
+    "backend_info": "5Hz LMのバックエンドを選択: vllm(高速)またはpt(PyTorch、より互換性あり)",
+    "init_llm_label": "5Hz LM を初期化",
+    "init_llm_info": "サービス初期化中に5Hz LMを初期化する場合はチェック",
+    "flash_attention_label": "Flash Attention を使用",
+    "flash_attention_info_enabled": "推論を高速化するためにflash attentionを有効にする(flash_attnパッケージが必要)",
+    "flash_attention_info_disabled": "Flash attentionは利用できません(flash_attnパッケージがインストールされていません)",
+    "offload_cpu_label": "CPUにオフロード",
+    "offload_cpu_info": "使用していない時にモデルをCPUにオフロードしてGPUメモリを節約",
+    "offload_dit_cpu_label": "DiTをCPUにオフロード",
+    "offload_dit_cpu_info": "DiTをCPUにオフロード(CPUへのオフロードが必要)",
+    "init_btn": "サービスを初期化",
+    "status_label": "ステータス",
+    "language_label": "UI言語",
+    "language_info": "インターフェース言語を選択"
+  },
+  "generation": {
+    "required_inputs": "📝 必須入力",
+    "task_type_label": "タスクタイプ",
+    "task_type_info": "生成のタスクタイプを選択",
+    "instruction_label": "指示",
+    "instruction_info": "指示はタスクタイプに基づいて自動生成されます",
+    "load_btn": "読み込む",
+    "track_name_label": "トラック名",
+    "track_name_info": "lego/extractタスクのトラック名を選択",
+    "track_classes_label": "トラック名",
+    "track_classes_info": "completeタスクの複数のトラッククラスを選択",
+    "audio_uploads": "🎵 オーディオアップロード",
+    "reference_audio": "リファレンスオーディオ(オプション)",
+    "source_audio": "ソースオーディオ(オプション)",
+    "convert_codes_btn": "コードに変換",
+    "lm_codes_hints": "🎼 LM コードヒント",
+    "lm_codes_label": "LM コードヒント",
+    "lm_codes_placeholder": "<|audio_code_10695|><|audio_code_54246|>...",
+    "lm_codes_info": "text2music生成用のLMコードヒントを貼り付け",
+    "lm_codes_sample": "LM コードヒント(サンプル {n})",
+    "lm_codes_sample_info": "サンプル{n}のコード",
+    "transcribe_btn": "転写",
+    "repainting_controls": "🎨 再描画コントロール(秒)",
+    "repainting_start": "再描画開始",
+    "repainting_end": "再描画終了",
+    "mode_label": "生成モード",
+    "mode_info": "シンプル：自然言語で音楽を説明。カスタム：キャプションと歌詞を完全にコントロール。",
+    "mode_simple": "シンプル",
+    "mode_custom": "カスタム",
+    "simple_query_label": "曲の説明",
+    "simple_query_placeholder": "作成したい音楽を説明してください。例：'静かな夜のための優しいベンガルのラブソング'。空欄の場合はランダムなサンプルが生成されます。",
+    "simple_query_info": "生成したい音楽の自然言語の説明を入力",
+    "simple_vocal_language_label": "ボーカル言語(オプション)",
+    "simple_vocal_language_info": "歌詞の希望言語を選択。任意の言語の場合は'unknown'を使用。",
+    "create_sample_btn": "サンプル作成",
+    "caption_title": "📝 音楽キャプション",
+    "caption_label": "音楽キャプション(オプション)",
+    "caption_placeholder": "柔らかいボーカルを伴う穏やかなアコースティックギターのメロディー...",
+    "caption_info": "スタイル、ジャンル、楽器、ムードを説明",
+    "lyrics_title": "📝 歌詞",
+    "lyrics_label": "歌詞(オプション)",
+    "lyrics_placeholder": "[バース1]\\n星空の下で\\nとても生きていると感じる...",
+    "lyrics_info": "構造を持つ曲の歌詞",
+    "instrumental_label": "インストゥルメンタル",
+    "format_btn": "フォーマット",
+    "optional_params": "⚙️ オプションパラメータ",
+    "vocal_language_label": "ボーカル言語(オプション)",
+    "vocal_language_info": "インストには`unknown`を使用",
+    "bpm_label": "BPM(オプション)",
+    "bpm_info": "空白の場合はN/A",
+    "keyscale_label": "キースケール(オプション)",
+    "keyscale_placeholder": "空白の場合はN/A",
+    "keyscale_info": "A-G, #/♭, メジャー/マイナー",
+    "timesig_label": "拍子記号(オプション)",
+    "timesig_info": "2/4, 3/4, 4/4...",
+    "duration_label": "オーディオ長(秒)",
+    "duration_info": "ランダムの場合は-1を使用",
+    "batch_size_label": "バッチサイズ",
+    "batch_size_info": "生成するオーディオの数(最大8)",
+    "advanced_settings": "🔧 詳細設定",
+    "inference_steps_label": "DiT 推論ステップ",
+    "inference_steps_info": "Turbo: 最大8、Base: 最大200",
+    "guidance_scale_label": "DiT ガイダンススケール(baseモデルのみサポート)",
+    "guidance_scale_info": "値が高いほどテキストに忠実に従う",
+    "seed_label": "シード",
+    "seed_info": "バッチにはカンマ区切りの値を使用",
+    "random_seed_label": "ランダムシード",
+    "random_seed_info": "有効にすると自動的にシードを生成",
+    "audio_format_label": "オーディオフォーマット",
+    "audio_format_info": "保存ファイルのオーディオフォーマット",
+    "use_adg_label": "ADG を使用",
+    "use_adg_info": "角度ドメインガイダンスを有効化",
+    "shift_label": "シフト",
+    "shift_info": "baseモデル用タイムステップシフト係数 (範囲 1.0~5.0、デフォルト 3.0)。turboモデルには無効。",
+    "infer_method_label": "推論方法",
+    "infer_method_info": "拡散推論方法。ODE (オイラー) は高速、SDE (確率的) は異なる結果を生成する可能性があります。",
+    "custom_timesteps_label": "カスタムタイムステップ",
+    "custom_timesteps_info": "オプション：1.0から0.0へのカンマ区切り値（例：'0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0'）。推論ステップとシフトを上書きします。",
+    "cfg_interval_start": "CFG 間隔開始",
+    "cfg_interval_end": "CFG 間隔終了",
+    "lm_params_title": "🤖 LM 生成パラメータ",
+    "lm_temperature_label": "LM 温度",
+    "lm_temperature_info": "5Hz LM温度(高いほどランダム)",
+    "lm_cfg_scale_label": "LM CFG スケール",
+    "lm_cfg_scale_info": "5Hz LM CFG (1.0 = CFGなし)",
+    "lm_top_k_label": "LM Top-K",
+    "lm_top_k_info": "Top-K (0 = 無効)",
+    "lm_top_p_label": "LM Top-P",
+    "lm_top_p_info": "Top-P (1.0 = 無効)",
+    "lm_negative_prompt_label": "LM ネガティブプロンプト",
+    "lm_negative_prompt_placeholder": "CFGのネガティブプロンプトを入力(デフォルト: NO USER INPUT)",
+    "lm_negative_prompt_info": "ネガティブプロンプト(LM CFGスケール > 1.0の場合に使用)",
+    "cot_metas_label": "CoT メタデータ",
+    "cot_metas_info": "LMを使用してCoTメタデータを生成(チェックを外すとLM CoT生成をスキップ)",
+    "cot_language_label": "CoT 言語",
+    "cot_language_info": "CoTで言語を生成(思考の連鎖)",
+    "constrained_debug_label": "制約付きデコーディングデバッグ",
+    "constrained_debug_info": "制約付きデコーディングのデバッグログを有効化(チェックすると詳細ログを表示)",
+    "auto_score_label": "自動スコアリング",
+    "auto_score_info": "生成されたすべてのオーディオの品質スコアを自動計算",
+    "auto_lrc_label": "自動 LRC",
+    "auto_lrc_info": "生成されたすべてのオーディオのLRC歌詞タイムスタンプを自動生成",
+    "lm_batch_chunk_label": "LM バッチチャンクサイズ",
+    "lm_batch_chunk_info": "LMバッチチャンクあたりの最大アイテム数(デフォルト: 8、GPUメモリによる制限)",
+    "codes_strength_label": "LM コード強度",
+    "codes_strength_info": "LM生成コードを使用するデノイジングステップ数を制御",
+    "cover_strength_label": "オーディオカバー強度",
+    "cover_strength_info": "カバーモードを使用するデノイジングステップ数を制御",
+    "score_sensitivity_label": "品質スコア感度",
+    "score_sensitivity_info": "低い = より敏感(デフォルト: 1.0)。PMIが[0,1]にマッピングする方法を調整",
+    "think_label": "思考",
+    "parallel_thinking_label": "並列思考",
+    "generate_btn": "🎵 音楽を生成",
+    "autogen_label": "自動生成",
+    "caption_rewrite_label": "キャプション書き換え"
+  },
+  "results": {
+    "title": "🎵 結果",
+    "generated_music": "🎵 生成された音楽(サンプル {n})",
+    "send_to_src_btn": "🔗 ソースオーディオに送信",
+    "send_to_cover_btn": "🔗 Send To Cover",
+    "send_to_repaint_btn": "🔗 Send To Repaint",
+    "save_btn": "💾 保存",
+    "score_btn": "📊 スコア",
+    "lrc_btn": "🎵 LRC",
+    "quality_score_label": "品質スコア(サンプル {n})",
+    "quality_score_placeholder": "'スコア'をクリックしてパープレキシティベースの品質スコアを計算",
+    "codes_label": "LM コード(サンプル {n})",
+    "lrc_label": "歌詞タイムスタンプ(サンプル {n})",
+    "lrc_placeholder": "'LRC'をクリックしてタイムスタンプを生成",
+    "details_accordion": "📊 スコア & LRC & LM コード",
+    "generation_status": "生成ステータス",
+    "current_batch": "現在のバッチ",
+    "batch_indicator": "バッチ {current} / {total}",
+    "next_batch_status": "次のバッチステータス",
+    "prev_btn": "◀ 前へ",
+    "next_btn": "次へ ▶",
+    "restore_params_btn": "↙️ これらの設定をUIに適用(バッチパラメータを復元)",
+    "batch_results_title": "📁 バッチ結果と生成詳細",
+    "all_files_label": "📁 すべての生成ファイル(ダウンロード)",
+    "generation_details": "生成詳細"
+  },
+  "messages": {
+    "no_audio_to_save": "❌ 保存するオーディオがありません",
+    "save_success": "✅ オーディオとメタデータを {filename} に保存しました",
+    "save_failed": "❌ 保存に失敗しました: {error}",
+    "no_file_selected": "⚠️ ファイルが選択されていません",
+    "params_loaded": "✅ {filename} からパラメータを読み込みました",
+    "invalid_json": "❌ 無効なJSONファイル: {error}",
+    "load_error": "❌ ファイルの読み込みエラー: {error}",
+    "example_loaded": "📁 {filename} からサンプルを読み込みました",
+    "example_failed": "JSONファイル {filename} の解析に失敗しました: {error}",
+    "example_error": "サンプル読み込みエラー: {error}",
+    "lm_generated": "🤖 LMを使用してサンプルを生成しました",
+    "lm_fallback": "LMを使用したサンプル生成に失敗、サンプルディレクトリにフォールバック",
+    "lm_not_initialized": "❌ 5Hz LMが初期化されていません。最初に初期化してください。",
+    "autogen_enabled": "🔄 自動生成が有効 - このあと次のバッチを生成します",
+    "batch_ready": "✅ バッチ {n} の準備完了！'次へ'をクリックして表示。",
+    "batch_generating": "🔄 バッチ {n} のバックグラウンド生成を開始...",
+    "batch_failed": "❌ バックグラウンド生成に失敗しました: {error}",
+    "viewing_batch": "✅ バッチ {n} を表示中",
+    "at_first_batch": "すでに最初のバッチです",
+    "at_last_batch": "次のバッチはありません",
+    "batch_not_found": "キューにバッチ {n} が見つかりません",
+    "no_batch_data": "復元するバッチデータがありません。",
+    "params_restored": "✅ バッチ {n} からUIパラメータを復元しました",
+    "scoring_failed": "❌ エラー: バッチデータが見つかりません",
+    "no_codes": "❌ 利用可能なオーディオコードがありません。最初に音楽を生成してください。",
+    "score_failed": "❌ スコアリングに失敗しました: {error}",
+    "score_error": "❌ スコア計算エラー: {error}",
+    "lrc_no_batch_data": "❌ バッチデータが見つかりません。最初に音楽を生成してください。",
+    "lrc_no_extra_outputs": "❌ 追加出力が見つかりません。条件テンソルが利用できません。",
+    "lrc_missing_tensors": "❌ LRC生成に必要なテンソルがありません。",
+    "lrc_sample_not_exist": "❌ 現在のバッチにサンプルが存在しません。",
+    "lrc_empty_result": "⚠️ LRC生成の結果が空です。",
+    "empty_query": "⚠️ 音楽の説明を入力してください。",
+    "sample_creation_failed": "❌ サンプルの作成に失敗しました。もう一度お試しください。",
+    "sample_created": "✅ サンプルが作成されました！キャプションと歌詞を確認して、音楽を生成をクリックしてください。",
+    "simple_examples_not_found": "⚠️ シンプルモードサンプルディレクトリが見つかりません。",
+    "simple_examples_empty": "⚠️ シンプルモードサンプルにファイルがありません。",
+    "simple_example_loaded": "🎲 {filename} からランダムサンプルを読み込みました",
+    "format_success": "✅ キャプションと歌詞のフォーマットに成功しました",
+    "format_failed": "❌ フォーマットに失敗しました: {error}",
+    "skipping_metas_cot": "⚡ Phase 1 メタデータ COT をスキップ（サンプルは既にフォーマット済み）",
+    "invalid_timesteps_format": "⚠️ タイムステップ形式が無効です。デフォルトスケジュールを使用します。",
+    "timesteps_out_of_range": "⚠️ タイムステップは [0, 1] の範囲内である必要があります。デフォルトスケジュールを使用します。",
+    "timesteps_count_mismatch": "⚠️ タイムステップ数 ({actual}) が推論ステップ数 ({expected}) と異なります。タイムステップ数を使用します。"
+  }
+}

acestep/gradio_ui/i18n/zh.json ADDED Viewed

	@@ -0,0 +1,245 @@

+{
+  "app": {
+    "title": "🎛️ ACE-Step V1.5 演练场💡",
+    "subtitle": "推动开源音乐生成的边界"
+  },
+  "dataset": {
+    "title": "📊 数据集浏览器",
+    "dataset_label": "数据集",
+    "dataset_info": "选择要浏览的数据集",
+    "import_btn": "📥 导入数据集",
+    "search_type_label": "搜索类型",
+    "search_type_info": "如何查找项目",
+    "search_value_label": "搜索值",
+    "search_value_placeholder": "输入键或索引(留空表示随机)",
+    "search_value_info": "键: 精确匹配, 索引: 0到数据集大小-1",
+    "instruction_label": "📝 指令",
+    "instruction_placeholder": "无可用指令",
+    "metadata_title": "📋 项目元数据 (JSON)",
+    "metadata_label": "完整项目信息",
+    "source_audio": "源音频",
+    "target_audio": "目标音频",
+    "reference_audio": "参考音频",
+    "get_item_btn": "🔍 获取项目",
+    "use_src_checkbox": "使用数据集中的源音频",
+    "use_src_info": "勾选以使用数据集中的源音频",
+    "data_status_label": "📊 数据状态",
+    "data_status_default": "❌ 未导入数据集",
+    "autofill_btn": "📋 自动填充生成表单"
+  },
+  "service": {
+    "title": "🔧 服务配置",
+    "checkpoint_label": "检查点文件",
+    "checkpoint_info": "选择训练好的模型检查点文件(完整路径或文件名)",
+    "refresh_btn": "🔄 刷新",
+    "model_path_label": "主模型路径",
+    "model_path_info": "选择模型配置目录(从检查点自动扫描)",
+    "device_label": "设备",
+    "device_info": "处理设备(建议自动检测)",
+    "lm_model_path_label": "5Hz LM 模型路径",
+    "lm_model_path_info": "选择5Hz LM模型检查点(从检查点自动扫描)",
+    "backend_label": "5Hz LM 后端",
+    "backend_info": "选择5Hz LM的后端: vllm(更快)或pt(PyTorch, 更兼容)",
+    "init_llm_label": "初始化 5Hz LM",
+    "init_llm_info": "勾选以在服务初始化期间初始化5Hz LM",
+    "flash_attention_label": "使用Flash Attention",
+    "flash_attention_info_enabled": "启用flash attention以加快推理速度(需要flash_attn包)",
+    "flash_attention_info_disabled": "Flash attention不可用(未安装flash_attn包)",
+    "offload_cpu_label": "卸载到CPU",
+    "offload_cpu_info": "不使用时将模型卸载到CPU以节省GPU内存",
+    "offload_dit_cpu_label": "将DiT卸载到CPU",
+    "offload_dit_cpu_info": "将DiT卸载到CPU(需要启用卸载到CPU)",
+    "init_btn": "初始化服务",
+    "status_label": "状态",
+    "language_label": "界面语言",
+    "language_info": "选择界面语言"
+  },
+  "generation": {
+    "required_inputs": "📝 必需输入",
+    "task_type_label": "任务类型",
+    "task_type_info": "选择生成的任务类型",
+    "instruction_label": "指令",
+    "instruction_info": "指令根据任务类型自动生成",
+    "load_btn": "加载",
+    "track_name_label": "音轨名称",
+    "track_name_info": "为lego/extract任务选择音轨名称",
+    "track_classes_label": "音轨名称",
+    "track_classes_info": "为complete任务选择多个音轨类别",
+    "audio_uploads": "🎵 音频上传",
+    "reference_audio": "参考音频(可选)",
+    "source_audio": "源音频(可选)",
+    "convert_codes_btn": "转换为代码",
+    "lm_codes_hints": "🎼 LM 代码提示",
+    "lm_codes_label": "LM 代码提示",
+    "lm_codes_placeholder": "<|audio_code_10695|><|audio_code_54246|>...",
+    "lm_codes_info": "粘贴用于text2music生成的LM代码提示",
+    "lm_codes_sample": "LM 代码提示(样本 {n})",
+    "lm_codes_sample_info": "样本{n}的代码",
+    "transcribe_btn": "转录",
+    "repainting_controls": "🎨 重绘控制(秒)",
+    "repainting_start": "重绘开始",
+    "repainting_end": "重绘结束",
+    "mode_label": "生成模式",
+    "mode_info": "简单模式：用自然语言描述音乐。自定义模式：完全控制描述和歌词。",
+    "mode_simple": "简单",
+    "mode_custom": "自定义",
+    "simple_query_label": "歌曲描述",
+    "simple_query_placeholder": "描述你想创作的音乐，例如：'给我生成一首暗黑的戏剧古风，歌词要华丽'。留空则随机生成样本。",
+    "simple_query_info": "输入你想生成的音乐的自然语言描述",
+    "simple_vocal_language_label": "人声语言(可选)",
+    "simple_vocal_language_info": "选择歌词的首选语言。使用 'unknown' 表示任意语言。",
+    "create_sample_btn": "创建样本",
+    "caption_title": "📝 音乐描述",
+    "caption_label": "音乐描述(可选)",
+    "caption_placeholder": "一段平和的原声吉他旋律,配有柔和的人声...",
+    "caption_info": "描述风格、流派、乐器和情绪",
+    "lyrics_title": "📝 歌词",
+    "lyrics_label": "歌词(可选)",
+    "lyrics_placeholder": "[第一段]\\n在星空下\\n我感到如此活跃...",
+    "lyrics_info": "带有结构的歌曲歌词",
+    "instrumental_label": "纯音乐",
+    "format_btn": "格式化",
+    "optional_params": "⚙️ 可选参数",
+    "vocal_language_label": "人声语言(可选)",
+    "vocal_language_info": "纯音乐使用 `unknown`",
+    "bpm_label": "BPM(可选)",
+    "bpm_info": "留空表示N/A",
+    "keyscale_label": "调性(可选)",
+    "keyscale_placeholder": "留空表示N/A",
+    "keyscale_info": "A-G, #/♭, 大调/小调",
+    "timesig_label": "拍号(可选)",
+    "timesig_info": "2/4, 3/4, 4/4...",
+    "duration_label": "音频时长(秒)",
+    "duration_info": "使用-1表示随机",
+    "batch_size_label": "批量大小",
+    "batch_size_info": "要生成的音频数量(最多8个)",
+    "advanced_settings": "🔧 高级设置",
+    "inference_steps_label": "DiT 推理步数",
+    "inference_steps_info": "Turbo: 最多8, Base: 最多200",
+    "guidance_scale_label": "DiT 引导比例(仅支持base模型)",
+    "guidance_scale_info": "更高的值更紧密地遵循文本",
+    "seed_label": "种子",
+    "seed_info": "批量使用逗号分隔的值",
+    "random_seed_label": "随机种子",
+    "random_seed_info": "启用以自动生成种子",
+    "audio_format_label": "音频格式",
+    "audio_format_info": "保存文件的音频格式",
+    "use_adg_label": "使用 ADG",
+    "use_adg_info": "启用角域引导",
+    "shift_label": "Shift",
+    "shift_info": "时间步偏移因子，仅对 base 模型生效 (范围 1.0~5.0，默认 3.0)。对 turbo 模型无效。",
+    "infer_method_label": "推理方法",
+    "infer_method_info": "扩散推理方法。ODE (欧拉) 更快，SDE (随机) 可能产生不同结果。",
+    "custom_timesteps_label": "自定义时间步",
+    "custom_timesteps_info": "可选：从 1.0 到 0.0 的逗号分隔值（例如 '0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0'）。会覆盖推理步数和 shift 设置。",
+    "cfg_interval_start": "CFG 间隔开始",
+    "cfg_interval_end": "CFG 间隔结束",
+    "lm_params_title": "🤖 LM 生成参数",
+    "lm_temperature_label": "LM 温度",
+    "lm_temperature_info": "5Hz LM温度(越高越随机)",
+    "lm_cfg_scale_label": "LM CFG 比例",
+    "lm_cfg_scale_info": "5Hz LM CFG (1.0 = 无CFG)",
+    "lm_top_k_label": "LM Top-K",
+    "lm_top_k_info": "Top-K (0 = 禁用)",
+    "lm_top_p_label": "LM Top-P",
+    "lm_top_p_info": "Top-P (1.0 = 禁用)",
+    "lm_negative_prompt_label": "LM 负面提示",
+    "lm_negative_prompt_placeholder": "输入CFG的负面提示(默认: NO USER INPUT)",
+    "lm_negative_prompt_info": "负面提示(当LM CFG比例 > 1.0时使用)",
+    "cot_metas_label": "CoT 元数据",
+    "cot_metas_info": "使用LM生成CoT元数据(取消勾选以跳过LM CoT生成)",
+    "cot_language_label": "CoT 语言",
+    "cot_language_info": "在CoT中生成语言(思维链)",
+    "constrained_debug_label": "约束解码调试",
+    "constrained_debug_info": "启用约束解码的调试日志(勾选以查看详细日志)",
+    "auto_score_label": "自动评分",
+    "auto_score_info": "自动计算所有生成音频的质量分数",
+    "auto_lrc_label": "自动 LRC",
+    "auto_lrc_info": "自动为所有生成的音频生成LRC歌词时间戳",
+    "lm_batch_chunk_label": "LM 批量块大小",
+    "lm_batch_chunk_info": "每个LM批量块的最大项目数(默认: 8, 受GPU内存限制)",
+    "codes_strength_label": "LM 代码强度",
+    "codes_strength_info": "控制使用LM生成代码的去噪步骤数量",
+    "cover_strength_label": "音频覆盖强度",
+    "cover_strength_info": "控制使用覆盖模式的去噪步骤数量",
+    "score_sensitivity_label": "质量评分敏感度",
+    "score_sensitivity_info": "更低 = 更敏感(默认: 1.0). 调整PMI如何映射到[0,1]",
+    "think_label": "思考",
+    "parallel_thinking_label": "并行思考",
+    "generate_btn": "🎵 生成音乐",
+    "autogen_label": "自动生成",
+    "caption_rewrite_label": "描述重写"
+  },
+  "results": {
+    "title": "🎵 结果",
+    "generated_music": "🎵 生成的音乐(样本 {n})",
+    "send_to_src_btn": "🔗 发送到源音频",
+    "send_to_cover_btn": "🔗 Send To Cover",
+    "send_to_repaint_btn": "🔗 Send To Repaint",
+    "save_btn": "💾 保存",
+    "score_btn": "📊 评分",
+    "lrc_btn": "🎵 LRC",
+    "quality_score_label": "质量分数(样本 {n})",
+    "quality_score_placeholder": "点击'评分'以计算基于困惑度的质量分数",
+    "codes_label": "LM 代码(样本 {n})",
+    "lrc_label": "歌词时间戳(样本 {n})",
+    "lrc_placeholder": "点击'LRC'生成时间戳",
+    "details_accordion": "📊 评分与LRC与LM代码",
+    "generation_status": "生成状态",
+    "current_batch": "当前批次",
+    "batch_indicator": "批次 {current} / {total}",
+    "next_batch_status": "下一批次状态",
+    "prev_btn": "◀ 上一个",
+    "next_btn": "下一个 ▶",
+    "restore_params_btn": "↙️ 将这些设置应用到UI(恢复批次参数)",
+    "batch_results_title": "📁 批量结果和生成详情",
+    "all_files_label": "📁 所有生成的文件(下载)",
+    "generation_details": "生成详情"
+  },
+  "messages": {
+    "no_audio_to_save": "❌ 没有要保存的音频",
+    "save_success": "✅ 已将音频和元数据保存到 {filename}",
+    "save_failed": "❌ 保存失败: {error}",
+    "no_file_selected": "⚠️ 未选择文件",
+    "params_loaded": "✅ 已从 {filename} 加载参数",
+    "invalid_json": "❌ 无效的JSON文件: {error}",
+    "load_error": "❌ 加载文件时出错: {error}",
+    "example_loaded": "📁 已从 {filename} 加载示例",
+    "example_failed": "解析JSON文件 {filename} 失败: {error}",
+    "example_error": "加载示例时出错: {error}",
+    "lm_generated": "🤖 使用LM生成的示例",
+    "lm_fallback": "使用LM生成示例失败,回退到示例目录",
+    "lm_not_initialized": "❌ 5Hz LM未初始化。请先初始化它。",
+    "autogen_enabled": "🔄 已启用自动生成 - 下一批次将在此之后生成",
+    "batch_ready": "✅ 批次 {n} 就绪!点击'下一个'查看。",
+    "batch_generating": "🔄 开始为批次 {n} 进行后台生成...",
+    "batch_failed": "❌ 后台生成失败: {error}",
+    "viewing_batch": "✅ 查看批次 {n}",
+    "at_first_batch": "已在第一批次",
+    "at_last_batch": "没有下一批次可用",
+    "batch_not_found": "在队列中未找到批次 {n}",
+    "no_batch_data": "没有要恢复的批次数据。",
+    "params_restored": "✅ 已从批次 {n} 恢复UI参数",
+    "scoring_failed": "❌ 错误: 未找到批次数据",
+    "no_codes": "❌ 没有可用的音频代码。请先生成音乐。",
+    "score_failed": "❌ 评分失败: {error}",
+    "score_error": "❌ 计算分数时出错: {error}",
+    "lrc_no_batch_data": "❌ 未找到批次数据。请先生成音乐。",
+    "lrc_no_extra_outputs": "❌ 未找到额外输出。条件张量不可用。",
+    "lrc_missing_tensors": "❌ 缺少LRC生成所需的张量。",
+    "lrc_sample_not_exist": "❌ 当前批次中不存在该样本。",
+    "lrc_empty_result": "⚠️ LRC生成结果为空。",
+    "empty_query": "⚠️ 请输入音乐描述。",
+    "sample_creation_failed": "❌ 创建样本失败。请重试。",
+    "sample_created": "✅ 样本已创建！检查描述和歌词，然后点击生成音乐。",
+    "simple_examples_not_found": "⚠️ 未找到简单模式示例目录。",
+    "simple_examples_empty": "⚠️ 简单模式示例中没有示例文件。",
+    "simple_example_loaded": "🎲 已从 {filename} 加载随机示例",
+    "format_success": "✅ 描述和歌词格式化成功",
+    "format_failed": "❌ 格式化失败: {error}",
+    "skipping_metas_cot": "⚡ 跳过 Phase 1 元数据 COT（样本已格式化）",
+    "invalid_timesteps_format": "⚠️ 时间步格式无效，使用默认调度。",
+    "timesteps_out_of_range": "⚠️ 时间步必须在 [0, 1] 范围内，使用默认调度。",
+    "timesteps_count_mismatch": "⚠️ 时间步数量 ({actual}) 与推理步数 ({expected}) 不匹配，将使用时间步数量。"
+  }
+}

acestep/gradio_ui/interfaces/__init__.py ADDED Viewed

	@@ -0,0 +1,105 @@

+"""
+Gradio UI Components Module
+Contains all Gradio interface component definitions and layouts
+"""
+import gradio as gr
+from acestep.gradio_ui.i18n import get_i18n, t
+from acestep.gradio_ui.interfaces.dataset import create_dataset_section
+from acestep.gradio_ui.interfaces.generation import create_generation_section
+from acestep.gradio_ui.interfaces.result import create_results_section
+from acestep.gradio_ui.interfaces.training import create_training_section
+from acestep.gradio_ui.events import setup_event_handlers, setup_training_event_handlers
+def create_gradio_interface(dit_handler, llm_handler, dataset_handler, init_params=None, language='en') -> gr.Blocks:
+    """
+    Build the top-level Gradio Blocks app and wire up its event handlers.
+    Inside a single ``gr.Blocks`` context this creates, in order: the HTML
+    header banner, the dataset explorer section, the generation section, the
+    results section and the training section. It then connects the main and
+    the training event handlers and returns the Blocks object.
+    Args:
+        dit_handler: DiT handler instance
+        llm_handler: LM handler instance
+        dataset_handler: Dataset handler instance
+        init_params: Dictionary containing initialization parameters and state.
+                    If None, service will not be pre-initialized.
+        language: UI language code ('en', 'zh', 'ja', default: 'en')
+    Returns:
+        Gradio Blocks instance
+    """
+    # Initialize i18n with selected language
+    # NOTE(review): the returned object is never read in this function; the call
+    # is presumably kept because it selects the language used by t() below -
+    # confirm in acestep.gradio_ui.i18n before removing the binding.
+    i18n = get_i18n(language)
+    # Page title comes from the translation table; the CSS below styles the
+    # header and the "lm-hints" row/column/button classes.
+    with gr.Blocks(
+        title=t("app.title"),
+        theme=gr.themes.Soft(),
+        css="""
+        .main-header {
+            text-align: center;
+            margin-bottom: 2rem;
+        }
+        .section-header {
+            background: linear-gradient(90deg, #4CAF50, #45a049);
+            color: white;
+            padding: 10px;
+            border-radius: 5px;
+            margin: 10px 0;
+        }
+        .lm-hints-row {
+            align-items: stretch;
+        }
+        .lm-hints-col {
+            display: flex;
+        }
+        .lm-hints-col > div {
+            flex: 1;
+            display: flex;
+        }
+        .lm-hints-btn button {
+            height: 100%;
+            width: 100%;
+        }
+        """
+    ) as demo:
+        # Header banner: translated title/subtitle plus a fixed English promo
+        # box and a row of external project links.
+        gr.HTML(f"""
+        <div class="main-header">
+            <h1>{t("app.title")}</h1>
+            <p>{t("app.subtitle")}</p>
+            <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 10px 20px; border-radius: 8px; text-align: center; margin: 8px auto; max-width: 600px;">
+                <span style="color: white; font-size: 15px;">
+                    🚀 Want faster &amp; more stable experience? Try
+                    <a href="https://acemusic.ai" target="_blank" style="color: #ffd700; font-weight: bold; text-decoration: underline;">acemusic.ai</a>
+                    — 100% free!
+                </span>
+            </div>
+            <p style="margin-top: 0.5rem;">
+                <a href="https://ace-step.github.io/ace-step-v1.5.github.io/" target="_blank">Project</a> |
+                <a href="https://huggingface.co/collections/ACE-Step/ace-step-15" target="_blank">Hugging Face</a> |
+                <a href="https://modelscope.cn/models/ACE-Step/ACE-Step-v1-5" target="_blank">ModelScope</a> |
+                <a href="https://github.com/ACE-Step/ACE-Step-1.5" target="_blank">GitHub</a> |
+                <a href="https://discord.gg/PeWDxrkdj7" target="_blank">Discord</a> |
+                <a href="https://arxiv.org/abs/2602.00744" target="_blank">Technical Report</a>
+            </p>
+        </div>
+        """)
+        # The sections below are created in this order inside the Blocks
+        # context; keep the order stable when editing.
+        # Dataset Explorer Section
+        dataset_section = create_dataset_section(dataset_handler)
+        # Generation Section (pass init_params and language to support pre-initialization)
+        generation_section = create_generation_section(dit_handler, llm_handler, init_params=init_params, language=language)
+        # Results Section
+        results_section = create_results_section(dit_handler)
+        # Training Section (LoRA training and dataset builder)
+        # Pass init_params to support hiding in service mode
+        training_section = create_training_section(dit_handler, llm_handler, init_params=init_params)
+        # Connect event handlers (pass init_params for multi-model support)
+        setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, dataset_section, generation_section, results_section, init_params=init_params)
+        # Connect training event handlers
+        setup_training_event_handlers(demo, dit_handler, llm_handler, training_section)
+    return demo

acestep/gradio_ui/interfaces/dataset.py ADDED Viewed

	@@ -0,0 +1,101 @@

+"""
+Gradio UI Dataset Section Module
+Contains dataset explorer section component definitions
+"""
+import gradio as gr
+def create_dataset_section(dataset_handler) -> dict:
+    """Create the dataset explorer section and return its components.
+    Must be called inside an active ``gr.Blocks`` context. The whole section
+    lives in an accordion that is created collapsed and with ``visible=False``.
+    Args:
+        dataset_handler: Dataset handler instance. It is not referenced in
+            this function body.
+    Returns:
+        Dict mapping component names (e.g. ``"dataset_type"``,
+        ``"get_item_btn"``, ``"data_status"``) to the created Gradio
+        components.
+    """
+    # NOTE(review): all labels below are hard-coded English strings, while the
+    # i18n JSON files define a "dataset" section with matching keys (title,
+    # dataset_label, import_btn, ...). Consider routing them through t().
+    with gr.Accordion("📊 Dataset Explorer", open=False, visible=False):
+        # Row 1: dataset split selector, import button and item lookup controls.
+        with gr.Row(equal_height=True):
+            dataset_type = gr.Dropdown(
+                choices=["train", "test"],
+                value="train",
+                label="Dataset",
+                info="Choose dataset to explore",
+                scale=2
+            )
+            import_dataset_btn = gr.Button("📥 Import Dataset", variant="primary", scale=1)
+            search_type = gr.Dropdown(
+                choices=["keys", "idx", "random"],
+                value="random",
+                label="Search Type",
+                info="How to find items",
+                scale=1
+            )
+            search_value = gr.Textbox(
+                label="Search Value",
+                placeholder="Enter keys or index (leave empty for random)",
+                info="Keys: exact match, Index: 0 to dataset size-1",
+                scale=2
+            )
+        # Read-only display of the selected item's instruction text.
+        instruction_display = gr.Textbox(
+            label="📝 Instruction",
+            interactive=False,
+            placeholder="No instruction available",
+            lines=1
+        )
+        # Plot placeholder; what gets drawn here is decided by the event handlers.
+        repaint_viz_plot = gr.Plot()
+        with gr.Accordion("📋 Item Metadata (JSON)", open=False):
+            item_info_json = gr.Code(
+                label="Complete Item Information",
+                language="json",
+                interactive=False,
+                lines=15
+            )
+        # Source audio preview next to the (initially non-interactive) fetch button.
+        with gr.Row(equal_height=True):
+            item_src_audio = gr.Audio(
+                label="Source Audio",
+                type="filepath",
+                interactive=False,
+                scale=8
+            )
+            get_item_btn = gr.Button("🔍 Get Item", variant="secondary", interactive=False, scale=2)
+        # Target and reference audio previews (read-only).
+        with gr.Row(equal_height=True):
+            item_target_audio = gr.Audio(
+                label="Target Audio",
+                type="filepath",
+                interactive=False,
+                scale=8
+            )
+            item_refer_audio = gr.Audio(
+                label="Reference Audio",
+                type="filepath",
+                interactive=False,
+                scale=2
+            )
+        with gr.Row():
+            use_src_checkbox = gr.Checkbox(
+                label="Use Source Audio from Dataset",
+                value=True,
+                info="Check to use the source audio from dataset"
+            )
+        # Status line starts in the "no dataset imported" state.
+        data_status = gr.Textbox(label="📊 Data Status", interactive=False, value="❌ No dataset imported")
+        auto_fill_btn = gr.Button("📋 Auto-fill Generation Form", variant="primary")
+    # Expose every component by name so callers can attach event handlers.
+    return {
+        "dataset_type": dataset_type,
+        "import_dataset_btn": import_dataset_btn,
+        "search_type": search_type,
+        "search_value": search_value,
+        "instruction_display": instruction_display,
+        "repaint_viz_plot": repaint_viz_plot,
+        "item_info_json": item_info_json,
+        "item_src_audio": item_src_audio,
+        "get_item_btn": get_item_btn,
+        "item_target_audio": item_target_audio,
+        "item_refer_audio": item_refer_audio,
+        "use_src_checkbox": use_src_checkbox,
+        "data_status": data_status,
+        "auto_fill_btn": auto_fill_btn,
+    }

acestep/gradio_ui/interfaces/generation.py ADDED Viewed

	@@ -0,0 +1,694 @@

+"""
+Gradio UI Generation Section Module
+Contains generation section component definitions - Simplified UI
+"""
+import gradio as gr
+from acestep.constants import (
+    VALID_LANGUAGES,
+    TRACK_NAMES,
+    TASK_TYPES_TURBO,
+    TASK_TYPES_BASE,
+    DEFAULT_DIT_INSTRUCTION,
+)
+from acestep.gradio_ui.i18n import t
+def create_generation_section(dit_handler, llm_handler, init_params=None, language='en') -> dict:
+    """Create generation section with simplified UI
+    Args:
+        dit_handler: DiT handler instance
+        llm_handler: LM handler instance
+        init_params: Dictionary containing initialization parameters and state.
+                    If None, service will not be pre-initialized.
+        language: UI language code ('en', 'zh', 'ja')
+    """
+    # Check if service is pre-initialized
+    service_pre_initialized = init_params is not None and init_params.get('pre_initialized', False)
+    # Check if running in service mode (restricted UI)
+    service_mode = init_params is not None and init_params.get('service_mode', False)
+    # Get current language from init_params if available
+    current_language = init_params.get('language', language) if init_params else language
+    # Get available models
+    available_dit_models = init_params.get('available_dit_models', []) if init_params else []
+    current_model_value = init_params.get('config_path', '') if init_params else ''
+    show_model_selector = len(available_dit_models) > 1
+    with gr.Group():
+        # ==================== Service Configuration (Hidden in service mode) ====================
+        accordion_open = not service_pre_initialized
+        accordion_visible = not service_pre_initialized
+        with gr.Accordion(t("service.title"), open=accordion_open, visible=accordion_visible) as service_config_accordion:
+            # Language selector at the top
+            with gr.Row():
+                language_dropdown = gr.Dropdown(
+                    choices=[
+                        ("English", "en"),
+                        ("中文", "zh"),
+                        ("日本語", "ja"),
+                    ],
+                    value=current_language,
+                    label=t("service.language_label"),
+                    info=t("service.language_info"),
+                    scale=1,
+                )
+            with gr.Row(equal_height=True):
+                with gr.Column(scale=4):
+                    checkpoint_value = init_params.get('checkpoint') if service_pre_initialized else None
+                    checkpoint_dropdown = gr.Dropdown(
+                        label=t("service.checkpoint_label"),
+                        choices=dit_handler.get_available_checkpoints(),
+                        value=checkpoint_value,
+                        info=t("service.checkpoint_info")
+                    )
+                with gr.Column(scale=1, min_width=90):
+                    refresh_btn = gr.Button(t("service.refresh_btn"), size="sm")
+            with gr.Row():
+                available_models = dit_handler.get_available_acestep_v15_models()
+                default_model = "acestep-v15-turbo" if "acestep-v15-turbo" in available_models else (available_models[0] if available_models else None)
+                config_path_value = init_params.get('config_path', default_model) if service_pre_initialized else default_model
+                config_path = gr.Dropdown(
+                    label=t("service.model_path_label"),
+                    choices=available_models,
+                    value=config_path_value,
+                    info=t("service.model_path_info")
+                )
+                device_value = init_params.get('device', 'auto') if service_pre_initialized else 'auto'
+                device = gr.Dropdown(
+                    choices=["auto", "cuda", "cpu"],
+                    value=device_value,
+                    label=t("service.device_label"),
+                    info=t("service.device_info")
+                )
+            with gr.Row():
+                available_lm_models = llm_handler.get_available_5hz_lm_models()
+                default_lm_model = "acestep-5Hz-lm-0.6B" if "acestep-5Hz-lm-0.6B" in available_lm_models else (available_lm_models[0] if available_lm_models else None)
+                lm_model_path_value = init_params.get('lm_model_path', default_lm_model) if service_pre_initialized else default_lm_model
+                lm_model_path = gr.Dropdown(
+                    label=t("service.lm_model_path_label"),
+                    choices=available_lm_models,
+                    value=lm_model_path_value,
+                    info=t("service.lm_model_path_info")
+                )
+                backend_value = init_params.get('backend', 'vllm') if service_pre_initialized else 'vllm'
+                backend_dropdown = gr.Dropdown(
+                    choices=["vllm", "pt"],
+                    value=backend_value,
+                    label=t("service.backend_label"),
+                    info=t("service.backend_info")
+                )
+            with gr.Row():
+                init_llm_value = init_params.get('init_llm', True) if service_pre_initialized else True
+                init_llm_checkbox = gr.Checkbox(
+                    label=t("service.init_llm_label"),
+                    value=init_llm_value,
+                    info=t("service.init_llm_info"),
+                )
+                flash_attn_available = dit_handler.is_flash_attention_available()
+                use_flash_attention_value = init_params.get('use_flash_attention', flash_attn_available) if service_pre_initialized else flash_attn_available
+                use_flash_attention_checkbox = gr.Checkbox(
+                    label=t("service.flash_attention_label"),
+                    value=use_flash_attention_value,
+                    interactive=flash_attn_available,
+                    info=t("service.flash_attention_info_enabled") if flash_attn_available else t("service.flash_attention_info_disabled")
+                )
+                offload_to_cpu_value = init_params.get('offload_to_cpu', False) if service_pre_initialized else False
+                offload_to_cpu_checkbox = gr.Checkbox(
+                    label=t("service.offload_cpu_label"),
+                    value=offload_to_cpu_value,
+                    info=t("service.offload_cpu_info")
+                )
+                offload_dit_to_cpu_value = init_params.get('offload_dit_to_cpu', False) if service_pre_initialized else False
+                offload_dit_to_cpu_checkbox = gr.Checkbox(
+                    label=t("service.offload_dit_cpu_label"),
+                    value=offload_dit_to_cpu_value,
+                    info=t("service.offload_dit_cpu_info")
+                )
+            init_btn = gr.Button(t("service.init_btn"), variant="primary", size="lg")
+            init_status_value = init_params.get('init_status', '') if service_pre_initialized else ''
+            init_status = gr.Textbox(label=t("service.status_label"), interactive=False, lines=3, value=init_status_value)
+            # LoRA Configuration Section
+            gr.HTML("<hr><h4>🔧 LoRA Adapter</h4>")
+            with gr.Row():
+                lora_path = gr.Textbox(
+                    label="LoRA Path",
+                    placeholder="./lora_output/final/adapter",
+                    info="Path to trained LoRA adapter directory",
+                    scale=3,
+                )
+                load_lora_btn = gr.Button("📥 Load LoRA", variant="secondary", scale=1)
+                unload_lora_btn = gr.Button("🗑️ Unload", variant="secondary", scale=1)
+            with gr.Row():
+                use_lora_checkbox = gr.Checkbox(
+                    label="Use LoRA",
+                    value=False,
+                    info="Enable LoRA adapter for inference",
+                    scale=1,
+                )
+                lora_status = gr.Textbox(
+                    label="LoRA Status",
+                    value="No LoRA loaded",
+                    interactive=False,
+                    scale=2,
+                )
+        # ==================== Model Selector (Top, only when multiple models) ====================
+        with gr.Row(visible=show_model_selector):
+            dit_model_selector = gr.Dropdown(
+                choices=available_dit_models,
+                value=current_model_value,
+                label="models",
+                scale=1,
+            )
+        # Hidden dropdown when only one model (for event handler compatibility)
+        if not show_model_selector:
+            dit_model_selector = gr.Dropdown(
+                choices=available_dit_models if available_dit_models else [current_model_value],
+                value=current_model_value,
+                visible=False,
+            )
+        # ==================== Generation Mode (4 modes) ====================
+        gr.HTML("<div style='background: #4a5568; color: white; padding: 8px 16px; border-radius: 4px; font-weight: bold;'>Generation Mode</div>")
+        with gr.Row():
+            generation_mode = gr.Radio(
+                choices=[
+                    ("Simple", "simple"),
+                    ("Custom", "custom"),
+                    ("Cover", "cover"),
+                    ("Repaint", "repaint"),
+                ],
+                value="custom",
+                label="",
+                show_label=False,
+            )
+        # ==================== Simple Mode Group ====================
+        with gr.Column(visible=False) as simple_mode_group:
+            # Row: Song Description + Vocal Language + Random button
+            with gr.Row(equal_height=True):
+                simple_query_input = gr.Textbox(
+                    label=t("generation.simple_query_label"),
+                    placeholder=t("generation.simple_query_placeholder"),
+                    lines=2,
+                    info=t("generation.simple_query_info"),
+                    scale=10,
+                )
+                simple_vocal_language = gr.Dropdown(
+                    choices=VALID_LANGUAGES,
+                    value="unknown",
+                    allow_custom_value=True,
+                    label=t("generation.simple_vocal_language_label"),
+                    interactive=True,
+                    info="use unknown for instrumental",
+                    scale=2,
+                )
+                with gr.Column(scale=1, min_width=60):
+                    random_desc_btn = gr.Button(
+                        "🎲",
+                        variant="primary",
+                        size="lg",
+                    )
+            # Hidden components (kept for compatibility but not shown)
+            simple_instrumental_checkbox = gr.Checkbox(
+                label=t("generation.instrumental_label"),
+                value=False,
+                visible=False,
+            )
+            create_sample_btn = gr.Button(
+                t("generation.create_sample_btn"),
+                variant="primary",
+                size="lg",
+                visible=False,
+            )
+        # State to track if sample has been created in Simple mode
+        simple_sample_created = gr.State(value=False)
+        # ==================== Source Audio (for Cover/Repaint) ====================
+        # This is shown above the main content for Cover and Repaint modes
+        with gr.Column(visible=False) as src_audio_group:
+            with gr.Row(equal_height=True):
+                # Source Audio - scale=10 to match (refer_audio=2 + prompt/lyrics=8)
+                src_audio = gr.Audio(
+                    label="Source Audio",
+                    type="filepath",
+                    scale=10,
+                )
+                # Process button - scale=1 to align with random button
+                with gr.Column(scale=1, min_width=80):
+                    process_src_btn = gr.Button(
+                        "Analyze",
+                        variant="secondary",
+                        size="lg",
+                    )
+        # Hidden Audio Codes storage (needed internally but not displayed)
+        text2music_audio_code_string = gr.Textbox(
+            label="Audio Codes",
+            visible=False,
+        )
+        # ==================== Custom/Cover/Repaint Mode Content ====================
+        with gr.Column() as custom_mode_content:
+            with gr.Row(equal_height=True):
+                # Left: Reference Audio
+                with gr.Column(scale=2, min_width=200):
+                    reference_audio = gr.Audio(
+                        label="Reference Audio (optional)",
+                        type="filepath",
+                        show_label=True,
+                    )
+                # Middle: Prompt + Lyrics + Format button
+                with gr.Column(scale=8):
+                    # Row 1: Prompt and Lyrics
+                    with gr.Row(equal_height=True):
+                        captions = gr.Textbox(
+                            label="Prompt",
+                            placeholder="Describe the music style, mood, instruments...",
+                            lines=12,
+                            max_lines=12,
+                            scale=1,
+                        )
+                        lyrics = gr.Textbox(
+                            label="Lyrics",
+                            placeholder="Enter lyrics here... Use [Verse], [Chorus] etc. for structure",
+                            lines=12,
+                            max_lines=12,
+                            scale=1,
+                        )
+                    # Row 2: Format button (only below Prompt and Lyrics)
+                    format_btn = gr.Button(
+                        "Format",
+                        variant="secondary",
+                    )
+                # Right: Random button
+                with gr.Column(scale=1, min_width=60):
+                    sample_btn = gr.Button(
+                        "🎲",
+                        variant="primary",
+                        size="lg",
+                    )
+        # Placeholder for removed audio_uploads_accordion (for compatibility)
+        audio_uploads_accordion = gr.Column(visible=False)
+        # Legacy cover_mode_group (hidden, for backward compatibility)
+        cover_mode_group = gr.Column(visible=False)
+        # Legacy convert button (hidden, for backward compatibility)
+        convert_src_to_codes_btn = gr.Button("Convert to Codes", visible=False)
+        # ==================== Repaint Mode: Time Range (source audio lives in src_audio_group above) ====================
+        with gr.Column(visible=False) as repainting_group:
+            with gr.Row():
+                repainting_start = gr.Number(
+                    label="Start (seconds)",
+                    value=0.0,
+                    step=0.1,
+                    scale=1,
+                )
+                repainting_end = gr.Number(
+                    label="End (seconds, -1 for end)",
+                    value=-1,
+                    minimum=-1,
+                    step=0.1,
+                    scale=1,
+                )
+        # ==================== Optional Parameters ====================
+        with gr.Accordion("⚙️ Optional Parameters", open=False, visible=False) as optional_params_accordion:
+            pass
+        # ==================== Advanced Settings ====================
+        with gr.Accordion("🔧 Advanced Settings", open=False) as advanced_options_accordion:
+            with gr.Row():
+                bpm = gr.Number(
+                    label="BPM (optional)",
+                    value=0,
+                    step=1,
+                    info="leave empty for N/A",
+                    scale=1,
+                )
+                key_scale = gr.Textbox(
+                    label="Key Signature (optional)",
+                    placeholder="Leave empty for N/A",
+                    value="",
+                    info="A-G, #/♭, major/minor",
+                    scale=1,
+                )
+                time_signature = gr.Dropdown(
+                    choices=["", "2", "3", "4"],
+                    value="",
+                    label="Time Signature (optional)",
+                    allow_custom_value=True,
+                    info="2/4, 3/4, 4/4...",
+                    scale=1,
+                )
+                audio_duration = gr.Number(
+                    label="Audio Duration (seconds)",
+                    value=-1,
+                    minimum=-1,
+                    maximum=600.0,
+                    step=1,
+                    info="Use -1 for auto, or 10-600 seconds",
+                    scale=1,
+                )
+                vocal_language = gr.Dropdown(
+                    choices=VALID_LANGUAGES,
+                    value="unknown",
+                    label="Vocal Language",
+                    allow_custom_value=True,
+                    info="use `unknown` for instrumental",
+                    scale=1,
+                )
+                batch_size_input = gr.Number(
+                    label="batch size",
+                    info="max 8",
+                    value=2,
+                    minimum=1,
+                    maximum=8,
+                    step=1,
+                    scale=1,
+                    interactive=False,
+                )
+            # Row 1: DiT Inference Steps, Seed, Audio Format
+            with gr.Row():
+                inference_steps = gr.Slider(
+                    minimum=1,
+                    maximum=20,
+                    value=8,
+                    step=1,
+                    label="DiT Inference Steps",
+                    info="Turbo: max 8, Base: max 200",
+                )
+                seed = gr.Textbox(
+                    label="Seed",
+                    value="-1",
+                    info="Use comma-separated values for batches",
+                )
+                audio_format = gr.Dropdown(
+                    choices=["mp3", "flac"],
+                    value="mp3",
+                    label="Audio Format",
+                    info="Audio format for saved files",
+                )
+            # Row 2: Shift, Random Seed, Inference Method
+            with gr.Row():
+                shift = gr.Slider(
+                    minimum=1.0,
+                    maximum=5.0,
+                    value=3.0,
+                    step=0.1,
+                    label="Shift",
+                    info="Timestep shift factor for base models (range 1.0-5.0, default 3.0). Not effective for turbo models.",
+                )
+                random_seed_checkbox = gr.Checkbox(
+                    label="Random Seed",
+                    value=True,
+                    info="Enable to auto-generate seeds",
+                )
+                infer_method = gr.Dropdown(
+                    choices=["ode", "sde"],
+                    value="ode",
+                    label="Inference Method",
+                    info="Diffusion inference method. ODE (Euler) is faster, SDE (stochastic) may produce different results.",
+                )
+            # Row 3: Custom Timesteps (full width)
+            custom_timesteps = gr.Textbox(
+                label="Custom Timesteps",
+                placeholder="0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0",
+                value="",
+                info="Optional: comma-separated values from 1.0 to 0.0 (e.g., '0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0'). Overrides inference steps and shift.",
+            )
+            # Section: LM Generation Parameters
+            gr.HTML("<h4>🎵 LM Generation Parameters</h4>")
+            # Row 4: LM Temperature, LM CFG Scale, LM Top-K, LM Top-P
+            with gr.Row():
+                lm_temperature = gr.Slider(
+                    minimum=0.0,
+                    maximum=2.0,
+                    value=0.85,
+                    step=0.05,
+                    label="LM Temperature",
+                    info="5Hz LM temperature (higher = more random)",
+                )
+                lm_cfg_scale = gr.Slider(
+                    minimum=1.0,
+                    maximum=3.0,
+                    value=2.0,
+                    step=0.1,
+                    label="LM CFG Scale",
+                    info="5Hz LM CFG (1.0 = no CFG)",
+                )
+                lm_top_k = gr.Slider(
+                    minimum=0,
+                    maximum=100,
+                    value=0,
+                    step=1,
+                    label="LM Top-K",
+                    info="Top-k (0 = disabled)",
+                )
+                lm_top_p = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=0.9,
+                    step=0.01,
+                    label="LM Top-P",
+                    info="Top-p (1.0 = disabled)",
+                )
+            # Row 5: LM Negative Prompt (full width)
+            lm_negative_prompt = gr.Textbox(
+                label="LM Negative Prompt",
+                value="NO USER INPUT",
+                placeholder="Things to avoid in generation...",
+                lines=2,
+                info="Negative prompt (use when LM CFG Scale > 1.0)",
+            )
+            # audio_cover_strength remains hidden for now
+            audio_cover_strength = gr.Slider(minimum=0.0, maximum=1.0, value=1.0, visible=False)
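+        # Note: audio_duration, bpm, key_scale, time_signature are now visible in the Advanced Settings
+        # accordion above; the Optional Parameters accordion is kept only as a hidden, empty placeholder.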
+        # ==================== Generate Button Row ====================
+        generate_btn_interactive = init_params.get('enable_generate', False) if service_pre_initialized else False
+        with gr.Row(equal_height=True):
+            # Left: Thinking and Instrumental checkboxes
+            with gr.Column(scale=1, min_width=120):
+                think_checkbox = gr.Checkbox(
+                    label="Thinking",
+                    value=True,
+                )
+                instrumental_checkbox = gr.Checkbox(
+                    label="Instrumental",
+                    value=False,
+                )
+            # Center: Generate button
+            with gr.Column(scale=4):
+                generate_btn = gr.Button(
+                    "🎵 Generate Music",
+                    variant="primary",
+                    size="lg",
+                    interactive=generate_btn_interactive,
+                )
+            # Right: auto_score, auto_lrc
+            with gr.Column(scale=1, min_width=120):
+                auto_score = gr.Checkbox(
+                    label="Get Scores",
+                    value=False,
+                )
+                auto_lrc = gr.Checkbox(
+                    label="Get LRC",
+                    value=False,
+                )
+        # ==================== Hidden Components (for internal use) ====================
+        # These are needed for event handlers but not shown in UI
+        # Task type (set automatically based on generation_mode)
+        actual_model = init_params.get('config_path', 'acestep-v15-turbo') if service_pre_initialized else 'acestep-v15-turbo'
+        actual_model_lower = (actual_model or "").lower()
+        if "turbo" in actual_model_lower:
+            initial_task_choices = TASK_TYPES_TURBO
+        else:
+            initial_task_choices = TASK_TYPES_BASE
+        task_type = gr.Dropdown(
+            choices=initial_task_choices,
+            value="text2music",
+            visible=False,
+        )
+        instruction_display_gen = gr.Textbox(
+            value=DEFAULT_DIT_INSTRUCTION,
+            visible=False,
+        )
+        track_name = gr.Dropdown(
+            choices=TRACK_NAMES,
+            value=None,
+            visible=False,
+        )
+        complete_track_classes = gr.CheckboxGroup(
+            choices=TRACK_NAMES,
+            visible=False,
+        )
+        # Note: lyrics and format_btn are now visible in custom_mode_content; vocal_language is in
+        # Advanced Settings and instrumental_checkbox is in the generate button row.
+        # Hidden advanced settings (keep defaults)
+        # Note: Most parameters are now visible in Advanced Settings section above
+        guidance_scale = gr.Slider(value=7.0, visible=False)
+        use_adg = gr.Checkbox(value=False, visible=False)
+        cfg_interval_start = gr.Slider(value=0.0, visible=False)
+        cfg_interval_end = gr.Slider(value=1.0, visible=False)
+        # LM parameters (remaining hidden ones)
+        use_cot_metas = gr.Checkbox(value=True, visible=False)
+        use_cot_caption = gr.Checkbox(value=True, visible=False)
+        use_cot_language = gr.Checkbox(value=True, visible=False)
+        constrained_decoding_debug = gr.Checkbox(value=False, visible=False)
+        allow_lm_batch = gr.Checkbox(value=True, visible=False)
+        lm_batch_chunk_size = gr.Number(value=8, visible=False)
+        score_scale = gr.Slider(minimum=0.01, maximum=1.0, value=0.5, visible=False)
+        autogen_checkbox = gr.Checkbox(value=False, visible=False)
+        # Transcribe button (hidden)
+        transcribe_btn = gr.Button(value="Transcribe", visible=False)
+        text2music_audio_codes_group = gr.Group(visible=False)
+        # Note: format_btn is now visible in custom_mode_content
+        # Load file button (hidden for now)
+        load_file = gr.UploadButton(
+            label="Load",
+            file_types=[".json"],
+            file_count="single",
+            visible=False,
+        )
+        # Caption/Lyrics accordions (not used in new UI but needed for compatibility)
+        caption_accordion = gr.Accordion("Caption", visible=False)
+        lyrics_accordion = gr.Accordion("Lyrics", visible=False)
+        # Note: optional_params_accordion is defined above as a hidden, empty placeholder (visible=False)
+    return {
+        "service_config_accordion": service_config_accordion,
+        "language_dropdown": language_dropdown,
+        "checkpoint_dropdown": checkpoint_dropdown,
+        "refresh_btn": refresh_btn,
+        "config_path": config_path,
+        "device": device,
+        "init_btn": init_btn,
+        "init_status": init_status,
+        "lm_model_path": lm_model_path,
+        "init_llm_checkbox": init_llm_checkbox,
+        "backend_dropdown": backend_dropdown,
+        "use_flash_attention_checkbox": use_flash_attention_checkbox,
+        "offload_to_cpu_checkbox": offload_to_cpu_checkbox,
+        "offload_dit_to_cpu_checkbox": offload_dit_to_cpu_checkbox,
+        # LoRA components
+        "lora_path": lora_path,
+        "load_lora_btn": load_lora_btn,
+        "unload_lora_btn": unload_lora_btn,
+        "use_lora_checkbox": use_lora_checkbox,
+        "lora_status": lora_status,
+        # DiT model selector
+        "dit_model_selector": dit_model_selector,
+        "task_type": task_type,
+        "instruction_display_gen": instruction_display_gen,
+        "track_name": track_name,
+        "complete_track_classes": complete_track_classes,
+        "audio_uploads_accordion": audio_uploads_accordion,
+        "reference_audio": reference_audio,
+        "src_audio": src_audio,
+        "convert_src_to_codes_btn": convert_src_to_codes_btn,
+        "text2music_audio_code_string": text2music_audio_code_string,
+        "transcribe_btn": transcribe_btn,
+        "text2music_audio_codes_group": text2music_audio_codes_group,
+        "lm_temperature": lm_temperature,
+        "lm_cfg_scale": lm_cfg_scale,
+        "lm_top_k": lm_top_k,
+        "lm_top_p": lm_top_p,
+        "lm_negative_prompt": lm_negative_prompt,
+        "use_cot_metas": use_cot_metas,
+        "use_cot_caption": use_cot_caption,
+        "use_cot_language": use_cot_language,
+        "repainting_group": repainting_group,
+        "repainting_start": repainting_start,
+        "repainting_end": repainting_end,
+        "audio_cover_strength": audio_cover_strength,
+        # Generation mode components
+        "generation_mode": generation_mode,
+        "simple_mode_group": simple_mode_group,
+        "simple_query_input": simple_query_input,
+        "random_desc_btn": random_desc_btn,
+        "simple_instrumental_checkbox": simple_instrumental_checkbox,
+        "simple_vocal_language": simple_vocal_language,
+        "create_sample_btn": create_sample_btn,
+        "simple_sample_created": simple_sample_created,
+        "caption_accordion": caption_accordion,
+        "lyrics_accordion": lyrics_accordion,
+        "optional_params_accordion": optional_params_accordion,
+        # Custom mode components
+        "custom_mode_content": custom_mode_content,
+        "cover_mode_group": cover_mode_group,
+        # Source audio group for Cover/Repaint
+        "src_audio_group": src_audio_group,
+        "process_src_btn": process_src_btn,
+        "advanced_options_accordion": advanced_options_accordion,
+        # Existing components
+        "captions": captions,
+        "sample_btn": sample_btn,
+        "load_file": load_file,
+        "lyrics": lyrics,
+        "vocal_language": vocal_language,
+        "bpm": bpm,
+        "key_scale": key_scale,
+        "time_signature": time_signature,
+        "audio_duration": audio_duration,
+        "batch_size_input": batch_size_input,
+        "inference_steps": inference_steps,
+        "guidance_scale": guidance_scale,
+        "seed": seed,
+        "random_seed_checkbox": random_seed_checkbox,
+        "use_adg": use_adg,
+        "cfg_interval_start": cfg_interval_start,
+        "cfg_interval_end": cfg_interval_end,
+        "shift": shift,
+        "infer_method": infer_method,
+        "custom_timesteps": custom_timesteps,
+        "audio_format": audio_format,
+        "think_checkbox": think_checkbox,
+        "autogen_checkbox": autogen_checkbox,
+        "generate_btn": generate_btn,
+        "instrumental_checkbox": instrumental_checkbox,
+        "format_btn": format_btn,
+        "constrained_decoding_debug": constrained_decoding_debug,
+        "score_scale": score_scale,
+        "allow_lm_batch": allow_lm_batch,
+        "auto_score": auto_score,
+        "auto_lrc": auto_lrc,
+        "lm_batch_chunk_size": lm_batch_chunk_size,
+    }

acestep/gradio_ui/interfaces/result.py ADDED Viewed

	@@ -0,0 +1,598 @@

+"""
+Gradio UI Results Section Module
+Contains results display section component definitions
+"""
+import gradio as gr
+from acestep.gradio_ui.i18n import t
+def create_results_section(dit_handler) -> dict:
+    """Create results display section"""
+    with gr.Accordion(t("results.title"), open=True):
+        # Hidden state to store LM-generated metadata
+        lm_metadata_state = gr.State(value=None)
+        # Hidden state to track if caption/metadata is from formatted source (LM/transcription)
+        is_format_caption_state = gr.State(value=False)
+        # Batch management states
+        current_batch_index = gr.State(value=0)  # Currently displayed batch index
+        total_batches = gr.State(value=1)  # Total number of batches generated
+        batch_queue = gr.State(value={})  # Dictionary storing all batch data
+        generation_params_state = gr.State(value={})  # Store generation parameters for next batches
+        is_generating_background = gr.State(value=False)  # Background generation flag
+        # All audio components in one row with dynamic visibility
+        with gr.Row():
+            with gr.Column(visible=True) as audio_col_1:
+                generated_audio_1 = gr.Audio(
+                    label=t("results.generated_music", n=1),
+                    type="filepath",
+                    interactive=False,
+                    buttons=[]
+                )
+                with gr.Row(equal_height=True):
+                    send_to_cover_btn_1 = gr.Button(
+                        t("results.send_to_cover_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1
+                    )
+                    send_to_repaint_btn_1 = gr.Button(
+                        t("results.send_to_repaint_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1
+                    )
+                    save_btn_1 = gr.Button(
+                        t("results.save_btn"),
+                        variant="primary",
+                        size="sm",
+                        scale=1
+                    )
+                    score_btn_1 = gr.Button(
+                        t("results.score_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1,
+                        visible=False
+                    )
+                    lrc_btn_1 = gr.Button(
+                        t("results.lrc_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1,
+                        visible=False
+                    )
+                with gr.Accordion(t("results.details_accordion"), open=False, visible=True) as details_accordion_1:
+                    score_display_1 = gr.Textbox(
+                        label=t("results.quality_score_label", n=1),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=6,
+                        max_lines=6,
+                        visible=True
+                    )
+                    lrc_display_1 = gr.Textbox(
+                        label=t("results.lrc_label", n=1),
+                        interactive=True,
+                        buttons=["copy"],
+                        lines=8,
+                        max_lines=8,
+                        visible=True
+                    )
+                    codes_display_1 = gr.Textbox(
+                        label=t("results.codes_label", n=1),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=4,
+                        max_lines=4,
+                        visible=True
+                    )
+            with gr.Column(visible=True) as audio_col_2:
+                generated_audio_2 = gr.Audio(
+                    label=t("results.generated_music", n=2),
+                    type="filepath",
+                    interactive=False,
+                    buttons=[]
+                )
+                with gr.Row(equal_height=True):
+                    send_to_cover_btn_2 = gr.Button(
+                        t("results.send_to_cover_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1
+                    )
+                    send_to_repaint_btn_2 = gr.Button(
+                        t("results.send_to_repaint_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1
+                    )
+                    save_btn_2 = gr.Button(
+                        t("results.save_btn"),
+                        variant="primary",
+                        size="sm",
+                        scale=1
+                    )
+                    score_btn_2 = gr.Button(
+                        t("results.score_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1,
+                        visible=False
+                    )
+                    lrc_btn_2 = gr.Button(
+                        t("results.lrc_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1,
+                        visible=False
+                    )
+                with gr.Accordion(t("results.details_accordion"), open=False, visible=True) as details_accordion_2:
+                    score_display_2 = gr.Textbox(
+                        label=t("results.quality_score_label", n=2),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=6,
+                        max_lines=6,
+                        visible=True
+                    )
+                    lrc_display_2 = gr.Textbox(
+                        label=t("results.lrc_label", n=2),
+                        interactive=True,
+                        buttons=["copy"],
+                        lines=8,
+                        max_lines=8,
+                        visible=True
+                    )
+                    codes_display_2 = gr.Textbox(
+                        label=t("results.codes_label", n=2),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=4,
+                        max_lines=4,
+                        visible=True
+                    )
+            with gr.Column(visible=False) as audio_col_3:
+                generated_audio_3 = gr.Audio(
+                    label=t("results.generated_music", n=3),
+                    type="filepath",
+                    interactive=False,
+                    buttons=[]
+                )
+                with gr.Row(equal_height=True):
+                    send_to_cover_btn_3 = gr.Button(
+                        t("results.send_to_cover_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1
+                    )
+                    send_to_repaint_btn_3 = gr.Button(
+                        t("results.send_to_repaint_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1
+                    )
+                    save_btn_3 = gr.Button(
+                        t("results.save_btn"),
+                        variant="primary",
+                        size="sm",
+                        scale=1
+                    )
+                    score_btn_3 = gr.Button(
+                        t("results.score_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1,
+                        visible=False
+                    )
+                    lrc_btn_3 = gr.Button(
+                        t("results.lrc_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1,
+                        visible=False
+                    )
+                with gr.Accordion(t("results.details_accordion"), open=False, visible=True) as details_accordion_3:
+                    score_display_3 = gr.Textbox(
+                        label=t("results.quality_score_label", n=3),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=6,
+                        max_lines=6,
+                        visible=True
+                    )
+                    lrc_display_3 = gr.Textbox(
+                        label=t("results.lrc_label", n=3),
+                        interactive=True,
+                        buttons=["copy"],
+                        lines=8,
+                        max_lines=8,
+                        visible=True
+                    )
+                    codes_display_3 = gr.Textbox(
+                        label=t("results.codes_label", n=3),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=4,
+                        max_lines=4,
+                        visible=True
+                    )
+            with gr.Column(visible=False) as audio_col_4:
+                generated_audio_4 = gr.Audio(
+                    label=t("results.generated_music", n=4),
+                    type="filepath",
+                    interactive=False,
+                    buttons=[]
+                )
+                with gr.Row(equal_height=True):
+                    send_to_cover_btn_4 = gr.Button(
+                        t("results.send_to_cover_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1
+                    )
+                    send_to_repaint_btn_4 = gr.Button(
+                        t("results.send_to_repaint_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1
+                    )
+                    save_btn_4 = gr.Button(
+                        t("results.save_btn"),
+                        variant="primary",
+                        size="sm",
+                        scale=1
+                    )
+                    score_btn_4 = gr.Button(
+                        t("results.score_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1,
+                        visible=False
+                    )
+                    lrc_btn_4 = gr.Button(
+                        t("results.lrc_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1,
+                        visible=False
+                    )
+                with gr.Accordion(t("results.details_accordion"), open=False, visible=True) as details_accordion_4:
+                    score_display_4 = gr.Textbox(
+                        label=t("results.quality_score_label", n=4),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=6,
+                        max_lines=6,
+                        visible=True
+                    )
+                    lrc_display_4 = gr.Textbox(
+                        label=t("results.lrc_label", n=4),
+                        interactive=True,
+                        buttons=["copy"],
+                        lines=8,
+                        max_lines=8,
+                        visible=True
+                    )
+                    codes_display_4 = gr.Textbox(
+                        label=t("results.codes_label", n=4),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=4,
+                        max_lines=4,
+                        visible=True
+                    )
+        # Second row for batch size 5-8 (initially hidden)
+        with gr.Row(visible=False) as audio_row_5_8:
+            with gr.Column() as audio_col_5:
+                generated_audio_5 = gr.Audio(
+                    label=t("results.generated_music", n=5),
+                    type="filepath",
+                    interactive=False,
+                    buttons=[]
+                )
+                with gr.Row(equal_height=True):
+                    send_to_cover_btn_5 = gr.Button(t("results.send_to_cover_btn"), variant="secondary", size="sm", scale=1)
+                    send_to_repaint_btn_5 = gr.Button(t("results.send_to_repaint_btn"), variant="secondary", size="sm", scale=1)
+                    save_btn_5 = gr.Button(t("results.save_btn"), variant="primary", size="sm", scale=1)
+                    score_btn_5 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1, visible=False)
+                    lrc_btn_5 = gr.Button(t("results.lrc_btn"), variant="secondary", size="sm", scale=1, visible=False)
+                with gr.Accordion(t("results.details_accordion"), open=False, visible=True) as details_accordion_5:
+                    score_display_5 = gr.Textbox(
+                        label=t("results.quality_score_label", n=5),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=6,
+                        max_lines=6,
+                        visible=True
+                    )
+                    lrc_display_5 = gr.Textbox(
+                        label=t("results.lrc_label", n=5),
+                        interactive=True,
+                        buttons=["copy"],
+                        lines=8,
+                        max_lines=8,
+                        visible=True
+                    )
+                    codes_display_5 = gr.Textbox(
+                        label=t("results.codes_label", n=5),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=4,
+                        max_lines=4,
+                        visible=True
+                    )
+            with gr.Column() as audio_col_6:
+                generated_audio_6 = gr.Audio(
+                    label=t("results.generated_music", n=6),
+                    type="filepath",
+                    interactive=False,
+                    buttons=[]
+                )
+                with gr.Row(equal_height=True):
+                    send_to_cover_btn_6 = gr.Button(t("results.send_to_cover_btn"), variant="secondary", size="sm", scale=1)
+                    send_to_repaint_btn_6 = gr.Button(t("results.send_to_repaint_btn"), variant="secondary", size="sm", scale=1)
+                    save_btn_6 = gr.Button(t("results.save_btn"), variant="primary", size="sm", scale=1)
+                    score_btn_6 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1, visible=False)
+                    lrc_btn_6 = gr.Button(t("results.lrc_btn"), variant="secondary", size="sm", scale=1, visible=False)
+                with gr.Accordion(t("results.details_accordion"), open=False, visible=True) as details_accordion_6:
+                    score_display_6 = gr.Textbox(
+                        label=t("results.quality_score_label", n=6),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=6,
+                        max_lines=6,
+                        visible=True
+                    )
+                    lrc_display_6 = gr.Textbox(
+                        label=t("results.lrc_label", n=6),
+                        interactive=True,
+                        buttons=["copy"],
+                        lines=8,
+                        max_lines=8,
+                        visible=True
+                    )
+                    codes_display_6 = gr.Textbox(
+                        label=t("results.codes_label", n=6),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=4,
+                        max_lines=4,
+                        visible=True
+                    )
+            with gr.Column() as audio_col_7:
+                generated_audio_7 = gr.Audio(
+                    label=t("results.generated_music", n=7),
+                    type="filepath",
+                    interactive=False,
+                    buttons=[]
+                )
+                with gr.Row(equal_height=True):
+                    send_to_cover_btn_7 = gr.Button(t("results.send_to_cover_btn"), variant="secondary", size="sm", scale=1)
+                    send_to_repaint_btn_7 = gr.Button(t("results.send_to_repaint_btn"), variant="secondary", size="sm", scale=1)
+                    save_btn_7 = gr.Button(t("results.save_btn"), variant="primary", size="sm", scale=1)
+                    score_btn_7 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1, visible=False)
+                    lrc_btn_7 = gr.Button(t("results.lrc_btn"), variant="secondary", size="sm", scale=1, visible=False)
+                with gr.Accordion(t("results.details_accordion"), open=False, visible=True) as details_accordion_7:
+                    score_display_7 = gr.Textbox(
+                        label=t("results.quality_score_label", n=7),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=6,
+                        max_lines=6,
+                        visible=True
+                    )
+                    lrc_display_7 = gr.Textbox(
+                        label=t("results.lrc_label", n=7),
+                        interactive=True,
+                        buttons=["copy"],
+                        lines=8,
+                        max_lines=8,
+                        visible=True
+                    )
+                    codes_display_7 = gr.Textbox(
+                        label=t("results.codes_label", n=7),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=4,
+                        max_lines=4,
+                        visible=True
+                    )
+            with gr.Column() as audio_col_8:
+                generated_audio_8 = gr.Audio(
+                    label=t("results.generated_music", n=8),
+                    type="filepath",
+                    interactive=False,
+                    buttons=[]
+                )
+                with gr.Row(equal_height=True):
+                    send_to_cover_btn_8 = gr.Button(t("results.send_to_cover_btn"), variant="secondary", size="sm", scale=1)
+                    send_to_repaint_btn_8 = gr.Button(t("results.send_to_repaint_btn"), variant="secondary", size="sm", scale=1)
+                    save_btn_8 = gr.Button(t("results.save_btn"), variant="primary", size="sm", scale=1)
+                    score_btn_8 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1, visible=False)
+                    lrc_btn_8 = gr.Button(t("results.lrc_btn"), variant="secondary", size="sm", scale=1, visible=False)
+                with gr.Accordion(t("results.details_accordion"), open=False, visible=True) as details_accordion_8:
+                    score_display_8 = gr.Textbox(
+                        label=t("results.quality_score_label", n=8),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=6,
+                        max_lines=6,
+                        visible=True
+                    )
+                    lrc_display_8 = gr.Textbox(
+                        label=t("results.lrc_label", n=8),
+                        interactive=True,
+                        buttons=["copy"],
+                        lines=8,
+                        max_lines=8,
+                        visible=True
+                    )
+                    codes_display_8 = gr.Textbox(
+                        label=t("results.codes_label", n=8),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=4,
+                        max_lines=4,
+                        visible=True
+                    )
+        status_output = gr.Textbox(label=t("results.generation_status"), interactive=False)
+        # Batch navigation controls (hidden for simplified UI)
+        with gr.Row(equal_height=True, visible=False):
+            prev_batch_btn = gr.Button(
+                t("results.prev_btn"),
+                variant="secondary",
+                interactive=False,
+                scale=1,
+                size="sm"
+            )
+            batch_indicator = gr.Textbox(
+                label=t("results.current_batch"),
+                value=t("results.batch_indicator", current=1, total=1),
+                interactive=False,
+                scale=3
+            )
+            next_batch_status = gr.Textbox(
+                label=t("results.next_batch_status"),
+                value="",
+                interactive=False,
+                scale=3
+            )
+            next_batch_btn = gr.Button(
+                t("results.next_btn"),
+                variant="primary",
+                interactive=False,
+                scale=1,
+                size="sm"
+            )
+        # One-click restore parameters button (hidden for simplified UI)
+        restore_params_btn = gr.Button(
+            t("results.restore_params_btn"),
+            variant="secondary",
+            interactive=False,
+            size="sm",
+            visible=False
+        )
+        with gr.Accordion(t("results.batch_results_title"), open=True):
+            generated_audio_batch = gr.File(
+                label=t("results.all_files_label"),
+                file_count="multiple",
+                interactive=False,
+                visible=False
+            )
+            generation_info = gr.Markdown(label=t("results.generation_details"))
+    return {
+        "lm_metadata_state": lm_metadata_state,
+        "is_format_caption_state": is_format_caption_state,
+        "current_batch_index": current_batch_index,
+        "total_batches": total_batches,
+        "batch_queue": batch_queue,
+        "generation_params_state": generation_params_state,
+        "is_generating_background": is_generating_background,
+        "status_output": status_output,
+        "prev_batch_btn": prev_batch_btn,
+        "batch_indicator": batch_indicator,
+        "next_batch_btn": next_batch_btn,
+        "next_batch_status": next_batch_status,
+        "restore_params_btn": restore_params_btn,
+        "generated_audio_1": generated_audio_1,
+        "generated_audio_2": generated_audio_2,
+        "generated_audio_3": generated_audio_3,
+        "generated_audio_4": generated_audio_4,
+        "generated_audio_5": generated_audio_5,
+        "generated_audio_6": generated_audio_6,
+        "generated_audio_7": generated_audio_7,
+        "generated_audio_8": generated_audio_8,
+        "audio_row_5_8": audio_row_5_8,
+        "audio_col_1": audio_col_1,
+        "audio_col_2": audio_col_2,
+        "audio_col_3": audio_col_3,
+        "audio_col_4": audio_col_4,
+        "audio_col_5": audio_col_5,
+        "audio_col_6": audio_col_6,
+        "audio_col_7": audio_col_7,
+        "audio_col_8": audio_col_8,
+        "send_to_cover_btn_1": send_to_cover_btn_1,
+        "send_to_cover_btn_2": send_to_cover_btn_2,
+        "send_to_cover_btn_3": send_to_cover_btn_3,
+        "send_to_cover_btn_4": send_to_cover_btn_4,
+        "send_to_cover_btn_5": send_to_cover_btn_5,
+        "send_to_cover_btn_6": send_to_cover_btn_6,
+        "send_to_cover_btn_7": send_to_cover_btn_7,
+        "send_to_cover_btn_8": send_to_cover_btn_8,
+        "send_to_repaint_btn_1": send_to_repaint_btn_1,
+        "send_to_repaint_btn_2": send_to_repaint_btn_2,
+        "send_to_repaint_btn_3": send_to_repaint_btn_3,
+        "send_to_repaint_btn_4": send_to_repaint_btn_4,
+        "send_to_repaint_btn_5": send_to_repaint_btn_5,
+        "send_to_repaint_btn_6": send_to_repaint_btn_6,
+        "send_to_repaint_btn_7": send_to_repaint_btn_7,
+        "send_to_repaint_btn_8": send_to_repaint_btn_8,
+        "save_btn_1": save_btn_1,
+        "save_btn_2": save_btn_2,
+        "save_btn_3": save_btn_3,
+        "save_btn_4": save_btn_4,
+        "save_btn_5": save_btn_5,
+        "save_btn_6": save_btn_6,
+        "save_btn_7": save_btn_7,
+        "save_btn_8": save_btn_8,
+        "score_btn_1": score_btn_1,
+        "score_btn_2": score_btn_2,
+        "score_btn_3": score_btn_3,
+        "score_btn_4": score_btn_4,
+        "score_btn_5": score_btn_5,
+        "score_btn_6": score_btn_6,
+        "score_btn_7": score_btn_7,
+        "score_btn_8": score_btn_8,
+        "score_display_1": score_display_1,
+        "score_display_2": score_display_2,
+        "score_display_3": score_display_3,
+        "score_display_4": score_display_4,
+        "score_display_5": score_display_5,
+        "score_display_6": score_display_6,
+        "score_display_7": score_display_7,
+        "score_display_8": score_display_8,
+        "codes_display_1": codes_display_1,
+        "codes_display_2": codes_display_2,
+        "codes_display_3": codes_display_3,
+        "codes_display_4": codes_display_4,
+        "codes_display_5": codes_display_5,
+        "codes_display_6": codes_display_6,
+        "codes_display_7": codes_display_7,
+        "codes_display_8": codes_display_8,
+        "lrc_btn_1": lrc_btn_1,
+        "lrc_btn_2": lrc_btn_2,
+        "lrc_btn_3": lrc_btn_3,
+        "lrc_btn_4": lrc_btn_4,
+        "lrc_btn_5": lrc_btn_5,
+        "lrc_btn_6": lrc_btn_6,
+        "lrc_btn_7": lrc_btn_7,
+        "lrc_btn_8": lrc_btn_8,
+        "lrc_display_1": lrc_display_1,
+        "lrc_display_2": lrc_display_2,
+        "lrc_display_3": lrc_display_3,
+        "lrc_display_4": lrc_display_4,
+        "lrc_display_5": lrc_display_5,
+        "lrc_display_6": lrc_display_6,
+        "lrc_display_7": lrc_display_7,
+        "lrc_display_8": lrc_display_8,
+        "details_accordion_1": details_accordion_1,
+        "details_accordion_2": details_accordion_2,
+        "details_accordion_3": details_accordion_3,
+        "details_accordion_4": details_accordion_4,
+        "details_accordion_5": details_accordion_5,
+        "details_accordion_6": details_accordion_6,
+        "details_accordion_7": details_accordion_7,
+        "details_accordion_8": details_accordion_8,
+        "generated_audio_batch": generated_audio_batch,
+        "generation_info": generation_info,
+    }

acestep/gradio_ui/interfaces/training.py ADDED Viewed

	@@ -0,0 +1,562 @@

+"""
+Gradio UI Training Tab Module
+Contains the dataset builder and LoRA training interface components.
+"""
+import os
+import gradio as gr
+from acestep.gradio_ui.i18n import t
+def create_training_section(dit_handler, llm_handler, init_params=None) -> dict:
+    """Create the training tab section with dataset builder and training controls.
+    Args:
+        dit_handler: DiT handler instance
+        llm_handler: LLM handler instance
+        init_params: Dictionary containing initialization parameters and state.
+                    If None, service will not be pre-initialized. When it has a
+                    truthy 'service_mode' entry, the training tab is hidden.
+    Returns:
+        Dictionary of Gradio components for event handling
+    """
+    # Check if running in service mode (hide training tab)
+    service_mode = init_params is not None and init_params.get('service_mode', False)
+    with gr.Tab("🎓 LoRA Training", visible=not service_mode):
+        gr.HTML("""
+        <div style="text-align: center; padding: 10px; margin-bottom: 15px;">
+            <h2>🎵 LoRA Training for ACE-Step</h2>
+            <p>Build datasets from your audio files and train custom LoRA adapters</p>
+        </div>
+        """)
+        with gr.Tabs():
+            # ==================== Dataset Builder Tab ====================
+            with gr.Tab("📁 Dataset Builder"):
+                # ========== Load Existing OR Scan New ==========
+                gr.HTML("""
+                <div style="padding: 10px; margin-bottom: 10px; border: 1px solid #4a4a6a; border-radius: 8px; background: linear-gradient(135deg, #2a2a4a 0%, #1a1a3a 100%);">
+                    <h3 style="margin: 0 0 5px 0;">🚀 Quick Start</h3>
+                    <p style="margin: 0; color: #aaa;">Choose one: <b>Load existing dataset</b> OR <b>Scan new directory</b></p>
+                </div>
+                """)
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        gr.HTML("<h4>📂 Load Existing Dataset</h4>")
+                        with gr.Row():
+                            load_json_path = gr.Textbox(
+                                label="Dataset JSON Path",
+                                placeholder="./datasets/my_lora_dataset.json",
+                                info="Load a previously saved dataset",
+                                scale=3,
+                            )
+                            load_json_btn = gr.Button("📂 Load", variant="primary", scale=1)
+                        load_json_status = gr.Textbox(
+                            label="Load Status",
+                            interactive=False,
+                        )
+                    with gr.Column(scale=1):
+                        gr.HTML("<h4>🔍 Scan New Directory</h4>")
+                        with gr.Row():
+                            audio_directory = gr.Textbox(
+                                label="Audio Directory Path",
+                                placeholder="/path/to/your/audio/folder",
+                                info="Scan for audio files (wav, mp3, flac, ogg, opus)",
+                                scale=3,
+                            )
+                            scan_btn = gr.Button("🔍 Scan", variant="secondary", scale=1)
+                        scan_status = gr.Textbox(
+                            label="Scan Status",
+                            interactive=False,
+                        )
+                gr.HTML("<hr>")
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        # Audio files table
+                        audio_files_table = gr.Dataframe(
+                            headers=["#", "Filename", "Duration", "Labeled", "BPM", "Key", "Caption"],
+                            datatype=["number", "str", "str", "str", "str", "str", "str"],
+                            label="Found Audio Files",
+                            interactive=False,
+                            wrap=True,
+                        )
+                    with gr.Column(scale=1):
+                        gr.HTML("<h3>⚙️ Dataset Settings</h3>")
+                        dataset_name = gr.Textbox(
+                            label="Dataset Name",
+                            value="my_lora_dataset",
+                            placeholder="Enter dataset name",
+                        )
+                        all_instrumental = gr.Checkbox(
+                            label="All Instrumental",
+                            value=True,
+                            info="Check if all tracks are instrumental (no vocals)",
+                        )
+                        need_lyrics = gr.Checkbox(
+                            label="Transcribe Lyrics",
+                            value=False,
+                            info="Attempt to transcribe lyrics (slower)",
+                            interactive=False,  # Disabled for now
+                        )
+                        custom_tag = gr.Textbox(
+                            label="Custom Activation Tag",
+                            placeholder="e.g., 8bit_retro, my_style",
+                            info="Unique tag to activate this LoRA's style",
+                        )
+                        tag_position = gr.Radio(
+                            choices=[
+                                ("Prepend (tag, caption)", "prepend"),
+                                ("Append (caption, tag)", "append"),
+                                ("Replace caption", "replace"),
+                            ],
+                            value="replace",
+                            label="Tag Position",
+                            info="Where to place the custom tag in the caption",
+                        )
+                gr.HTML("<hr><h3>🤖 Step 2: Auto-Label with AI</h3>")
+                with gr.Row():
+                    with gr.Column(scale=3):
+                        gr.Markdown("""
+                        Click the button below to automatically generate metadata for all audio files using AI:
+                        - **Caption**: Music style, genre, mood description
+                        - **BPM**: Beats per minute
+                        - **Key**: Musical key (e.g., C Major, Am)
+                        - **Time Signature**: 4/4, 3/4, etc.
+                        """)
+                        skip_metas = gr.Checkbox(
+                            label="Skip Metas (No LLM)",
+                            value=False,
+                            info="Skip AI labeling. BPM/Key/Time Signature will be N/A, Language will be 'unknown' for instrumental",
+                        )
+                    with gr.Column(scale=1):
+                        auto_label_btn = gr.Button(
+                            "🏷️ Auto-Label All",
+                            variant="primary",
+                            size="lg",
+                        )
+                label_progress = gr.Textbox(
+                    label="Labeling Progress",
+                    interactive=False,
+                    lines=2,
+                )
+                gr.HTML("<hr><h3>👀 Step 3: Preview & Edit</h3>")
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        sample_selector = gr.Slider(
+                            minimum=0,
+                            maximum=0,
+                            step=1,
+                            value=0,
+                            label="Select Sample #",
+                            info="Choose a sample to preview and edit",
+                        )
+                        preview_audio = gr.Audio(
+                            label="Audio Preview",
+                            type="filepath",
+                            interactive=False,
+                        )
+                        preview_filename = gr.Textbox(
+                            label="Filename",
+                            interactive=False,
+                        )
+                    with gr.Column(scale=2):
+                        with gr.Row():
+                            edit_caption = gr.Textbox(
+                                label="Caption",
+                                lines=3,
+                                placeholder="Music description...",
+                            )
+                        with gr.Row():
+                            edit_lyrics = gr.Textbox(
+                                label="Lyrics",
+                                lines=4,
+                                placeholder="[Verse 1]\nLyrics here...\n\n[Chorus]\n...",
+                            )
+                        with gr.Row():
+                            edit_bpm = gr.Number(
+                                label="BPM",
+                                precision=0,
+                            )
+                            edit_keyscale = gr.Textbox(
+                                label="Key",
+                                placeholder="C Major",
+                            )
+                            edit_timesig = gr.Dropdown(
+                                choices=["", "2", "3", "4", "6"],
+                                label="Time Signature",
+                            )
+                            edit_duration = gr.Number(
+                                label="Duration (s)",
+                                precision=1,
+                                interactive=False,
+                            )
+                        with gr.Row():
+                            edit_language = gr.Dropdown(
+                                choices=["instrumental", "en", "zh", "ja", "ko", "es", "fr", "de", "pt", "ru", "unknown"],
+                                value="instrumental",
+                                label="Language",
+                            )
+                            edit_instrumental = gr.Checkbox(
+                                label="Instrumental",
+                                value=True,
+                            )
+                            save_edit_btn = gr.Button("💾 Save Changes", variant="secondary")
+                        edit_status = gr.Textbox(
+                            label="Edit Status",
+                            interactive=False,
+                        )
+                gr.HTML("<hr><h3>💾 Step 4: Save Dataset</h3>")
+                with gr.Row():
+                    with gr.Column(scale=3):
+                        save_path = gr.Textbox(
+                            label="Save Path",
+                            value="./datasets/my_lora_dataset.json",
+                            placeholder="./datasets/dataset_name.json",
+                            info="Path where the dataset JSON will be saved",
+                        )
+                    with gr.Column(scale=1):
+                        save_dataset_btn = gr.Button(
+                            "💾 Save Dataset",
+                            variant="primary",
+                            size="lg",
+                        )
+                save_status = gr.Textbox(
+                    label="Save Status",
+                    interactive=False,
+                    lines=2,
+                )
+                gr.HTML("<hr><h3>⚡ Step 5: Preprocess to Tensors</h3>")
+                gr.Markdown("""
+                **Preprocessing converts your dataset to pre-computed tensors for fast training.**
+                You can either:
+                - Use the dataset from Steps 1-4 above, **OR**
+                - Load an existing dataset JSON file (if you've already saved one)
+                """)
+                with gr.Row():
+                    with gr.Column(scale=3):
+                        load_existing_dataset_path = gr.Textbox(
+                            label="Load Existing Dataset (Optional)",
+                            placeholder="./datasets/my_lora_dataset.json",
+                            info="Path to a previously saved dataset JSON file",
+                        )
+                    with gr.Column(scale=1):
+                        load_existing_dataset_btn = gr.Button(
+                            "📂 Load Dataset",
+                            variant="secondary",
+                            size="lg",
+                        )
+                load_existing_status = gr.Textbox(
+                    label="Load Status",
+                    interactive=False,
+                )
+                gr.Markdown("""
+                This step:
+                - Encodes audio to VAE latents
+                - Encodes captions and lyrics to text embeddings
+                - Runs the condition encoder
+                - Saves all tensors to `.pt` files
+                ⚠️ **This requires the model to be loaded and may take a few minutes.**
+                """)
+                with gr.Row():
+                    with gr.Column(scale=3):
+                        preprocess_output_dir = gr.Textbox(
+                            label="Tensor Output Directory",
+                            value="./datasets/preprocessed_tensors",
+                            placeholder="./datasets/preprocessed_tensors",
+                            info="Directory to save preprocessed tensor files",
+                        )
+                    with gr.Column(scale=1):
+                        preprocess_btn = gr.Button(
+                            "⚡ Preprocess",
+                            variant="primary",
+                            size="lg",
+                        )
+                preprocess_progress = gr.Textbox(
+                    label="Preprocessing Progress",
+                    interactive=False,
+                    lines=3,
+                )
+            # ==================== Training Tab ====================
+            with gr.Tab("🚀 Train LoRA"):
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        gr.HTML("<h3>📊 Preprocessed Dataset Selection</h3>")
+                        gr.Markdown("""
+                        Select the directory containing preprocessed tensor files (`.pt` files).
+                        These are created in the "Dataset Builder" tab using the "Preprocess" button.
+                        """)
+                        training_tensor_dir = gr.Textbox(
+                            label="Preprocessed Tensors Directory",
+                            placeholder="./datasets/preprocessed_tensors",
+                            value="./datasets/preprocessed_tensors",
+                            info="Directory containing preprocessed .pt tensor files",
+                        )
+                        load_dataset_btn = gr.Button("📂 Load Dataset", variant="secondary")
+                        training_dataset_info = gr.Textbox(
+                            label="Dataset Info",
+                            interactive=False,
+                            lines=3,
+                        )
+                    with gr.Column(scale=1):
+                        gr.HTML("<h3>⚙️ LoRA Settings</h3>")
+                        lora_rank = gr.Slider(
+                            minimum=4,
+                            maximum=256,
+                            step=4,
+                            value=64,
+                            label="LoRA Rank (r)",
+                            info="Higher = more capacity, more memory",
+                        )
+                        lora_alpha = gr.Slider(
+                            minimum=4,
+                            maximum=512,
+                            step=4,
+                            value=128,
+                            label="LoRA Alpha",
+                            info="Scaling factor (typically 2x rank)",
+                        )
+                        lora_dropout = gr.Slider(
+                            minimum=0.0,
+                            maximum=0.5,
+                            step=0.05,
+                            value=0.1,
+                            label="LoRA Dropout",
+                        )
+                gr.HTML("<hr><h3>🎛️ Training Parameters</h3>")
+                with gr.Row():
+                    learning_rate = gr.Number(
+                        label="Learning Rate",
+                        value=1e-4,
+                        info="Start with 1e-4, adjust if needed",
+                    )
+                    train_epochs = gr.Slider(
+                        minimum=100,
+                        maximum=4000,
+                        step=100,
+                        value=500,
+                        label="Max Epochs",
+                    )
+                    train_batch_size = gr.Slider(
+                        minimum=1,
+                        maximum=8,
+                        step=1,
+                        value=1,
+                        label="Batch Size",
+                        info="Increase if you have enough VRAM",
+                    )
+                    gradient_accumulation = gr.Slider(
+                        minimum=1,
+                        maximum=16,
+                        step=1,
+                        value=1,
+                        label="Gradient Accumulation",
+                        info="Effective batch = batch_size × accumulation",
+                    )
+                with gr.Row():
+                    save_every_n_epochs = gr.Slider(
+                        minimum=50,
+                        maximum=1000,
+                        step=50,
+                        value=200,
+                        label="Save Every N Epochs",
+                    )
+                    training_shift = gr.Slider(
+                        minimum=1.0,
+                        maximum=5.0,
+                        step=0.5,
+                        value=3.0,
+                        label="Shift",
+                        info="Timestep shift for turbo model",
+                    )
+                    training_seed = gr.Number(
+                        label="Seed",
+                        value=42,
+                        precision=0,
+                    )
+                with gr.Row():
+                    lora_output_dir = gr.Textbox(
+                        label="Output Directory",
+                        value="./lora_output",
+                        placeholder="./lora_output",
+                        info="Directory to save trained LoRA weights",
+                    )
+                gr.HTML("<hr>")
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        start_training_btn = gr.Button(
+                            "🚀 Start Training",
+                            variant="primary",
+                            size="lg",
+                        )
+                    with gr.Column(scale=1):
+                        stop_training_btn = gr.Button(
+                            "⏹️ Stop Training",
+                            variant="stop",
+                            size="lg",
+                        )
+                training_progress = gr.Textbox(
+                    label="Training Progress",
+                    interactive=False,
+                    lines=2,
+                )
+                with gr.Row():
+                    training_log = gr.Textbox(
+                        label="Training Log",
+                        interactive=False,
+                        lines=10,
+                        max_lines=15,
+                        scale=1,
+                    )
+                    training_loss_plot = gr.LinePlot(
+                        x="step",
+                        y="loss",
+                        title="Training Loss",
+                        x_title="Step",
+                        y_title="Loss",
+                        scale=1,
+                    )
+                gr.HTML("<hr><h3>📦 Export LoRA</h3>")
+                with gr.Row():
+                    export_path = gr.Textbox(
+                        label="Export Path",
+                        value="./lora_output/final_lora",
+                        placeholder="./lora_output/my_lora",
+                    )
+                    export_lora_btn = gr.Button("📦 Export LoRA", variant="secondary")
+                export_status = gr.Textbox(
+                    label="Export Status",
+                    interactive=False,
+                )
+    # Store dataset builder state
+    dataset_builder_state = gr.State(None)
+    training_state = gr.State({"is_training": False, "should_stop": False})
+    return {
+        # Dataset Builder - Load or Scan
+        "load_json_path": load_json_path,
+        "load_json_btn": load_json_btn,
+        "load_json_status": load_json_status,
+        "audio_directory": audio_directory,
+        "scan_btn": scan_btn,
+        "scan_status": scan_status,
+        "audio_files_table": audio_files_table,
+        "dataset_name": dataset_name,
+        "all_instrumental": all_instrumental,
+        "need_lyrics": need_lyrics,
+        "custom_tag": custom_tag,
+        "tag_position": tag_position,
+        "skip_metas": skip_metas,
+        "auto_label_btn": auto_label_btn,
+        "label_progress": label_progress,
+        "sample_selector": sample_selector,
+        "preview_audio": preview_audio,
+        "preview_filename": preview_filename,
+        "edit_caption": edit_caption,
+        "edit_lyrics": edit_lyrics,
+        "edit_bpm": edit_bpm,
+        "edit_keyscale": edit_keyscale,
+        "edit_timesig": edit_timesig,
+        "edit_duration": edit_duration,
+        "edit_language": edit_language,
+        "edit_instrumental": edit_instrumental,
+        "save_edit_btn": save_edit_btn,
+        "edit_status": edit_status,
+        "save_path": save_path,
+        "save_dataset_btn": save_dataset_btn,
+        "save_status": save_status,
+        # Preprocessing
+        "load_existing_dataset_path": load_existing_dataset_path,
+        "load_existing_dataset_btn": load_existing_dataset_btn,
+        "load_existing_status": load_existing_status,
+        "preprocess_output_dir": preprocess_output_dir,
+        "preprocess_btn": preprocess_btn,
+        "preprocess_progress": preprocess_progress,
+        "dataset_builder_state": dataset_builder_state,
+        # Training
+        "training_tensor_dir": training_tensor_dir,
+        "load_dataset_btn": load_dataset_btn,
+        "training_dataset_info": training_dataset_info,
+        "lora_rank": lora_rank,
+        "lora_alpha": lora_alpha,
+        "lora_dropout": lora_dropout,
+        "learning_rate": learning_rate,
+        "train_epochs": train_epochs,
+        "train_batch_size": train_batch_size,
+        "gradient_accumulation": gradient_accumulation,
+        "save_every_n_epochs": save_every_n_epochs,
+        "training_shift": training_shift,
+        "training_seed": training_seed,
+        "lora_output_dir": lora_output_dir,
+        "start_training_btn": start_training_btn,
+        "stop_training_btn": stop_training_btn,
+        "training_progress": training_progress,
+        "training_log": training_log,
+        "training_loss_plot": training_loss_plot,
+        "export_path": export_path,
+        "export_lora_btn": export_lora_btn,
+        "export_status": export_status,
+        "training_state": training_state,
+    }

acestep/handler.py ADDED Viewed

The diff for this file is too large to render. See raw diff

acestep/inference.py ADDED Viewed

	@@ -0,0 +1,1181 @@

+"""
+ACE-Step Inference API Module
+This module provides a standardized inference interface for music generation,
+designed for third-party integration. It offers both a simplified API and
+backward-compatible Gradio UI support.
+"""
+import math
+import os
+import tempfile
+from typing import Optional, Union, List, Dict, Any, Tuple
+from dataclasses import dataclass, field, asdict
+from loguru import logger
+from acestep.audio_utils import AudioSaver, generate_uuid_from_params
+# HuggingFace Space environment detection
+IS_HUGGINGFACE_SPACE = os.environ.get("SPACE_ID") is not None
+@dataclass
+class GenerationParams:
+    """Per-request parameters for music generation.
+    Every field has a default, so callers only need to set the values they care about.
+    Attributes:
+        # Text Inputs
+        caption: A short text prompt describing the desired music (main prompt). < 512 characters
+        lyrics: Lyrics for the music. Use "[Instrumental]" for instrumental songs. < 4096 characters
+        instrumental: If True, generate instrumental music regardless of lyrics.
+        # Music Metadata
+        bpm: BPM (beats per minute), e.g., 120. Set to None for automatic estimation. 30 ~ 300
+        keyscale: Musical key (e.g., "C Major", "Am"). Leave empty for auto-detection. A-G, #/♭, major/minor
+        timesignature: Time signature (2 for '2/4', 3 for '3/4', 4 for '4/4', 6 for '6/8'), stored as a string.
+            Leave empty for auto-detection.
+        vocal_language: Language code for vocals, e.g., "en", "zh", "ja", or "unknown". See acestep/constants.py:VALID_LANGUAGES
+        duration: Target audio length in seconds. If <= 0 or None, the model chooses automatically. 10 ~ 600
+        # Generation Parameters
+        inference_steps: Number of diffusion steps (e.g., 8 for turbo, 32–100 for base model).
+        guidance_scale: CFG (classifier-free guidance) strength. Higher means following the prompt more strictly.
+            Only supported by non-turbo models.
+        seed: Integer seed for reproducibility. -1 means use a random seed each time.
+        # Advanced DiT Parameters
+        use_adg: Whether to use Adaptive Dual Guidance (only works for base model).
+        cfg_interval_start: Start ratio (0.0–1.0) to apply CFG.
+        cfg_interval_end: End ratio (0.0–1.0) to apply CFG.
+        shift: Timestep shift factor (default 1.0). When != 1.0, applies t = shift * t / (1 + (shift - 1) * t) to timesteps.
+        infer_method: Diffusion inference method, "ode" or "sde".
+        timesteps: Optional custom timestep list. If provided, overrides inference_steps and shift.
+        # Task-Specific Parameters
+        task_type: Type of generation task. One of: "text2music", "cover", "repaint", "lego", "extract", "complete".
+        reference_audio: Path to a reference audio file for style transfer or cover tasks.
+        src_audio: Path to a source audio file for audio-to-audio tasks.
+        audio_codes: Audio semantic codes as a string (advanced use, for code-control generation).
+        repainting_start: For repaint/lego tasks: start time in seconds for region to repaint.
+        repainting_end: For repaint/lego tasks: end time in seconds for region to repaint (-1 for until end).
+        audio_cover_strength: Strength of reference audio/codes influence (range 0.0–1.0). Use a smaller value (0.2) for style transfer tasks.
+        instruction: Optional task instruction prompt. If empty, auto-generated by system.
+        # 5Hz Language Model Parameters for CoT reasoning
+        thinking: If True, enable 5Hz Language Model "Chain-of-Thought" reasoning for semantic/music metadata and codes.
+        lm_temperature: Sampling temperature for the LLM (0.0–2.0). Higher = more creative/varied results.
+        lm_cfg_scale: Classifier-free guidance scale for the LLM.
+        lm_top_k: LLM top-k sampling (0 = disabled).
+        lm_top_p: LLM top-p nucleus sampling (1.0 = disabled).
+        lm_negative_prompt: Negative prompt to use for LLM (for control).
+        use_cot_metas: Whether to let LLM generate music metadata via CoT reasoning.
+        use_cot_caption: Whether to let LLM rewrite or format the input caption via CoT reasoning.
+        use_cot_lyrics: Reserved flag for CoT lyrics; not used yet.
+        use_cot_language: Whether to let LLM detect vocal language via CoT.
+        use_constrained_decoding: Forwarded to the LLM handler as its use_constrained_decoding option.
+        # LM-derived values (outputs)
+        cot_bpm, cot_keyscale, cot_timesignature, cot_duration, cot_vocal_language, cot_caption, cot_lyrics:
+            Filled in by generate_music() with the LM-resolved value when the matching user input above is empty.
+    """
+    # Task selection (both fields have defaults; the default is plain text-to-music)
+    task_type: str = "text2music"
+    instruction: str = "Fill the audio semantic mask based on the given conditions:"
+    # Audio Uploads
+    reference_audio: Optional[str] = None
+    src_audio: Optional[str] = None
+    # LM Codes Hints
+    audio_codes: str = ""
+    # Text Inputs
+    caption: str = ""
+    lyrics: str = ""
+    instrumental: bool = False
+    # Metadata
+    vocal_language: str = "unknown"
+    bpm: Optional[int] = None
+    keyscale: str = ""
+    timesignature: str = ""
+    duration: float = -1.0
+    # Advanced Settings
+    inference_steps: int = 8
+    seed: int = -1
+    guidance_scale: float = 7.0
+    use_adg: bool = False
+    cfg_interval_start: float = 0.0
+    cfg_interval_end: float = 1.0
+    shift: float = 1.0
+    infer_method: str = "ode"  # "ode" or "sde" - diffusion inference method
+    # Custom timesteps (parsed from string like "0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0")
+    # If provided, overrides inference_steps and shift
+    timesteps: Optional[List[float]] = None
+    repainting_start: float = 0.0
+    repainting_end: float = -1
+    audio_cover_strength: float = 1.0
+    # 5Hz Language Model Parameters
+    thinking: bool = True
+    lm_temperature: float = 0.85
+    lm_cfg_scale: float = 2.0
+    lm_top_k: int = 0
+    lm_top_p: float = 0.9
+    lm_negative_prompt: str = "NO USER INPUT"
+    use_cot_metas: bool = True
+    use_cot_caption: bool = True
+    use_cot_lyrics: bool = False  # TODO: not used yet
+    use_cot_language: bool = True
+    use_constrained_decoding: bool = True
+    # LM-resolved values written back by generate_music() (not user inputs)
+    cot_bpm: Optional[int] = None
+    cot_keyscale: str = ""
+    cot_timesignature: str = ""
+    cot_duration: Optional[float] = None
+    cot_vocal_language: str = "unknown"
+    cot_caption: str = ""
+    cot_lyrics: str = ""
+    def to_dict(self) -> Dict[str, Any]:
+        """Return all fields as a plain dict (via dataclasses.asdict) for JSON serialization."""
+        return asdict(self)
+@dataclass
+class GenerationConfig:
+    """Configuration for music generation.
+    Attributes:
+        batch_size: Number of audio samples to generate
+        allow_lm_batch: Whether to allow batch processing in LM
+        use_random_seed: Whether to use random seed
+        seeds: Seed(s) for batch generation. Can be:
+            - None: Use random seeds (when use_random_seed=True) or params.seed (when use_random_seed=False)
+            - List[int]: List of seeds, will be padded with random seeds if fewer than batch_size
+            - int: Single seed value (will be converted to list and padded)
+        lm_batch_chunk_size: Batch chunk size for LM processing
+        constrained_decoding_debug: Whether to enable constrained decoding debug
+        audio_format: Output audio format, one of "mp3", "wav", "flac". Default: "flac"
+    """
+    batch_size: int = 2
+    allow_lm_batch: bool = False
+    use_random_seed: bool = True
+    seeds: Optional[List[int]] = None
+    lm_batch_chunk_size: int = 8
+    constrained_decoding_debug: bool = False
+    audio_format: str = "flac"  # Default to FLAC for fast saving
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert config to dictionary for JSON serialization."""
+        return asdict(self)
+@dataclass
+class GenerationResult:
+    """Outcome of a generate_music() call.
+    Attributes:
+        # Audio Outputs
+        audios: List of audio dictionaries with paths, keys, params (empty on failure)
+        status_message: Human-readable status message from generation
+        extra_outputs: Extra outputs from generation
+        success: Whether generation completed successfully
+        error: Error message if generation failed, otherwise None
+    """
+    # Audio Outputs
+    audios: List[Dict[str, Any]] = field(default_factory=list)
+    # Generation Information
+    status_message: str = ""
+    extra_outputs: Dict[str, Any] = field(default_factory=dict)
+    # Success Status
+    success: bool = True
+    error: Optional[str] = None
+    def to_dict(self) -> Dict[str, Any]:
+        """Return all fields as a plain dict (via dataclasses.asdict) for JSON serialization."""
+        return asdict(self)
+@dataclass
+class UnderstandResult:
+    """Result of music understanding from audio codes.
+    Attributes:
+        # Metadata Fields
+        caption: Generated caption describing the music
+        lyrics: Generated or extracted lyrics
+        bpm: Beats per minute (None if not detected)
+        duration: Duration in seconds (None if not detected)
+        keyscale: Musical key (e.g., "C Major")
+        language: Vocal language code (e.g., "en", "zh")
+        timesignature: Time signature (e.g., "4/4")
+        # Status
+        status_message: Human-readable status message from understanding
+        success: Whether understanding completed successfully
+        error: Error message if understanding failed, otherwise None
+    """
+    # Metadata Fields (empty string / None means "not available")
+    caption: str = ""
+    lyrics: str = ""
+    bpm: Optional[int] = None
+    duration: Optional[float] = None
+    keyscale: str = ""
+    language: str = ""
+    timesignature: str = ""
+    # Status
+    status_message: str = ""
+    success: bool = True
+    error: Optional[str] = None
+    def to_dict(self) -> Dict[str, Any]:
+        """Return all fields as a plain dict (via dataclasses.asdict) for JSON serialization."""
+        return asdict(self)
+def _update_metadata_from_lm(
+    metadata: Dict[str, Any],
+    bpm: Optional[int],
+    key_scale: str,
+    time_signature: str,
+    audio_duration: Optional[float],
+    vocal_language: str,
+    caption: str,
+    lyrics: str,
+) -> Tuple[Optional[int], str, str, Optional[float]]:
+    """Update metadata fields from LM output if not provided by user."""
+    if bpm is None and metadata.get('bpm'):
+        bpm_value = metadata.get('bpm')
+        if bpm_value not in ["N/A", ""]:
+            try:
+                bpm = int(bpm_value)
+            except (ValueError, TypeError):
+                pass
+    if not key_scale and metadata.get('keyscale'):
+        key_scale_value = metadata.get('keyscale', metadata.get('key_scale', ""))
+        if key_scale_value != "N/A":
+            key_scale = key_scale_value
+    if not time_signature and metadata.get('timesignature'):
+        time_signature_value = metadata.get('timesignature', metadata.get('time_signature', ""))
+        if time_signature_value != "N/A":
+            time_signature = time_signature_value
+    if audio_duration is None or audio_duration <= 0:
+        audio_duration_value = metadata.get('duration', -1)
+        if audio_duration_value not in ["N/A", ""]:
+            try:
+                audio_duration = float(audio_duration_value)
+            except (ValueError, TypeError):
+                pass
+    if not vocal_language and metadata.get('vocal_language'):
+        vocal_language = metadata.get('vocal_language')
+    if not caption and metadata.get('caption'):
+        caption = metadata.get('caption')
+    if not lyrics and metadata.get('lyrics'):
+        lyrics = metadata.get('lyrics')
+    return bpm, key_scale, time_signature, audio_duration, vocal_language, caption, lyrics
+def generate_music(
+    dit_handler,
+    llm_handler,
+    params: GenerationParams,
+    config: GenerationConfig,
+    save_dir: Optional[str] = None,
+    progress=None,
+) -> GenerationResult:
+    """Generate music using ACE-Step model with optional LM reasoning.
+    Args:
+        dit_handler: Initialized DiT model handler (AceStepHandler instance)
+        llm_handler: Initialized LLM handler (LLMHandler instance)
+        params: Generation parameters (GenerationParams instance)
+        config: Generation configuration (GenerationConfig instance)
+    Returns:
+        GenerationResult with generated audio files and metadata
+    """
+    try:
+        # Phase 1: LM-based metadata and code generation (if enabled)
+        audio_code_string_to_use = params.audio_codes
+        lm_generated_metadata = None
+        lm_generated_audio_codes_list = []
+        lm_total_time_costs = {
+            "phase1_time": 0.0,
+            "phase2_time": 0.0,
+            "total_time": 0.0,
+        }
+        # Extract mutable copies of metadata (will be updated by LM if needed)
+        bpm = params.bpm
+        key_scale = params.keyscale
+        time_signature = params.timesignature
+        audio_duration = params.duration
+        dit_input_caption = params.caption
+        dit_input_vocal_language = params.vocal_language
+        dit_input_lyrics = params.lyrics
+        # Determine if we need to generate audio codes
+        # If user has provided audio_codes, we don't need to generate them
+        # Otherwise, check if we need audio codes (lm_dit mode) or just metas (dit mode)
+        user_provided_audio_codes = bool(params.audio_codes and str(params.audio_codes).strip())
+        # Determine infer_type: use "llm_dit" if we need audio codes, "dit" if only metas needed
+        # For now, we use "llm_dit" if batch mode or if user hasn't provided codes
+        # Use "dit" if user has provided codes (only need metas) or if explicitly only need metas
+        # Note: This logic can be refined based on specific requirements
+        need_audio_codes = not user_provided_audio_codes
+        # Determine if we should use chunk-based LM generation (always use chunks for consistency)
+        # Determine actual batch size for chunk processing
+        actual_batch_size = config.batch_size if config.batch_size is not None else 1
+        # Prepare seeds for batch generation
+        # Use config.seed if provided, otherwise fallback to params.seed
+        # Convert config.seed (None, int, or List[int]) to format that prepare_seeds accepts
+        seed_for_generation = ""
+        if config.seeds is not None and len(config.seeds) > 0:
+            if isinstance(config.seeds, list):
+                # Convert List[int] to comma-separated string
+                seed_for_generation = ",".join(str(s) for s in config.seeds)
+        # Use dit_handler.prepare_seeds to handle seed list generation and padding
+        # This will handle all the logic: padding with random seeds if needed, etc.
+        actual_seed_list, _ = dit_handler.prepare_seeds(actual_batch_size, seed_for_generation, config.use_random_seed)
+        # LM-based Chain-of-Thought reasoning
+        # Skip LM for cover/repaint tasks - these tasks use reference/src audio directly
+        # and don't need LM to generate audio codes
+        skip_lm_tasks = {"cover", "repaint"}
+        # Determine if we should use LLM
+        # LLM is needed for:
+        # 1. thinking=True: generate audio codes via LM
+        # 2. use_cot_caption=True: enhance/generate caption via CoT
+        # 3. use_cot_language=True: detect vocal language via CoT
+        # 4. use_cot_metas=True: fill missing metadata via CoT
+        need_lm_for_cot = params.use_cot_caption or params.use_cot_language or params.use_cot_metas
+        use_lm = (params.thinking or need_lm_for_cot) and llm_handler.llm_initialized and params.task_type not in skip_lm_tasks
+        lm_status = []
+        if params.task_type in skip_lm_tasks:
+            logger.info(f"Skipping LM for task_type='{params.task_type}' - using DiT directly")
+        logger.info(f"[generate_music] LLM usage decision: thinking={params.thinking}, "
+                   f"use_cot_caption={params.use_cot_caption}, use_cot_language={params.use_cot_language}, "
+                   f"use_cot_metas={params.use_cot_metas}, need_lm_for_cot={need_lm_for_cot}, "
+                   f"llm_initialized={llm_handler.llm_initialized if llm_handler else False}, use_lm={use_lm}")
+        if use_lm:
+            # Convert sampling parameters - handle None values safely
+            top_k_value = None if not params.lm_top_k or params.lm_top_k == 0 else int(params.lm_top_k)
+            top_p_value = None if not params.lm_top_p or params.lm_top_p >= 1.0 else params.lm_top_p
+            # Build user_metadata from user-provided values
+            user_metadata = {}
+            if bpm is not None:
+                try:
+                    bpm_value = float(bpm)
+                    if bpm_value > 0:
+                        user_metadata['bpm'] = int(bpm_value)
+                except (ValueError, TypeError):
+                    pass
+            if key_scale and key_scale.strip():
+                key_scale_clean = key_scale.strip()
+                if key_scale_clean.lower() not in ["n/a", ""]:
+                    user_metadata['keyscale'] = key_scale_clean
+            if time_signature and time_signature.strip():
+                time_sig_clean = time_signature.strip()
+                if time_sig_clean.lower() not in ["n/a", ""]:
+                    user_metadata['timesignature'] = time_sig_clean
+            if audio_duration is not None:
+                try:
+                    duration_value = float(audio_duration)
+                    if duration_value > 0:
+                        user_metadata['duration'] = int(duration_value)
+                except (ValueError, TypeError):
+                    pass
+            user_metadata_to_pass = user_metadata if user_metadata else None
+            # Determine infer_type based on whether we need audio codes
+            # - "llm_dit": generates both metas and audio codes (two-phase internally)
+            # - "dit": generates only metas (single phase)
+            infer_type = "llm_dit" if need_audio_codes and params.thinking else "dit"
+            # Use chunk size from config, or default to batch_size if not set
+            max_inference_batch_size = int(config.lm_batch_chunk_size) if config.lm_batch_chunk_size > 0 else actual_batch_size
+            num_chunks = math.ceil(actual_batch_size / max_inference_batch_size)
+            all_metadata_list = []
+            all_audio_codes_list = []
+            for chunk_idx in range(num_chunks):
+                chunk_start = chunk_idx * max_inference_batch_size
+                chunk_end = min(chunk_start + max_inference_batch_size, actual_batch_size)
+                chunk_size = chunk_end - chunk_start
+                chunk_seeds = actual_seed_list[chunk_start:chunk_end] if chunk_start < len(actual_seed_list) else None
+                logger.info(f"LM chunk {chunk_idx+1}/{num_chunks} (infer_type={infer_type}) "
+                            f"(size: {chunk_size}, seeds: {chunk_seeds})")
+                # Use the determined infer_type
+                # - "llm_dit" will internally run two phases (metas + codes)
+                # - "dit" will only run phase 1 (metas only)
+                result = llm_handler.generate_with_stop_condition(
+                    caption=params.caption or "",
+                    lyrics=params.lyrics or "",
+                    infer_type=infer_type,
+                    temperature=params.lm_temperature,
+                    cfg_scale=params.lm_cfg_scale,
+                    negative_prompt=params.lm_negative_prompt,
+                    top_k=top_k_value,
+                    top_p=top_p_value,
+                    target_duration=audio_duration,  # Pass duration to limit audio codes generation
+                    user_metadata=user_metadata_to_pass,
+                    use_cot_caption=params.use_cot_caption,
+                    use_cot_language=params.use_cot_language,
+                    use_cot_metas=params.use_cot_metas,
+                    use_constrained_decoding=params.use_constrained_decoding,
+                    constrained_decoding_debug=config.constrained_decoding_debug,
+                    batch_size=chunk_size,
+                    seeds=chunk_seeds,
+                    progress=progress,
+                )
+                # Check if LM generation failed
+                if not result.get("success", False):
+                    error_msg = result.get("error", "Unknown LM error")
+                    lm_status.append(f"❌ LM Error: {error_msg}")
+                    # Return early with error
+                    return GenerationResult(
+                        audios=[],
+                        status_message=f"❌ LM generation failed: {error_msg}",
+                        extra_outputs={},
+                        success=False,
+                        error=error_msg,
+                    )
+                # Extract metadata and audio_codes from result dict
+                if chunk_size > 1:
+                    metadata_list = result.get("metadata", [])
+                    audio_codes_list = result.get("audio_codes", [])
+                    all_metadata_list.extend(metadata_list)
+                    all_audio_codes_list.extend(audio_codes_list)
+                else:
+                    metadata = result.get("metadata", {})
+                    audio_codes = result.get("audio_codes", "")
+                    all_metadata_list.append(metadata)
+                    all_audio_codes_list.append(audio_codes)
+                # Collect time costs from LM extra_outputs
+                lm_extra = result.get("extra_outputs", {})
+                lm_chunk_time_costs = lm_extra.get("time_costs", {})
+                if lm_chunk_time_costs:
+                    # Accumulate time costs from all chunks
+                    for key in ["phase1_time", "phase2_time", "total_time"]:
+                        if key in lm_chunk_time_costs:
+                            lm_total_time_costs[key] += lm_chunk_time_costs[key]
+                    time_str = ", ".join([f"{k}: {v:.2f}s" for k, v in lm_chunk_time_costs.items()])
+                    lm_status.append(f"✅ LM chunk {chunk_idx+1}: {time_str}")
+            lm_generated_metadata = all_metadata_list[0] if all_metadata_list else None
+            lm_generated_audio_codes_list = all_audio_codes_list
+            # Set audio_code_string_to_use based on infer_type
+            if infer_type == "llm_dit":
+                # If batch mode, use list; otherwise use single string
+                if actual_batch_size > 1:
+                    audio_code_string_to_use = all_audio_codes_list
+                else:
+                    audio_code_string_to_use = all_audio_codes_list[0] if all_audio_codes_list else ""
+            else:
+                # For "dit" mode, keep user-provided codes or empty
+                audio_code_string_to_use = params.audio_codes
+            # Update metadata from LM if not provided by user
+            if lm_generated_metadata:
+                bpm, key_scale, time_signature, audio_duration, vocal_language, caption, lyrics = _update_metadata_from_lm(
+                    metadata=lm_generated_metadata,
+                    bpm=bpm,
+                    key_scale=key_scale,
+                    time_signature=time_signature,
+                    audio_duration=audio_duration,
+                    vocal_language=dit_input_vocal_language,
+                    caption=dit_input_caption,
+                    lyrics=dit_input_lyrics)
+                if not params.bpm:
+                    params.cot_bpm = bpm
+                if not params.keyscale:
+                    params.cot_keyscale = key_scale
+                if not params.timesignature:
+                    params.cot_timesignature = time_signature
+                if not params.duration:
+                    params.cot_duration = audio_duration
+                if not params.vocal_language:
+                    params.cot_vocal_language = vocal_language
+                if not params.caption:
+                    params.cot_caption = caption
+                if not params.lyrics:
+                    params.cot_lyrics = lyrics
+            # set cot caption and language if needed
+            if params.use_cot_caption:
+                dit_input_caption = lm_generated_metadata.get("caption", dit_input_caption)
+            if params.use_cot_language:
+                dit_input_vocal_language = lm_generated_metadata.get("vocal_language", dit_input_vocal_language)
+        # Phase 2: DiT music generation
+        # Use seed_for_generation (from config.seed or params.seed) instead of params.seed for actual generation
+        result = dit_handler.generate_music(
+            captions=dit_input_caption,
+            lyrics=dit_input_lyrics,
+            bpm=bpm,
+            key_scale=key_scale,
+            time_signature=time_signature,
+            vocal_language=dit_input_vocal_language,
+            inference_steps=params.inference_steps,
+            guidance_scale=params.guidance_scale,
+            use_random_seed=config.use_random_seed,
+            seed=seed_for_generation,  # Use config.seed (or params.seed fallback) instead of params.seed directly
+            reference_audio=params.reference_audio,
+            audio_duration=audio_duration,
+            batch_size=config.batch_size if config.batch_size is not None else 1,
+            src_audio=params.src_audio,
+            audio_code_string=audio_code_string_to_use,
+            repainting_start=params.repainting_start,
+            repainting_end=params.repainting_end,
+            instruction=params.instruction,
+            audio_cover_strength=params.audio_cover_strength,
+            task_type=params.task_type,
+            use_adg=params.use_adg,
+            cfg_interval_start=params.cfg_interval_start,
+            cfg_interval_end=params.cfg_interval_end,
+            shift=params.shift,
+            infer_method=params.infer_method,
+            timesteps=params.timesteps,
+            progress=progress,
+        )
+        # Check if generation failed
+        if not result.get("success", False):
+            return GenerationResult(
+                audios=[],
+                status_message=result.get("status_message", ""),
+                extra_outputs={},
+                success=False,
+                error=result.get("error"),
+            )
+        # Extract results from dit_handler.generate_music dict
+        dit_audios = result.get("audios", [])
+        status_message = result.get("status_message", "")
+        dit_extra_outputs = result.get("extra_outputs", {})
+        # Use the seed list already prepared above (from config.seed or params.seed fallback)
+        # actual_seed_list was computed earlier using dit_handler.prepare_seeds
+        seed_list = actual_seed_list
+        # Get base params dictionary
+        base_params_dict = params.to_dict()
+        # Save audio files using AudioSaver (format from config)
+        audio_format = config.audio_format if config.audio_format else "flac"
+        audio_saver = AudioSaver(default_format=audio_format)
+        # Use handler's temp_dir for saving files
+        if save_dir is not None:
+            os.makedirs(save_dir, exist_ok=True)
+        # Build audios list for GenerationResult with params and save files
+        # Audio saving and UUID generation handled here, outside of handler
+        audios = []
+        for idx, dit_audio in enumerate(dit_audios):
+            # Create a copy of params dict for this audio
+            audio_params = base_params_dict.copy()
+            # Update audio-specific values
+            audio_params["seed"] = seed_list[idx] if idx < len(seed_list) else None
+            # Add audio codes if batch mode
+            if lm_generated_audio_codes_list and idx < len(lm_generated_audio_codes_list):
+                audio_params["audio_codes"] = lm_generated_audio_codes_list[idx]
+            # Get audio tensor and metadata
+            audio_tensor = dit_audio.get("tensor")
+            sample_rate = dit_audio.get("sample_rate", 48000)
+            # Generate UUID for this audio (moved from handler)
+            batch_seed = seed_list[idx] if idx < len(seed_list) else seed_list[0] if seed_list else -1
+            audio_code_str = lm_generated_audio_codes_list[idx] if (
+                lm_generated_audio_codes_list and idx < len(lm_generated_audio_codes_list)) else audio_code_string_to_use
+            if isinstance(audio_code_str, list):
+                audio_code_str = audio_code_str[idx] if idx < len(audio_code_str) else ""
+            audio_key = generate_uuid_from_params(audio_params)
+            # Save audio file (handled outside handler)
+            audio_path = None
+            if audio_tensor is not None and save_dir is not None:
+                try:
+                    audio_file = os.path.join(save_dir, f"{audio_key}.{audio_format}")
+                    audio_path = audio_saver.save_audio(audio_tensor,
+                                                        audio_file,
+                                                        sample_rate=sample_rate,
+                                                        format=audio_format,
+                                                        channels_first=True)
+                except Exception as e:
+                    logger.error(f"[generate_music] Failed to save audio file: {e}")
+                    audio_path = ""  # Fallback to empty path
+            audio_dict = {
+                "path": audio_path or "",  # File path (saved here, not in handler)
+                "tensor": audio_tensor,  # Audio tensor [channels, samples], CPU, float32
+                "key": audio_key,
+                "sample_rate": sample_rate,
+                "params": audio_params,
+            }
+            audios.append(audio_dict)
+        # Merge extra_outputs: include dit_extra_outputs (latents, masks) and add LM metadata
+        extra_outputs = dit_extra_outputs.copy()
+        extra_outputs["lm_metadata"] = lm_generated_metadata
+        # Merge time_costs from both LM and DiT into a unified dictionary
+        unified_time_costs = {}
+        # Add LM time costs (if LM was used)
+        if use_lm and lm_total_time_costs:
+            for key, value in lm_total_time_costs.items():
+                unified_time_costs[f"lm_{key}"] = value
+        # Add DiT time costs (if available)
+        dit_time_costs = dit_extra_outputs.get("time_costs", {})
+        if dit_time_costs:
+            for key, value in dit_time_costs.items():
+                unified_time_costs[f"dit_{key}"] = value
+        # Calculate total pipeline time
+        if unified_time_costs:
+            lm_total = unified_time_costs.get("lm_total_time", 0.0)
+            dit_total = unified_time_costs.get("dit_total_time_cost", 0.0)
+            unified_time_costs["pipeline_total_time"] = lm_total + dit_total
+        # Update extra_outputs with unified time_costs
+        extra_outputs["time_costs"] = unified_time_costs
+        if lm_status:
+            status_message = "\n".join(lm_status) + "\n" + status_message
+        else:
+            status_message = status_message
+        # Create and return GenerationResult
+        return GenerationResult(
+            audios=audios,
+            status_message=status_message,
+            extra_outputs=extra_outputs,
+            success=True,
+            error=None,
+        )
+    except Exception as e:
+        logger.exception("Music generation failed")
+        return GenerationResult(
+            audios=[],
+            status_message=f"Error: {str(e)}",
+            extra_outputs={},
+            success=False,
+            error=str(e),
+        )
+def understand_music(
+    llm_handler,
+    audio_codes: str,
+    temperature: float = 0.85,
+    top_k: Optional[int] = None,
+    top_p: Optional[float] = None,
+    repetition_penalty: float = 1.0,
+    use_constrained_decoding: bool = True,
+    constrained_decoding_debug: bool = False,
+) -> UnderstandResult:
+    """Understand music from audio codes using the 5Hz Language Model.
+    This function analyzes audio semantic codes and generates metadata about the music,
+    including caption, lyrics, BPM, duration, key scale, language, and time signature.
+    If audio_codes is empty or "NO USER INPUT", the LM will generate a sample example
+    instead of analyzing existing codes.
+    Note: cfg_scale and negative_prompt are not supported in understand mode.
+    Args:
+        llm_handler: Initialized LLM handler (LLMHandler instance)
+        audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
+                     Use empty string or "NO USER INPUT" to generate a sample example.
+        temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
+        top_k: Top-K sampling (None or 0 = disabled)
+        top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
+        repetition_penalty: Repetition penalty (1.0 = no penalty)
+        use_constrained_decoding: Whether to use FSM-based constrained decoding for metadata
+        constrained_decoding_debug: Whether to enable debug logging for constrained decoding
+    Returns:
+        UnderstandResult with parsed metadata fields and status
+    Example:
+        >>> result = understand_music(llm_handler, audio_codes="<|audio_code_123|>...")
+        >>> if result.success:
+        ...     print(f"Caption: {result.caption}")
+        ...     print(f"BPM: {result.bpm}")
+        ...     print(f"Lyrics: {result.lyrics}")
+    """
+    # Check if LLM is initialized
+    if not llm_handler.llm_initialized:
+        return UnderstandResult(
+            status_message="5Hz LM not initialized. Please initialize it first.",
+            success=False,
+            error="LLM not initialized",
+        )
+    # If codes are empty, use "NO USER INPUT" to generate a sample example
+    if not audio_codes or not audio_codes.strip():
+        audio_codes = "NO USER INPUT"
+    try:
+        # Call LLM understanding
+        metadata, status = llm_handler.understand_audio_from_codes(
+            audio_codes=audio_codes,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            repetition_penalty=repetition_penalty,
+            use_constrained_decoding=use_constrained_decoding,
+            constrained_decoding_debug=constrained_decoding_debug,
+        )
+        # Check if LLM returned empty metadata (error case)
+        if not metadata:
+            return UnderstandResult(
+                status_message=status or "Failed to understand audio codes",
+                success=False,
+                error=status or "Empty metadata returned",
+            )
+        # Extract and convert fields
+        caption = metadata.get('caption', '')
+        lyrics = metadata.get('lyrics', '')
+        keyscale = metadata.get('keyscale', '')
+        language = metadata.get('language', metadata.get('vocal_language', ''))
+        timesignature = metadata.get('timesignature', '')
+        # Convert BPM to int
+        bpm = None
+        bpm_value = metadata.get('bpm')
+        if bpm_value is not None and bpm_value != 'N/A' and bpm_value != '':
+            try:
+                bpm = int(bpm_value)
+            except (ValueError, TypeError):
+                pass
+        # Convert duration to float
+        duration = None
+        duration_value = metadata.get('duration')
+        if duration_value is not None and duration_value != 'N/A' and duration_value != '':
+            try:
+                duration = float(duration_value)
+            except (ValueError, TypeError):
+                pass
+        # Clean up N/A values
+        if keyscale == 'N/A':
+            keyscale = ''
+        if language == 'N/A':
+            language = ''
+        if timesignature == 'N/A':
+            timesignature = ''
+        return UnderstandResult(
+            caption=caption,
+            lyrics=lyrics,
+            bpm=bpm,
+            duration=duration,
+            keyscale=keyscale,
+            language=language,
+            timesignature=timesignature,
+            status_message=status,
+            success=True,
+            error=None,
+        )
+    except Exception as e:
+        logger.exception("Music understanding failed")
+        return UnderstandResult(
+            status_message=f"Error: {str(e)}",
+            success=False,
+            error=str(e),
+        )
+@dataclass
+class CreateSampleResult:
+    """Result of creating a music sample from a natural language query.
+    This is used by the "Simple Mode" / "Inspiration Mode" feature where users
+    provide a natural language description and the LLM generates a complete
+    sample with caption, lyrics, and metadata.
+    Attributes:
+        # Metadata Fields
+        caption: Generated detailed music description/caption
+        lyrics: Generated lyrics (or "[Instrumental]" for instrumental music)
+        bpm: Beats per minute (None if not generated)
+        duration: Duration in seconds (None if not generated)
+        keyscale: Musical key (e.g., "C Major")
+        language: Vocal language code (e.g., "en", "zh")
+        timesignature: Time signature (e.g., "4")
+        instrumental: Whether this is an instrumental piece
+        # Status
+        status_message: Status message from sample creation
+        success: Whether sample creation completed successfully
+        error: Error message if sample creation failed
+    """
+    # Metadata Fields
+    caption: str = ""
+    lyrics: str = ""
+    bpm: Optional[int] = None
+    duration: Optional[float] = None
+    keyscale: str = ""
+    language: str = ""
+    timesignature: str = ""
+    instrumental: bool = False
+    # Status
+    status_message: str = ""
+    success: bool = True
+    error: Optional[str] = None
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert result to dictionary for JSON serialization."""
+        return asdict(self)
+def create_sample(
+    llm_handler,
+    query: str,
+    instrumental: bool = False,
+    vocal_language: Optional[str] = None,
+    temperature: float = 0.85,
+    top_k: Optional[int] = None,
+    top_p: Optional[float] = None,
+    repetition_penalty: float = 1.0,
+    use_constrained_decoding: bool = True,
+    constrained_decoding_debug: bool = False,
+) -> CreateSampleResult:
+    """Create a music sample from a natural language query using the 5Hz Language Model.
+    This is the "Simple Mode" / "Inspiration Mode" feature that takes a user's natural
+    language description of music and generates a complete sample including:
+    - Detailed caption/description
+    - Lyrics (unless instrumental)
+    - Metadata (BPM, duration, key, language, time signature)
+    Note: cfg_scale and negative_prompt are not supported in create_sample mode.
+    Args:
+        llm_handler: Initialized LLM handler (LLMHandler instance)
+        query: User's natural language music description (e.g., "a soft Bengali love song")
+        instrumental: Whether to generate instrumental music (no vocals)
+        vocal_language: Allowed vocal language for constrained decoding (e.g., "en", "zh").
+                       If provided, the model will be constrained to generate lyrics in this language.
+                       If None or "unknown", no language constraint is applied.
+        temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
+        top_k: Top-K sampling (None or 0 = disabled)
+        top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
+        repetition_penalty: Repetition penalty (1.0 = no penalty)
+        use_constrained_decoding: Whether to use FSM-based constrained decoding
+        constrained_decoding_debug: Whether to enable debug logging
+    Returns:
+        CreateSampleResult with generated sample fields and status
+    Example:
+        >>> result = create_sample(llm_handler, "a soft Bengali love song for a quiet evening", vocal_language="bn")
+        >>> if result.success:
+        ...     print(f"Caption: {result.caption}")
+        ...     print(f"Lyrics: {result.lyrics}")
+        ...     print(f"BPM: {result.bpm}")
+    """
+    import torch
+    # Debug logging for ZeroGPU diagnosis
+    logger.info(f"[create_sample Debug] Entry: IS_HUGGINGFACE_SPACE={IS_HUGGINGFACE_SPACE}")
+    logger.info(f"[create_sample Debug] torch.cuda.is_available()={torch.cuda.is_available()}")
+    if torch.cuda.is_available():
+        logger.info(f"[create_sample Debug] torch.cuda.current_device()={torch.cuda.current_device()}")
+    logger.info(f"[create_sample Debug] llm_handler.device={llm_handler.device}, llm_handler.offload_to_cpu={llm_handler.offload_to_cpu}")
+    if llm_handler.llm is not None:
+        try:
+            logger.info(f"[create_sample Debug] Model device: {next(llm_handler.llm.parameters()).device}")
+        except Exception as e:
+            logger.info(f"[create_sample Debug] Could not get model device: {e}")
+    # Check if LLM is initialized
+    if not llm_handler.llm_initialized:
+        return CreateSampleResult(
+            status_message="5Hz LM not initialized. Please initialize it first.",
+            success=False,
+            error="LLM not initialized",
+        )
+    try:
+        # Call LLM to create sample
+        metadata, status = llm_handler.create_sample_from_query(
+            query=query,
+            instrumental=instrumental,
+            vocal_language=vocal_language,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            repetition_penalty=repetition_penalty,
+            use_constrained_decoding=use_constrained_decoding,
+            constrained_decoding_debug=constrained_decoding_debug,
+        )
+        # Check if LLM returned empty metadata (error case)
+        if not metadata:
+            return CreateSampleResult(
+                status_message=status or "Failed to create sample",
+                success=False,
+                error=status or "Empty metadata returned",
+            )
+        # Extract and convert fields
+        caption = metadata.get('caption', '')
+        lyrics = metadata.get('lyrics', '')
+        keyscale = metadata.get('keyscale', '')
+        language = metadata.get('language', metadata.get('vocal_language', ''))
+        timesignature = metadata.get('timesignature', '')
+        is_instrumental = metadata.get('instrumental', instrumental)
+        # Convert BPM to int
+        bpm = None
+        bpm_value = metadata.get('bpm')
+        if bpm_value is not None and bpm_value != 'N/A' and bpm_value != '':
+            try:
+                bpm = int(bpm_value)
+            except (ValueError, TypeError):
+                pass
+        # Convert duration to float
+        duration = None
+        duration_value = metadata.get('duration')
+        if duration_value is not None and duration_value != 'N/A' and duration_value != '':
+            try:
+                duration = float(duration_value)
+            except (ValueError, TypeError):
+                pass
+        # Clean up N/A values
+        if keyscale == 'N/A':
+            keyscale = ''
+        if language == 'N/A':
+            language = ''
+        if timesignature == 'N/A':
+            timesignature = ''
+        return CreateSampleResult(
+            caption=caption,
+            lyrics=lyrics,
+            bpm=bpm,
+            duration=duration,
+            keyscale=keyscale,
+            language=language,
+            timesignature=timesignature,
+            instrumental=is_instrumental,
+            status_message=status,
+            success=True,
+            error=None,
+        )
+    except Exception as e:
+        logger.exception("Sample creation failed")
+        return CreateSampleResult(
+            status_message=f"Error: {str(e)}",
+            success=False,
+            error=str(e),
+        )
+@dataclass
+class FormatSampleResult:
+    """Result of formatting user-provided caption and lyrics.
+    This is used by the "Format" feature where users provide caption and lyrics,
+    and the LLM formats them into structured music metadata and an enhanced description.
+    Attributes:
+        # Metadata Fields
+        caption: Enhanced/formatted music description/caption
+        lyrics: Formatted lyrics (may be same as input or reformatted)
+        bpm: Beats per minute (None if not detected)
+        duration: Duration in seconds (None if not detected)
+        keyscale: Musical key (e.g., "C Major")
+        language: Vocal language code (e.g., "en", "zh")
+        timesignature: Time signature (e.g., "4")
+        # Status
+        status_message: Status message from formatting
+        success: Whether formatting completed successfully
+        error: Error message if formatting failed
+    """
+    # Metadata Fields
+    caption: str = ""
+    lyrics: str = ""
+    bpm: Optional[int] = None
+    duration: Optional[float] = None
+    keyscale: str = ""
+    language: str = ""
+    timesignature: str = ""
+    # Status
+    status_message: str = ""
+    success: bool = True
+    error: Optional[str] = None
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert result to dictionary for JSON serialization."""
+        return asdict(self)
+def format_sample(
+    llm_handler,
+    caption: str,
+    lyrics: str,
+    user_metadata: Optional[Dict[str, Any]] = None,
+    temperature: float = 0.85,
+    top_k: Optional[int] = None,
+    top_p: Optional[float] = None,
+    repetition_penalty: float = 1.0,
+    use_constrained_decoding: bool = True,
+    constrained_decoding_debug: bool = False,
+) -> FormatSampleResult:
+    """Format user-provided caption and lyrics using the 5Hz Language Model.
+    This function takes user input (caption and lyrics) and generates structured
+    music metadata including an enhanced caption, BPM, duration, key, language,
+    and time signature.
+    If user_metadata is provided, those values will be used to constrain the
+    decoding, ensuring the output matches user-specified values.
+    Note: cfg_scale and negative_prompt are not supported in format mode.
+    Args:
+        llm_handler: Initialized LLM handler (LLMHandler instance)
+        caption: User's caption/description (e.g., "Latin pop, reggaeton")
+        lyrics: User's lyrics with structure tags
+        user_metadata: Optional dict with user-provided metadata to constrain decoding.
+                      Supported keys: bpm, duration, keyscale, timesignature, language
+        temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
+        top_k: Top-K sampling (None or 0 = disabled)
+        top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
+        repetition_penalty: Repetition penalty (1.0 = no penalty)
+        use_constrained_decoding: Whether to use FSM-based constrained decoding for metadata
+        constrained_decoding_debug: Whether to enable debug logging for constrained decoding
+    Returns:
+        FormatSampleResult with formatted metadata fields and status
+    Example:
+        >>> result = format_sample(llm_handler, "Latin pop, reggaeton", "[Verse 1]\\nHola mundo...")
+        >>> if result.success:
+        ...     print(f"Caption: {result.caption}")
+        ...     print(f"BPM: {result.bpm}")
+        ...     print(f"Lyrics: {result.lyrics}")
+    """
+    # Check if LLM is initialized
+    if not llm_handler.llm_initialized:
+        return FormatSampleResult(
+            status_message="5Hz LM not initialized. Please initialize it first.",
+            success=False,
+            error="LLM not initialized",
+        )
+    try:
+        # Call LLM formatting
+        metadata, status = llm_handler.format_sample_from_input(
+            caption=caption,
+            lyrics=lyrics,
+            user_metadata=user_metadata,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            repetition_penalty=repetition_penalty,
+            use_constrained_decoding=use_constrained_decoding,
+            constrained_decoding_debug=constrained_decoding_debug,
+        )
+        # Check if LLM returned empty metadata (error case)
+        if not metadata:
+            return FormatSampleResult(
+                status_message=status or "Failed to format input",
+                success=False,
+                error=status or "Empty metadata returned",
+            )
+        # Extract and convert fields
+        result_caption = metadata.get('caption', '')
+        result_lyrics = metadata.get('lyrics', lyrics)  # Fall back to input lyrics
+        keyscale = metadata.get('keyscale', '')
+        language = metadata.get('language', metadata.get('vocal_language', ''))
+        timesignature = metadata.get('timesignature', '')
+        # Convert BPM to int
+        bpm = None
+        bpm_value = metadata.get('bpm')
+        if bpm_value is not None and bpm_value != 'N/A' and bpm_value != '':
+            try:
+                bpm = int(bpm_value)
+            except (ValueError, TypeError):
+                pass
+        # Convert duration to float
+        duration = None
+        duration_value = metadata.get('duration')
+        if duration_value is not None and duration_value != 'N/A' and duration_value != '':
+            try:
+                duration = float(duration_value)
+            except (ValueError, TypeError):
+                pass
+        # Clean up N/A values
+        if keyscale == 'N/A':
+            keyscale = ''
+        if language == 'N/A':
+            language = ''
+        if timesignature == 'N/A':
+            timesignature = ''
+        return FormatSampleResult(
+            caption=result_caption,
+            lyrics=result_lyrics,
+            bpm=bpm,
+            duration=duration,
+            keyscale=keyscale,
+            language=language,
+            timesignature=timesignature,
+            status_message=status,
+            success=True,
+            error=None,
+        )
+    except Exception as e:
+        logger.exception("Format sample failed")
+        return FormatSampleResult(
+            status_message=f"Error: {str(e)}",
+            success=False,
+            error=str(e),
+        )

acestep/llm_inference.py ADDED Viewed

The diff for this file is too large to render. See raw diff

acestep/local_cache.py ADDED Viewed

	@@ -0,0 +1,129 @@

+"""Local cache module to replace Redis
+Uses diskcache as backend, provides Redis-compatible API.
+Supports persistent storage and TTL expiration.
+"""
+import json
+import os
+from typing import Any, Optional
+from threading import Lock
+try:
+    from diskcache import Cache
+    HAS_DISKCACHE = True
+except ImportError:
+    HAS_DISKCACHE = False
+class LocalCache:
+    """
+    Local cache implementation with Redis-compatible API.
+    Uses diskcache as backend, supports persistence and TTL.
+    """
+    _instance = None
+    _lock = Lock()
+    def __new__(cls, cache_dir: Optional[str] = None):
+        """Singleton pattern"""
+        if cls._instance is None:
+            with cls._lock:
+                if cls._instance is None:
+                    cls._instance = super().__new__(cls)
+                    cls._instance._initialized = False
+        return cls._instance
+    def __init__(self, cache_dir: Optional[str] = None):
+        if getattr(self, '_initialized', False):
+            return
+        if not HAS_DISKCACHE:
+            raise ImportError(
+                "diskcache not installed. Run: pip install diskcache"
+            )
+        if cache_dir is None:
+            cache_dir = os.path.join(
+                os.path.dirname(os.path.dirname(__file__)),
+                ".cache",
+                "local_redis"
+            )
+        os.makedirs(cache_dir, exist_ok=True)
+        self._cache = Cache(cache_dir)
+        self._initialized = True
+    def set(self, name: str, value: Any, ex: Optional[int] = None) -> bool:
+        """
+        Set key-value pair
+        Args:
+            name: Key name
+            value: Value (auto-serialize dict/list)
+            ex: Expiration time (seconds)
+        Returns:
+            bool: Success status
+        """
+        if isinstance(value, (dict, list)):
+            value = json.dumps(value, ensure_ascii=False)
+        self._cache.set(name, value, expire=ex)
+        return True
+    def get(self, name: str) -> Optional[str]:
+        """Get value"""
+        return self._cache.get(name)
+    def delete(self, name: str) -> int:
+        """Delete key, returns number of deleted items"""
+        return 1 if self._cache.delete(name) else 0
+    def exists(self, name: str) -> bool:
+        """Check if key exists"""
+        return name in self._cache
+    def keys(self, pattern: str = "*") -> list:
+        """
+        Get list of matching keys
+        Note: Simplified implementation, only supports prefix and full matching
+        """
+        if pattern == "*":
+            return list(self._cache.iterkeys())
+        prefix = pattern.rstrip("*")
+        return [k for k in self._cache.iterkeys() if k.startswith(prefix)]
+    def expire(self, name: str, seconds: int) -> bool:
+        """Set key expiration time"""
+        value = self._cache.get(name)
+        if value is not None:
+            self._cache.set(name, value, expire=seconds)
+            return True
+        return False
+    def ttl(self, name: str) -> int:
+        """
+        Get remaining time to live (seconds)
+        Note: diskcache does not directly support TTL queries
+        """
+        if name in self._cache:
+            return -1  # Exists but TTL unknown
+        return -2  # Key does not exist
+    def close(self):
+        """Close cache connection"""
+        if hasattr(self, '_cache'):
+            self._cache.close()
+# Lazily initialized global instance
+_local_cache: Optional[LocalCache] = None
+def get_local_cache(cache_dir: Optional[str] = None) -> LocalCache:
+    """Get local cache instance"""
+    global _local_cache
+    if _local_cache is None:
+        _local_cache = LocalCache(cache_dir)
+    return _local_cache

acestep/test_time_scaling.py ADDED Viewed

	@@ -0,0 +1,410 @@

+"""
+Test-Time Scaling Module
+Implements perplexity-based scoring for generated audio codes
+"""
+import torch
+import torch.nn.functional as F
+from typing import Tuple, Optional, Dict, Any, List
+from loguru import logger
+import yaml
+import math
+import re
+def pmi_score(log_prob_conditional: float, log_prob_unconditional: float) -> float:
+    """
+    Calculate Pointwise Mutual Information (PMI) score.
+    PMI = log P(condition|codes) - log P(condition)
+        = log [P(codes|condition) / P(codes)]
+    This removes the bias from P(condition) and measures how much the codes
+    improve our ability to predict the condition.
+    Args:
+        log_prob_conditional: Average log probability of condition given codes
+        log_prob_unconditional: Average log probability of condition without codes
+    Returns:
+        PMI score (higher is better, can be positive or negative)
+        - Positive: codes improve prediction → good match
+        - Zero: codes don't help → no correlation
+        - Negative: codes hurt prediction → poor match
+    """
+    return log_prob_conditional - log_prob_unconditional
+def pmi_to_normalized_score(pmi: float, scale: float = 0.1) -> float:
+    """
+    Convert PMI score to normalized [0, 1] range using sigmoid function.
+    score = sigmoid(PMI / scale) = 1 / (1 + exp(-PMI / scale))
+    Args:
+        pmi: PMI score (can be positive or negative)
+        scale: Scale parameter to control sensitivity (default 0.1)
+               - Smaller scale: more sensitive to PMI changes
+               - Larger scale: less sensitive to PMI changes
+    Returns:
+        Normalized score in [0, 1] range, where:
+        - PMI > 0 → score > 0.5 (good match)
+        - PMI = 0 → score = 0.5 (neutral)
+        - PMI < 0 → score < 0.5 (poor match)
+    Examples (scale=1.0):
+        PMI=2.0  → score≈0.88  (excellent)
+        PMI=1.0  → score≈0.73  (good)
+        PMI=0.0  → score=0.50  (neutral)
+        PMI=-1.0 → score≈0.27  (poor)
+        PMI=-2.0 → score≈0.12  (bad)
+    """
+    return 1.0 / (1.0 + math.exp(-pmi / scale))
+def _get_logits_and_target_for_scoring(llm_handler, formatted_prompt: str,
+                                       target_text: str) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Run one teacher-forced forward pass over ``formatted_prompt + target_text``
+    and return the logits/labels aligned to the target portion only.
+    Args:
+        llm_handler: The handler containing the model and tokenizer.
+        formatted_prompt: The input context.
+        target_text: The text we want to calculate probability/recall for.
+    Returns:
+        Tuple of (target_logits, target_ids)
+        - target_logits: Logits used to predict the target tokens.
+        - target_ids: The ground truth token IDs of the target.
+        Both are empty tensors when the tokenized full text is not longer than
+        the tokenized prompt (empty or fully truncated target).
+    """
+    model = llm_handler.get_hf_model_for_scoring()
+    tokenizer = llm_handler.llm_tokenizer
+    # For the "pt" backend use the handler's device; otherwise take the device
+    # of the scoring model's first parameter.
+    device = llm_handler.device if llm_handler.llm_backend == "pt" else next(model.parameters()).device
+    # 1. Tokenize prompt ONLY to get its length (used for slicing later).
+    #    We must ensure special tokens are added to count the offset correctly.
+    prompt_tokens_temp = tokenizer(formatted_prompt, return_tensors="pt", add_special_tokens=True)
+    prompt_len = prompt_tokens_temp['input_ids'].shape[1]
+    # 2. Tokenize the FULL text (Prompt + Target).
+    #    This ensures subword merging at boundaries is handled correctly by the tokenizer.
+    #    NOTE(review): if tokens merge across the prompt/target boundary, prompt_len
+    #    may not line up exactly with the first target token — confirm this is acceptable.
+    full_text = formatted_prompt + target_text
+    full_tokens = tokenizer(full_text, return_tensors="pt", padding=False, truncation=True, add_special_tokens=True).to(device)
+    input_ids = full_tokens['input_ids']
+    # Safety check: if target was empty or truncated entirely
+    if input_ids.shape[1] <= prompt_len:
+        return torch.empty(0, device=device), torch.empty(0, device=device)
+    # 3. Forward Pass (Teacher Forcing)
+    #    Gradients are disabled; the handler's model context wraps the call.
+    with torch.no_grad():
+        with llm_handler._load_model_context():
+            outputs = model(input_ids=input_ids, attention_mask=full_tokens['attention_mask'])
+            all_logits = outputs.logits  # [1, seq_len, vocab_size]
+    # 4. Extract Logits and Labels
+    #    We need to predict `input_ids[i]`. The logit for this is at `all_logits[i-1]`.
+    #    Target starts at index `prompt_len`.
+    #    So we need logits from `prompt_len - 1` up to the second to last position.
+    target_logits = all_logits[0, prompt_len - 1:-1, :]  # [target_len, vocab_size]
+    target_ids = input_ids[0, prompt_len:]  # [target_len]
+    return target_logits, target_ids
+# ==============================================================================
+# Scoring Logic
+# ==============================================================================
+def _calculate_topk_recall(llm_handler,
+                           formatted_prompt: str,
+                           target_text: str,
+                           topk: int = 10) -> Tuple[float, Dict[int, float]]:
+    """
+    Calculate top-k recall for target text given prompt.
+    Checks if the ground truth token is within the top-k probabilities at each step.
+    """
+    # Use the fixed helper to get aligned logits/labels
+    pred_logits, target_ids = _get_logits_and_target_for_scoring(llm_handler, formatted_prompt, target_text)
+    if target_ids.shape[0] == 0:
+        return 0.0, {}
+    target_len = target_ids.shape[0]
+    # Get top-k indices for all positions at once
+    # topk_indices: [target_len, topk]
+    _, topk_indices = torch.topk(pred_logits, k=min(topk, pred_logits.shape[-1]), dim=-1)
+    recall_per_k = {}
+    position_scores = []
+    # Convert to list for faster CPU iteration
+    target_ids_list = target_ids.tolist()
+    topk_indices_list = topk_indices.tolist()
+    for k in range(1, topk + 1):
+        hits = 0
+        for pos in range(target_len):
+            gt_token = target_ids_list[pos]
+            # Check the top-k slice
+            topk_at_pos = topk_indices_list[pos][:k]
+            if gt_token in topk_at_pos:
+                hits += 1
+                # Calculate position-weighted score only once (when k=topk)
+                if k == topk:
+                    rank = topk_at_pos.index(gt_token) + 1
+                    # Rank 1 = 1.0, Rank k = small positive
+                    position_weight = 1.0 - (rank - 1) / topk
+                    position_scores.append(position_weight)
+        recall_per_k[k] = hits / target_len if target_len > 0 else 0.0
+    # Fill scores for positions where GT was NOT in top-k
+    while len(position_scores) < target_len:
+        position_scores.append(0.0)
+    average_recall = sum(position_scores) / len(position_scores) if position_scores else 0.0
+    return average_recall, recall_per_k
+def _calculate_metadata_recall(llm_handler,
+                               formatted_prompt: str,
+                               fields_dict: Dict[str, Any],
+                               topk: int = 10) -> Dict[str, float]:
+    """
+    Args:
+        fields_dict: Dictionary of {field_name: field_value}
+    """
+    if not fields_dict:
+        return {}
+    field_scores = {}
+    for field_name in sorted(fields_dict.keys()):
+        # Construct target text for this specific field
+        # e.g. <think>\nbpm: 120\n</think>\n
+        field_yaml = yaml.dump({field_name: fields_dict[field_name]}, allow_unicode=True, sort_keys=True).strip()
+        field_target_text = f"<think>\n{field_yaml}\n</think>\n"
+        # Calculate recall using the robust logic
+        avg_score, _ = _calculate_topk_recall(llm_handler, formatted_prompt, field_target_text, topk=topk)
+        field_scores[field_name] = avg_score
+        logger.debug(f"Recall for {field_name}: {avg_score:.4f}")
+    return field_scores
+def _calculate_log_prob(
+        llm_handler,
+        formatted_prompt: str,
+        target_text: str,
+        temperature: float = 1.0  # Kept for API compatibility, but ignored for scoring
+) -> float:
+    """
+    Calculate average log probability of target text given prompt.
+    """
+    pred_logits, target_ids = _get_logits_and_target_for_scoring(llm_handler, formatted_prompt, target_text)
+    if target_ids.shape[0] == 0:
+        return float('-inf')
+    # FIX: Do not divide by temperature.
+    # Log-probability for PMI/Perplexity should be exact.
+    # Calculate log probabilities (log_softmax)
+    log_probs = F.log_softmax(pred_logits, dim=-1)  # [target_len, vocab_size]
+    # Gather log probabilities of the ground truth tokens
+    target_log_probs = log_probs[torch.arange(target_ids.shape[0]), target_ids]
+    # Return average log probability
+    mean_log_prob = target_log_probs.mean().item()
+    return mean_log_prob
+def calculate_reward_score(
+    scores: Dict[str, float],
+    weights_config: Optional[Dict[str, float]] = None
+) -> Tuple[float, str]:
+    """
+    Reward Model Calculator: Computes a final reward based on user priorities.
+    Priority Logic:
+        1. Caption (Highest): The overall vibe/style must match.
+        2. Lyrics (Medium): Content accuracy is important but secondary to vibe.
+        3. Metadata (Lowest): Technical constraints (BPM, Key) allow for slight deviations.
+    Strategy: Dynamic Weighted Sum
+    - Metadata fields are aggregated into a single 'metadata' score first.
+    - Weights are dynamically renormalized if any component (e.g., lyrics) is missing.
+    Args:
+        scores: Dictionary of raw scores (0.0 - 1.0) from the evaluation module.
+        weights_config: Optional custom weights. Defaults to:
+                        Caption (50%), Lyrics (30%), Metadata (20%).
+    Returns:
+        final_reward: The calculated reward score (0.0 - 1.0).
+        explanation: A formatted string explaining how the score was derived.
+    """
+    # 1. Default Preference Configuration
+    # These weights determine the relative importance of each component.
+    if weights_config is None:
+        weights_config = {
+            'caption': 0.50,  # High priority: Style/Vibe
+            'lyrics':  0.30,  # Medium priority: Content
+            'metadata': 0.20  # Low priority: Technical details
+        }
+    # 2. Extract and Group Scores
+    # Caption and Lyrics are standalone high-level features.
+    caption_score = scores.get('caption')
+    lyrics_score = scores.get('lyrics')
+    # Metadata fields (bpm, key, duration, etc.) are aggregated.
+    # We treat them as a single "Technical Score" to prevent them from
+    # diluting the weight of Caption/Lyrics simply by having many fields.
+    meta_scores_list = [
+        val for key, val in scores.items()
+        if key not in ['caption', 'lyrics']
+    ]
+    # Calculate average of all metadata fields (if any exist)
+    meta_aggregate_score = None
+    if meta_scores_list:
+        meta_aggregate_score = sum(meta_scores_list) / len(meta_scores_list)
+    # 3. specific Active Components & Dynamic Weighting
+    # We only include components that actually exist in this generation.
+    active_components = {}
+    if caption_score is not None:
+        active_components['caption'] = (caption_score, weights_config['caption'])
+    if lyrics_score is not None:
+        active_components['lyrics'] = (lyrics_score, weights_config['lyrics'])
+    if meta_aggregate_score is not None:
+        active_components['metadata'] = (meta_aggregate_score, weights_config['metadata'])
+    # 4. Calculate Final Weighted Score
+    total_base_weight = sum(w for _, w in active_components.values())
+    total_score = 0.0
+    breakdown_lines = []
+    if total_base_weight == 0:
+        return 0.0, "❌ No valid scores available to calculate reward."
+    # Sort by weight (importance) for display
+    sorted_components = sorted(active_components.items(), key=lambda x: x[1][1], reverse=True)
+    for name, (score, base_weight) in sorted_components:
+        # Renormalize weight: If lyrics are missing, caption/metadata weights scale up proportionately.
+        normalized_weight = base_weight / total_base_weight
+        weighted_contribution = score * normalized_weight
+        total_score += weighted_contribution
+        breakdown_lines.append(
+            f"  • {name.title():<8} | Score: {score:.4f} | Weight: {normalized_weight:.2f} "
+            f"-> Contrib: +{weighted_contribution:.4f}"
+        )
+    return total_score, "\n".join(breakdown_lines)
+# ==============================================================================
+# Main Public API
+# ==============================================================================
+def calculate_pmi_score_per_condition(
+    llm_handler,
+    audio_codes: str,
+    caption: str = "",
+    lyrics: str = "",
+    metadata: Optional[Dict[str, Any]] = None,
+    temperature: float = 1.0,
+    topk: int = 10,
+    score_scale: float = 0.1,
+) -> Tuple[Dict[str, float], float, str]:
+    """
+    Calculate quality score separately for each condition.
+    - Metadata: Uses Top-k Recall.
+    - Caption/Lyrics: Uses PMI (Normalized).
+    """
+    if not llm_handler.llm_initialized:
+        return {}, 0.0, "❌ LLM not initialized"
+    if not audio_codes or not audio_codes.strip():
+        return {}, 0.0, "❌ No audio codes provided"
+    if "caption" not in metadata:
+        metadata['caption'] = caption
+    formatted_prompt = llm_handler.build_formatted_prompt_for_understanding(audio_codes=audio_codes, is_negative_prompt=False)
+    prompt_uncond = llm_handler.build_formatted_prompt_for_understanding(audio_codes="NO USER INPUT", is_negative_prompt=False)
+    try:
+        # 1. Calculate Recall for Metadata Fields
+        if metadata and isinstance(metadata, dict):
+            scores = {}
+            # Define which fields use which metric
+            metadata_recall_keys = ['bpm', 'duration', 'genres', 'keyscale', 'language', 'timesignature']
+            metadata_pmi_keys = ['caption']
+            for key in metadata_recall_keys:
+                if key in metadata and metadata[key] is not None:
+                    recall_metadata = {key: metadata[key]}
+                    field_scores = _calculate_metadata_recall(llm_handler, formatted_prompt, recall_metadata, topk=topk)
+                    scores.update(field_scores)
+            # 2. Calculate PMI for Caption
+            for key in metadata_pmi_keys:
+                if key in metadata and metadata[key] is not None:
+                    cot_yaml = yaml.dump({key: metadata[key]}, allow_unicode=True, sort_keys=True).strip()
+                    target_text = f"<think>\n{cot_yaml}\n</think>\n"
+                    log_prob_cond = _calculate_log_prob(llm_handler, formatted_prompt, target_text)
+                    log_prob_uncond = _calculate_log_prob(llm_handler, prompt_uncond, target_text)
+                    pmi_normalized = pmi_to_normalized_score(log_prob_cond - log_prob_uncond, scale=score_scale)
+                    scores[key] = pmi_normalized
+        # 3. Calculate PMI for Lyrics
+        if lyrics:
+            target_text = f"<think>\n</think>\n# Lyric\n{lyrics}\n"
+            log_prob_cond = _calculate_log_prob(llm_handler, formatted_prompt, target_text)
+            prompt_uncond = llm_handler.build_formatted_prompt_for_understanding(audio_codes="NO USER INPUT", is_negative_prompt=False)
+            log_prob_uncond = _calculate_log_prob(llm_handler, prompt_uncond, target_text)
+            scores['lyrics'] = pmi_to_normalized_score(log_prob_cond - log_prob_uncond, scale=score_scale)
+        if not scores:
+            return {}, 0.0, "❌ No conditions to evaluate"
+        # 4. Global Score
+        global_score = sum(scores.values()) / len(scores)
+        global_score, breakdown_lines = calculate_reward_score(scores)
+        # Status Message
+        status_lines = [breakdown_lines, "\n✅ Per-condition scores (0-1):"]
+        for key, score in sorted(scores.items()):
+            metric = "Top-k Recall" if key in metadata_recall_keys else "PMI (Norm)"
+            status_lines.append(f"  {key}: {score:.4f} ({metric})")
+        status = "\n".join(status_lines)
+        logger.info(f"Calculated scores: {global_score:.4f}\n{status}")
+        return scores, global_score, status
+    except Exception as e:
+        import traceback
+        error_msg = f"❌ Error: {str(e)}"
+        logger.error(error_msg)
+        logger.error(traceback.format_exc())
+        return {}, float('-inf'), error_msg

acestep/third_parts/nano-vllm/LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2025 Xingkai Yu
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

acestep/third_parts/nano-vllm/README.md ADDED Viewed

	@@ -0,0 +1,66 @@

+<p align="center">
+<img width="300" src="assets/logo.png">
+</p>
+<p align="center">
+<a href="https://trendshift.io/repositories/15323" target="_blank"><img src="https://trendshift.io/api/badge/repositories/15323" alt="GeeeekExplorer%2Fnano-vllm | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
+</p>
+# Nano-vLLM
+A lightweight vLLM implementation built from scratch.
+## Key Features
+* 🚀 **Fast offline inference** - Comparable inference speeds to vLLM
+* 📖 **Readable codebase** - Clean implementation in ~ 1,200 lines of Python code
+* ⚡ **Optimization Suite** - Prefix caching, Tensor Parallelism, Torch compilation, CUDA graph, etc.
+## Installation
+```bash
+pip install git+https://github.com/GeeeekExplorer/nano-vllm.git
+```
+## Model Download
+To download the model weights manually, use the following command:
+```bash
+huggingface-cli download --resume-download Qwen/Qwen3-0.6B \
+  --local-dir ~/huggingface/Qwen3-0.6B/ \
+  --local-dir-use-symlinks False
+```
+## Quick Start
+See `example.py` for usage. The API mirrors vLLM's interface with minor differences in the `LLM.generate` method:
+```python
+from nanovllm import LLM, SamplingParams
+llm = LLM("/YOUR/MODEL/PATH", enforce_eager=True, tensor_parallel_size=1)
+sampling_params = SamplingParams(temperature=0.6, max_tokens=256)
+prompts = ["Hello, Nano-vLLM."]
+outputs = llm.generate(prompts, sampling_params)
+outputs[0]["text"]
+```
+## Benchmark
+See `bench.py` for benchmark.
+**Test Configuration:**
+- Hardware: RTX 4070 Laptop (8GB)
+- Model: Qwen3-0.6B
+- Total Requests: 256 sequences
+- Input Length: Randomly sampled between 100–1024 tokens
+- Output Length: Randomly sampled between 100–1024 tokens
+**Performance Results:**
+| Inference Engine | Output Tokens | Time (s) | Throughput (tokens/s) |
+|----------------|-------------|----------|-----------------------|
+| vLLM           | 133,966     | 98.37    | 1361.84               |
+| Nano-vLLM      | 133,966     | 93.41    | 1434.13               |
+## Star History
+[![Star History Chart](https://api.star-history.com/svg?repos=GeeeekExplorer/nano-vllm&type=Date)](https://www.star-history.com/#GeeeekExplorer/nano-vllm&Date)

acestep/third_parts/nano-vllm/bench.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import os
+import time
+from random import randint, seed
+from nanovllm import LLM, SamplingParams
+# from vllm import LLM, SamplingParams
+def main():
+    seed(0)
+    num_seqs = 256
+    max_input_len = 1024
+    max_ouput_len = 1024
+    path = os.path.expanduser("~/huggingface/Qwen3-0.6B/")
+    llm = LLM(path, enforce_eager=False, max_model_len=4096)
+    prompt_token_ids = [[randint(0, 10000) for _ in range(randint(100, max_input_len))] for _ in range(num_seqs)]
+    sampling_params = [SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=randint(100, max_ouput_len)) for _ in range(num_seqs)]
+    # uncomment the following line for vllm
+    # prompt_token_ids = [dict(prompt_token_ids=p) for p in prompt_token_ids]
+    llm.generate(["Benchmark: "], SamplingParams())
+    t = time.time()
+    llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
+    t = (time.time() - t)
+    total_tokens = sum(sp.max_tokens for sp in sampling_params)
+    throughput = total_tokens / t
+    print(f"Total: {total_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
+if __name__ == "__main__":
+    main()

acestep/third_parts/nano-vllm/example.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import os
+from nanovllm import LLM, SamplingParams
+from transformers import AutoTokenizer
+def main():
+    path = os.path.expanduser("~/huggingface/Qwen3-0.6B/")
+    tokenizer = AutoTokenizer.from_pretrained(path)
+    llm = LLM(path, enforce_eager=True, tensor_parallel_size=1)
+    sampling_params = SamplingParams(temperature=0.6, max_tokens=256)
+    prompts = [
+        "introduce yourself",
+        "list all prime numbers within 100",
+    ]
+    prompts = [
+        tokenizer.apply_chat_template(
+            [{"role": "user", "content": prompt}],
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+        for prompt in prompts
+    ]
+    outputs = llm.generate(prompts, sampling_params)
+    for prompt, output in zip(prompts, outputs):
+        print("\n")
+        print(f"Prompt: {prompt!r}")
+        print(f"Completion: {output['text']!r}")
+if __name__ == "__main__":
+    main()

acestep/third_parts/nano-vllm/nanovllm/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from nanovllm.llm import LLM
2	+ from nanovllm.sampling_params import SamplingParams

acestep/third_parts/nano-vllm/nanovllm/config.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import os
+from dataclasses import dataclass
+from transformers import AutoConfig
+@dataclass
+class Config:
+    model: str
+    max_num_batched_tokens: int = 16384
+    max_num_seqs: int = 512
+    max_model_len: int = 4096
+    gpu_memory_utilization: float = 0.9
+    tensor_parallel_size: int = 1
+    enforce_eager: bool = False
+    hf_config: AutoConfig | None = None
+    eos: int = -1
+    kvcache_block_size: int = 256
+    num_kvcache_blocks: int = -1
+    def __post_init__(self):
+        assert os.path.isdir(self.model)
+        assert self.kvcache_block_size % 256 == 0
+        assert 1 <= self.tensor_parallel_size <= 8
+        self.hf_config = AutoConfig.from_pretrained(self.model)
+        self.max_model_len = min(self.max_model_len, self.hf_config.max_position_embeddings)
+        assert self.max_num_batched_tokens >= self.max_model_len

acestep/third_parts/nano-vllm/nanovllm/engine/block_manager.py ADDED Viewed

	@@ -0,0 +1,119 @@

+from collections import deque
+import xxhash
+import numpy as np
+from nanovllm.engine.sequence import Sequence
+class Block:
+    def __init__(self, block_id):
+        self.block_id = block_id
+        self.ref_count = 0
+        self.hash = -1
+        self.token_ids = []
+    def update(self, hash: int, token_ids: list[int]):
+        self.hash = hash
+        self.token_ids = token_ids
+    def reset(self):
+        self.ref_count = 1
+        self.hash = -1
+        self.token_ids = []
+class BlockManager:
+    """
+    Tracks which KV-cache blocks are free or in use and maps content hashes of
+    full blocks to block ids so sequences with identical prefixes can share them.
+    """
+    def __init__(self, num_blocks: int, block_size: int):
+        self.block_size = block_size
+        self.blocks: list[Block] = [Block(i) for i in range(num_blocks)]
+        # Content hash of a full block -> id of the block holding that content.
+        self.hash_to_block_id: dict[int, int] = dict()
+        # All blocks start out free.
+        self.free_block_ids: deque[int] = deque(range(num_blocks))
+        self.used_block_ids: set[int] = set()
+    @classmethod
+    def compute_hash(cls, token_ids: list[int], prefix: int = -1):
+        """
+        Return the xxh64 digest of `token_ids`, chained with `prefix` (the
+        previous block's hash) unless `prefix` is -1.
+        """
+        h = xxhash.xxh64()
+        if prefix != -1:
+            h.update(prefix.to_bytes(8, "little"))
+        h.update(np.array(token_ids).tobytes())
+        return h.intdigest()
+    def _allocate_block(self, block_id: int) -> Block:
+        """Move an unreferenced block from the free list to the used set and reset it."""
+        block = self.blocks[block_id]
+        assert block.ref_count == 0
+        # reset() clears hash/tokens and sets ref_count to 1.
+        block.reset()
+        self.free_block_ids.remove(block_id)
+        self.used_block_ids.add(block_id)
+        return self.blocks[block_id]
+    def _deallocate_block(self, block_id: int) -> Block:
+        """Return an unreferenced block to the free list."""
+        # NOTE(review): annotated as returning Block, but no value is returned.
+        assert self.blocks[block_id].ref_count == 0
+        self.used_block_ids.remove(block_id)
+        self.free_block_ids.append(block_id)
+    def can_allocate(self, seq: Sequence) -> bool:
+        """True when there are at least as many free blocks as `seq` has blocks."""
+        return len(self.free_block_ids) >= seq.num_blocks
+    def allocate(self, seq: Sequence):
+        """
+        Build the block table for a sequence that has none yet, reusing cached
+        blocks for the leading run of full blocks whose hash and tokens match.
+        """
+        assert not seq.block_table
+        h = -1
+        # Sticky: after the first miss every later block is freshly allocated.
+        cache_miss = False
+        for i in range(seq.num_blocks):
+            token_ids = seq.block(i)
+            # Only full blocks are hashed; a partial block gets -1.
+            h = self.compute_hash(token_ids, h) if len(token_ids) == self.block_size else -1
+            block_id = self.hash_to_block_id.get(h, -1)
+            # Compare token ids as well, so a hash collision is not treated as a hit.
+            if block_id == -1 or self.blocks[block_id].token_ids != token_ids:
+                cache_miss = True
+            if cache_miss:
+                block_id = self.free_block_ids[0]
+                block = self._allocate_block(block_id)
+            else:
+                seq.num_cached_tokens += self.block_size
+                if block_id in self.used_block_ids:
+                    # Block is live in another sequence: share it.
+                    block = self.blocks[block_id]
+                    block.ref_count += 1
+                else:
+                    block = self._allocate_block(block_id)
+            if h != -1:
+                block.update(h, token_ids)
+                self.hash_to_block_id[h] = block_id
+            seq.block_table.append(block_id)
+    def deallocate(self, seq: Sequence):
+        """Drop the sequence's references to its blocks, freeing those that reach zero."""
+        for block_id in reversed(seq.block_table):
+            block = self.blocks[block_id]
+            block.ref_count -= 1
+            if block.ref_count == 0:
+                # Fix: Clean up hash_to_block_id mapping to prevent stale references
+                # This prevents CUDA illegal memory access when prefix cache tries to
+                # reuse a block_id that has already been freed
+                if block.hash != -1:
+                    cached_id = self.hash_to_block_id.get(block.hash)
+                    # Only remove the entry if it still points at this block.
+                    if cached_id == block_id:
+                        del self.hash_to_block_id[block.hash]
+                self._deallocate_block(block_id)
+        seq.num_cached_tokens = 0
+        seq.block_table.clear()
+    def can_append(self, seq: Sequence) -> bool:
+        """True when appending can proceed: a free block is needed only when len(seq) % block_size == 1."""
+        # The boolean on the right is compared as 0 or 1.
+        return len(self.free_block_ids) >= (len(seq) % self.block_size == 1)
+    def may_append(self, seq: Sequence):
+        """Update block bookkeeping for the sequence's current length."""
+        block_table = seq.block_table
+        last_block = self.blocks[block_table[-1]]
+        if len(seq) % self.block_size == 1:
+            # First token past a block boundary: the previous block must already
+            # be hashed; take a new block for the new token.
+            assert last_block.hash != -1
+            block_id = self.free_block_ids[0]
+            self._allocate_block(block_id)
+            block_table.append(block_id)
+        elif len(seq) % self.block_size == 0:
+            # Last block just became full: hash it (chained with the previous
+            # block's hash) and register it in the hash map.
+            assert last_block.hash == -1
+            token_ids = seq.block(seq.num_blocks-1)
+            prefix = self.blocks[block_table[-2]].hash if len(block_table) > 1 else -1
+            h = self.compute_hash(token_ids, prefix)
+            last_block.update(h, token_ids)
+            self.hash_to_block_id[h] = last_block.block_id
+        else:
+            # Last block is partially filled and must not be hashed yet.
+            assert last_block.hash == -1

acestep/third_parts/nano-vllm/nanovllm/engine/llm_engine.py ADDED Viewed

	@@ -0,0 +1,178 @@

+import atexit
+import threading
+from dataclasses import fields
+from time import perf_counter
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer
+import torch.multiprocessing as mp
+from nanovllm.config import Config
+from nanovllm.sampling_params import SamplingParams
+from nanovllm.engine.sequence import Sequence
+from nanovllm.engine.scheduler import Scheduler
+from nanovllm.engine.model_runner import ModelRunner
+class LLMEngine:
+    def __init__(self, model, **kwargs):
+        """
+        Build the engine: config, worker processes, rank-0 model runner,
+        tokenizer and scheduler.
+        Args:
+            model: Passed to ``Config`` as the model path.
+            **kwargs: Entries whose names match ``Config`` fields are forwarded
+                to ``Config``; an optional ``tokenizer`` entry is used instead
+                of loading one from the model path.
+        """
+        # Forward only the kwargs that are actual Config fields.
+        config_fields = {field.name for field in fields(Config)}
+        config_kwargs = {k: v for k, v in kwargs.items() if k in config_fields}
+        config = Config(model, **config_kwargs)
+        self.ps = []
+        self.events = []
+        # Thread-safety lock for generate().
+        # The scheduler, block manager, model runner, and CUDA graph buffers are all
+        # shared mutable state that is NOT thread-safe. In concurrent serving scenarios
+        # (API server with ThreadPoolExecutor, multiple queue workers, Gradio with
+        # concurrent requests), multiple threads can call generate() simultaneously.
+        # Without this lock, concurrent access corrupts scheduler state, block tables,
+        # and CUDA graph input buffers, leading to intermittent CUDA device-side
+        # assertion failures (illegal memory access in KV cache).
+        self._generate_lock = threading.Lock()
+        # One spawned process and one event per rank 1..tensor_parallel_size-1;
+        # rank 0 runs in this process.
+        ctx = mp.get_context("spawn")
+        for i in range(1, config.tensor_parallel_size):
+            event = ctx.Event()
+            process = ctx.Process(target=ModelRunner, args=(config, i, event))
+            process.start()
+            self.ps.append(process)
+            self.events.append(event)
+        self.model_runner = ModelRunner(config, 0, self.events)
+        tokenizer = kwargs.get("tokenizer", None)
+        if tokenizer is not None:
+            self.tokenizer = tokenizer
+        else:
+            self.tokenizer = AutoTokenizer.from_pretrained(config.model, use_fast=True)
+        # The scheduler is created after eos is set on the config.
+        config.eos = self.tokenizer.eos_token_id
+        self.scheduler = Scheduler(config)
+        # Ensure exit() runs at interpreter shutdown.
+        atexit.register(self.exit)
+    def exit(self):
+        self.model_runner.call("exit")
+        del self.model_runner
+        for p in self.ps:
+            p.join()
+    def add_request(self, prompt: str | list[int], sampling_params: SamplingParams, unconditional_prompt: str | list[int] | None = None):
+        if isinstance(prompt, str):
+            prompt = self.tokenizer.encode(prompt)
+        # For CFG: if cfg_scale > 1.0, create both conditional and unconditional sequences
+        if sampling_params.cfg_scale > 1.0:
+            if unconditional_prompt is None:
+                # Try to construct unconditional prompt by replacing user input with "NO USER INPUT"
+                # This is a fallback - ideally users should provide unconditional_prompt
+                if isinstance(prompt, list):
+                    # For now, just use the same prompt (user should provide unconditional_prompt)
+                    # TODO: Implement automatic "NO USER INPUT" replacement if possible
+                    unconditional_prompt = prompt
+                else:
+                    unconditional_prompt = prompt
+            if isinstance(unconditional_prompt, str):
+                unconditional_prompt = self.tokenizer.encode(unconditional_prompt)
+            # Create unconditional sequence first (so we can reference it from conditional)
+            uncond_seq = Sequence(unconditional_prompt, sampling_params, is_unconditional=True)
+            # Create conditional sequence with reference to unconditional
+            cond_seq = Sequence(prompt, sampling_params, is_unconditional=False, conditional_seq=uncond_seq)
+            uncond_seq.paired_seq = cond_seq  # Link them bidirectionally
+            # Add both sequences to scheduler
+            self.scheduler.add(cond_seq)
+            self.scheduler.add(uncond_seq)
+        else:
+            seq = Sequence(prompt, sampling_params)
+            self.scheduler.add(seq)
+    def step(self):
+        seqs, is_prefill = self.scheduler.schedule()
+        token_ids = self.model_runner.call("run", seqs, is_prefill)
+        self.scheduler.postprocess(seqs, token_ids)
+        # Only output conditional sequences (unconditional sequences are just for CFG computation)
+        output_seqs = [seq for seq in seqs if seq.is_finished and (seq.cfg_scale <= 1.0 or not seq.is_unconditional)]
+        outputs = [(seq.seq_id, seq.completion_token_ids) for seq in output_seqs]
+        num_tokens = sum(len(seq) for seq in seqs) if is_prefill else -len([s for s in seqs if not s.is_unconditional])
+        return outputs, num_tokens
+    def is_finished(self):
+        return self.scheduler.is_finished()
+    def reset(self):
+        """
+        Reset the scheduler state and release all allocated blocks.
+        This should be called when an exception occurs during generation to prevent
+        KV cache block leaks that can cause 'deque index out of range' errors.
+        """
+        # Deallocate all running sequences
+        while self.scheduler.running:
+            seq = self.scheduler.running.popleft()
+            if seq.block_table:  # Only deallocate if blocks are allocated
+                self.scheduler.block_manager.deallocate(seq)
+        # Deallocate all waiting sequences (they might have blocks from preemption)
+        while self.scheduler.waiting:
+            seq = self.scheduler.waiting.popleft()
+            if seq.block_table:
+                self.scheduler.block_manager.deallocate(seq)
+    def generate(
+        self,
+        prompts: list[str] | list[list[int]],
+        sampling_params: SamplingParams | list[SamplingParams],
+        use_tqdm: bool = True,
+        unconditional_prompts: list[str] | list[list[int]] | None = None,
+    ) -> list[str]:
+        # Serialize access to the engine to prevent concurrent corruption of
+        # scheduler state, block manager, CUDA graph buffers, and KV cache.
+        # This is the primary defense against the intermittent CUDA device-side
+        # assertion error that occurs in concurrent serving scenarios.
+        with self._generate_lock:
+            return self._generate_impl(prompts, sampling_params, use_tqdm, unconditional_prompts)
+    def _generate_impl(
+        self,
+        prompts: list[str] | list[list[int]],
+        sampling_params: SamplingParams | list[SamplingParams],
+        use_tqdm: bool = True,
+        unconditional_prompts: list[str] | list[list[int]] | None = None,
+    ) -> list[str]:
+        # Clean up any residual state from previous interrupted generations
+        # This prevents 'deque index out of range' errors from accumulated block leaks
+        if not self.is_finished():
+            self.reset()
+        if use_tqdm:
+            pbar = tqdm(total=len(prompts), desc="Generating", dynamic_ncols=True)
+        if not isinstance(sampling_params, list):
+            sampling_params = [sampling_params] * len(prompts)
+        if unconditional_prompts is None:
+            unconditional_prompts = [None] * len(prompts)
+        for prompt, sp, uncond_prompt in zip(prompts, sampling_params, unconditional_prompts):
+            self.add_request(prompt, sp, uncond_prompt)
+        outputs = {}
+        prefill_throughput = decode_throughput = 0.
+        try:
+            while not self.is_finished():
+                t = perf_counter()
+                output, num_tokens = self.step()
+                if use_tqdm:
+                    if num_tokens > 0:
+                        prefill_throughput = num_tokens / (perf_counter() - t)
+                    else:
+                        decode_throughput = -num_tokens / (perf_counter() - t)
+                    pbar.set_postfix({
+                        "Prefill": f"{int(prefill_throughput)}tok/s",
+                        "Decode": f"{int(decode_throughput)}tok/s",
+                    })
+                for seq_id, token_ids in output:
+                    outputs[seq_id] = token_ids
+                    if use_tqdm:
+                        pbar.update(1)
+        except Exception:
+            # Clean up on exception to prevent block leaks
+            self.reset()
+            raise
+        finally:
+            if use_tqdm:
+                pbar.close()
+        outputs = [outputs[seq_id] for seq_id in sorted(outputs.keys())]
+        outputs = [{"text": self.tokenizer.decode(token_ids), "token_ids": token_ids} for token_ids in outputs]
+        return outputs

acestep/third_parts/nano-vllm/nanovllm/engine/model_runner.py ADDED Viewed

	@@ -0,0 +1,543 @@

+import pickle
+import torch
+import torch.distributed as dist
+from multiprocessing.synchronize import Event
+from multiprocessing.shared_memory import SharedMemory
+import sys
+from nanovllm.config import Config
+from nanovllm.engine.sequence import Sequence
+from nanovllm.models.qwen3 import Qwen3ForCausalLM
+from nanovllm.layers.sampler import Sampler
+from nanovllm.utils.context import set_context, get_context, reset_context
+from nanovllm.utils.loader import load_model
+import socket
+def find_available_port(start_port: int = 2333, max_attempts: int = 100) -> int:
+    """Find an available port starting from start_port.
+    Args:
+        start_port: The starting port number to check
+        max_attempts: Maximum number of ports to try
+    Returns:
+        An available port number
+    Raises:
+        RuntimeError: If no available port is found within max_attempts
+    """
+    for i in range(max_attempts):
+        port = start_port + i
+        try:
+            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+                s.bind(('localhost', port))
+                return port
+        except OSError:
+            # Port is in use, try next one
+            continue
+    raise RuntimeError(f"Could not find an available port starting from {start_port} after {max_attempts} attempts")
+class ModelRunner:
+    def __init__(self, config: Config, rank: int, event: Event | list[Event]):
+        """Set up the process group, load the model and prepare GPU buffers.
+
+        Args:
+            config: Engine configuration (model path, HF config, KV cache block
+                size, tensor-parallel size, eager flag, ...).
+            rank: Rank of this process; also used as the CUDA device index.
+            event: Synchronisation event(s) for the shared-memory channel.
+                ``write_shm`` iterates over it on rank 0 and ``read_shm`` waits
+                on it on the other ranks.
+
+        With ``world_size > 1`` a non-zero rank never returns from this
+        constructor until ``loop()`` ends.
+        """
+        # Enable capturing scalar outputs to avoid graph breaks from Tensor.item() calls
+        torch._dynamo.config.capture_scalar_outputs = True
+        self.config = config
+        hf_config = config.hf_config
+        self.block_size = config.kvcache_block_size
+        self.enforce_eager = config.enforce_eager
+        self.world_size = config.tensor_parallel_size
+        self.rank = rank
+        self.event = event
+        # NOTE(review): every process probes for a port on its own; with
+        # world_size > 1, confirm that all ranks end up on the same port.
+        dist_port = find_available_port()
+        print(f"[debug]dist_port: {dist_port}")
+        # Use gloo backend on Windows, nccl on Linux/other platforms
+        backend = "gloo" if sys.platform == "win32" else "nccl"
+        dist.init_process_group(backend, f"tcp://127.0.0.1:{dist_port}", world_size=self.world_size, rank=rank)
+        torch.cuda.set_device(rank)
+        # Remember the global default dtype so it can be restored below.
+        default_dtype = torch.get_default_dtype()
+        # Use dtype instead of deprecated torch_dtype
+        config_dtype = getattr(hf_config, 'dtype', getattr(hf_config, 'torch_dtype', torch.float32))
+        # While these defaults are active, tensors created without an explicit
+        # dtype/device use the model dtype on CUDA.
+        torch.set_default_dtype(config_dtype)
+        torch.set_default_device("cuda")
+        self.model = Qwen3ForCausalLM(hf_config)
+        load_model(self.model, config.model)
+        self.sampler = Sampler()
+        # Pre-allocate buffers for sampling (optimization: avoid repeated tensor creation)
+        # Must be called before warmup_model() since it uses these buffers
+        self._allocate_sample_buffers()
+        self.warmup_model()
+        self.allocate_kv_cache()
+        if not self.enforce_eager:
+            self.capture_cudagraph()
+        # Restore the process-wide defaults changed above.
+        torch.set_default_device("cpu")
+        torch.set_default_dtype(default_dtype)
+        if self.world_size > 1:
+            if rank == 0:
+                # Rank 0 creates the shared-memory segment; the barrier makes
+                # the other ranks wait until it exists before attaching.
+                self.shm = SharedMemory(name="nanovllm", create=True, size=2**20)
+                dist.barrier()
+            else:
+                dist.barrier()
+                self.shm = SharedMemory(name="nanovllm")
+                # Worker ranks serve commands from shared memory from here on.
+                self.loop()
+    def _allocate_sample_buffers(self):
+        """Pre-allocate reusable buffers for sampling to avoid repeated tensor creation."""
+        max_bs = self.config.max_num_seqs
+        max_tokens = self.config.max_num_batched_tokens
+        max_num_blocks = (self.config.max_model_len + self.block_size - 1) // self.block_size
+        # Pre-allocate pinned memory buffers on CPU for fast transfer
+        # Must explicitly specify device="cpu" since default device may be "cuda"
+        self._cpu_temperatures = torch.zeros(max_bs, dtype=torch.float32, device="cpu", pin_memory=True)
+        self._cpu_cfg_scales = torch.zeros(max_bs, dtype=torch.float32, device="cpu", pin_memory=True)
+        self._cpu_top_ks = torch.zeros(max_bs, dtype=torch.int32, device="cpu", pin_memory=True)
+        self._cpu_top_ps = torch.zeros(max_bs, dtype=torch.float32, device="cpu", pin_memory=True)
+        self._cpu_repetition_penalties = torch.zeros(max_bs, dtype=torch.float32, device="cpu", pin_memory=True)
+        # Pre-allocate decode buffers on CPU with pinned memory
+        self._cpu_input_ids = torch.zeros(max_bs, dtype=torch.int64, device="cpu", pin_memory=True)
+        self._cpu_positions = torch.zeros(max_bs, dtype=torch.int64, device="cpu", pin_memory=True)
+        self._cpu_slot_mapping = torch.zeros(max_bs, dtype=torch.int32, device="cpu", pin_memory=True)
+        self._cpu_context_lens = torch.zeros(max_bs, dtype=torch.int32, device="cpu", pin_memory=True)
+        # Pre-allocate prefill buffers on CPU with pinned memory (optimization to avoid repeated tensor creation)
+        self._cpu_prefill_input_ids = torch.zeros(max_tokens, dtype=torch.int64, device="cpu", pin_memory=True)
+        self._cpu_prefill_positions = torch.zeros(max_tokens, dtype=torch.int64, device="cpu", pin_memory=True)
+        self._cpu_prefill_cu_seqlens = torch.zeros(max_bs + 1, dtype=torch.int32, device="cpu", pin_memory=True)
+        self._cpu_prefill_slot_mapping = torch.zeros(max_tokens, dtype=torch.int32, device="cpu", pin_memory=True)
+        # Pre-allocate block tables buffer (shared by both decode and prefill)
+        self._cpu_block_tables = torch.zeros(max_bs, max_num_blocks, dtype=torch.int32, device="cpu", pin_memory=True)
+        # Pre-allocate buffer for sequence token IDs (used in logits processor and sampler)
+        # Max length is max_model_len since sequences can be that long
+        self._seq_token_ids_buffer = torch.zeros(max_bs, self.config.max_model_len, dtype=torch.int64, device="cpu", pin_memory=True)
+    def exit(self):
+        """Release shared memory, captured CUDA graphs and the process group."""
+        if self.world_size > 1:
+            self.shm.close()
+            # All ranks close their handle before rank 0 removes the segment.
+            dist.barrier()
+            if self.rank == 0:
+                self.shm.unlink()
+        if not self.enforce_eager:
+            # These attributes are only created by capture_cudagraph().
+            del self.graphs, self.graph_pool
+        torch.cuda.synchronize()
+        dist.destroy_process_group()
+    def loop(self):
+        while True:
+            method_name, args = self.read_shm()
+            self.call(method_name, *args)
+            if method_name == "exit":
+                break
+    def read_shm(self):
+        assert self.world_size > 1 and self.rank > 0
+        self.event.wait()
+        n = int.from_bytes(self.shm.buf[0:4], "little")
+        method_name, *args = pickle.loads(self.shm.buf[4:n+4])
+        self.event.clear()
+        return method_name, args
+    def write_shm(self, method_name, *args):
+        assert self.world_size > 1 and self.rank == 0
+        data = pickle.dumps([method_name, *args])
+        n = len(data)
+        self.shm.buf[0:4] = n.to_bytes(4, "little")
+        self.shm.buf[4:n+4] = data
+        for event in self.event:
+            event.set()
+    def call(self, method_name, *args):
+        if self.world_size > 1 and self.rank == 0:
+            self.write_shm(method_name, *args)
+        method = getattr(self, method_name, None)
+        return method(*args)
+    def warmup_model(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_peak_memory_stats()
+        max_num_batched_tokens, max_model_len = self.config.max_num_batched_tokens, self.config.max_model_len
+        num_seqs = min(max_num_batched_tokens // max_model_len, self.config.max_num_seqs)
+        seqs = [Sequence([0] * max_model_len) for _ in range(num_seqs)]
+        self.run(seqs, True)
+        torch.cuda.empty_cache()
+    def allocate_kv_cache(self):
+        """Size and allocate the KV cache, then attach per-layer views to the model.
+
+        Writes the chosen block count to ``config.num_kvcache_blocks`` and
+        stores the backing tensor in ``self.kv_cache``.
+        """
+        config = self.config
+        hf_config = config.hf_config
+        free, total = torch.cuda.mem_get_info()
+        current = torch.cuda.memory_stats()["allocated_bytes.all.current"]
+        # KV heads are divided across the tensor-parallel ranks.
+        num_kv_heads = hf_config.num_key_value_heads // self.world_size
+        head_dim = getattr(hf_config, "head_dim", hf_config.hidden_size // hf_config.num_attention_heads)
+        # Use dtype instead of deprecated torch_dtype
+        config_dtype = getattr(hf_config, 'dtype', getattr(hf_config, 'torch_dtype', torch.float32))
+        # Bytes for one block across all layers; the leading 2 matches the
+        # first dimension of the cache tensor (index 0 = keys, 1 = values).
+        block_bytes = 2 * hf_config.num_hidden_layers * self.block_size * num_kv_heads * head_dim * config_dtype.itemsize
+        # Calculate available memory for KV cache
+        # After warmup_model, empty_cache has been called, so current represents model memory only
+        # Use free memory but respect the gpu_memory_utilization limit
+        target_total_usage = total * config.gpu_memory_utilization
+        available_for_kv_cache = min(free * 0.9, target_total_usage - current)
+        # Ensure we have positive memory available
+        if available_for_kv_cache <= 0:
+            available_for_kv_cache = free * 0.5  # Fallback to 50% of free memory
+        config.num_kvcache_blocks = max(1, int(available_for_kv_cache) // block_bytes)
+        # NOTE(review): max(1, ...) above guarantees at least one block, so the
+        # check below can never fire as written; decide which one is intended.
+        if config.num_kvcache_blocks <= 0:
+            raise RuntimeError(
+                f"Insufficient GPU memory for KV cache. "
+                f"Free: {free / 1024**3:.2f} GB, Current: {current / 1024**3:.2f} GB, "
+                f"Available for KV: {available_for_kv_cache / 1024**3:.2f} GB, "
+                f"Block size: {block_bytes / 1024**2:.2f} MB"
+            )
+        self.kv_cache = torch.empty(2, hf_config.num_hidden_layers, config.num_kvcache_blocks, self.block_size, num_kv_heads, head_dim)
+        # Give each module exposing k_cache/v_cache its own layer slice, in
+        # the order the modules are visited.
+        layer_id = 0
+        for module in self.model.modules():
+            if hasattr(module, "k_cache") and hasattr(module, "v_cache"):
+                module.k_cache = self.kv_cache[0, layer_id]
+                module.v_cache = self.kv_cache[1, layer_id]
+                layer_id += 1
+    def prepare_block_tables(self, seqs: list[Sequence]):
+        max_len = max(len(seq.block_table) for seq in seqs)
+        block_tables = [seq.block_table + [-1] * (max_len - len(seq.block_table)) for seq in seqs]
+        block_tables = torch.tensor(block_tables, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
+        return block_tables
+    def prepare_prefill(self, seqs: list[Sequence]):
+        input_ids = []
+        positions = []
+        cu_seqlens_q = [0]
+        cu_seqlens_k = [0]
+        max_seqlen_q = 0
+        max_seqlen_k = 0
+        slot_mapping = []
+        block_tables = None
+        for seq in seqs:
+            seqlen = len(seq)
+            input_ids.extend(seq[seq.num_cached_tokens:])
+            positions.extend(list(range(seq.num_cached_tokens, seqlen)))
+            seqlen_q = seqlen - seq.num_cached_tokens
+            seqlen_k = seqlen
+            cu_seqlens_q.append(cu_seqlens_q[-1] + seqlen_q)
+            cu_seqlens_k.append(cu_seqlens_k[-1] + seqlen_k)
+            max_seqlen_q = max(seqlen_q, max_seqlen_q)
+            max_seqlen_k = max(seqlen_k, max_seqlen_k)
+            if not seq.block_table:    # warmup
+                continue
+            for i in range(seq.num_cached_blocks, seq.num_blocks):
+                start = seq.block_table[i] * self.block_size
+                if i != seq.num_blocks - 1:
+                    end = start + self.block_size
+                else:
+                    end = start + seq.last_block_num_tokens
+                slot_mapping.extend(list(range(start, end)))
+        if cu_seqlens_k[-1] > cu_seqlens_q[-1]:    # prefix cache
+            block_tables = self.prepare_block_tables(seqs)
+        input_ids = torch.tensor(input_ids, dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
+        positions = torch.tensor(positions, dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
+        cu_seqlens_q = torch.tensor(cu_seqlens_q, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
+        cu_seqlens_k = torch.tensor(cu_seqlens_k, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
+        slot_mapping = torch.tensor(slot_mapping, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
+        set_context(True, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, slot_mapping, None, block_tables)
+        return input_ids, positions
+    def prepare_decode(self, seqs: list[Sequence]):
+        """Optimized decode preparation using pre-allocated buffers."""
+        bs = len(seqs)
+        # Use pre-allocated CPU buffers
+        for i, seq in enumerate(seqs):
+            self._cpu_input_ids[i] = seq.last_token
+            self._cpu_positions[i] = len(seq) - 1
+            self._cpu_context_lens[i] = len(seq)
+            self._cpu_slot_mapping[i] = seq.block_table[-1] * self.block_size + seq.last_block_num_tokens - 1
+        # Transfer to GPU using sliced views
+        input_ids = self._cpu_input_ids[:bs].cuda(non_blocking=True)
+        positions = self._cpu_positions[:bs].cuda(non_blocking=True)
+        slot_mapping = self._cpu_slot_mapping[:bs].cuda(non_blocking=True)
+        context_lens = self._cpu_context_lens[:bs].cuda(non_blocking=True)
+        block_tables = self.prepare_block_tables(seqs)
+        set_context(False, slot_mapping=slot_mapping, context_lens=context_lens, block_tables=block_tables)
+        return input_ids, positions
+    def prepare_sample(self, seqs: list[Sequence], is_cfg_batch: bool = False):
+        """Optimized sample preparation using pre-allocated buffers."""
+        if is_cfg_batch:
+            num_seqs = len(seqs) // 2
+            target_seqs = seqs[:num_seqs]
+        else:
+            num_seqs = len(seqs)
+            target_seqs = seqs
+        # Fill pre-allocated CPU buffers
+        top_ks_is_zero = True
+        top_ps_is_one = True
+        repetition_penalties_is_one = True
+        for i, seq in enumerate(target_seqs):
+            self._cpu_temperatures[i] = seq.temperature
+            self._cpu_cfg_scales[i] = seq.cfg_scale
+            self._cpu_top_ks[i] = seq.top_k if seq.top_k is not None else 0
+            if seq.top_k is not None and seq.top_k > 0:
+                top_ks_is_zero = False
+            self._cpu_top_ps[i] = seq.top_p if seq.top_p is not None else 1.0
+            if seq.top_p is not None and seq.top_p == 1.0:
+                top_ps_is_one = False
+            self._cpu_repetition_penalties[i] = seq.repetition_penalty if seq.repetition_penalty is not None else 1.0
+            if seq.repetition_penalty is not None and seq.repetition_penalty == 1.0:
+                repetition_penalties_is_one = False
+        # Transfer to GPU using sliced views (single batched transfer)
+        temperatures = self._cpu_temperatures[:num_seqs].cuda(non_blocking=True)
+        cfg_scales = self._cpu_cfg_scales[:num_seqs].cuda(non_blocking=True)
+        top_ks = self._cpu_top_ks[:num_seqs].cuda(non_blocking=True) if not top_ks_is_zero else None
+        top_ps = self._cpu_top_ps[:num_seqs].cuda(non_blocking=True) if not top_ps_is_one else None
+        repetition_penalties = self._cpu_repetition_penalties[:num_seqs].cuda(non_blocking=True) if not repetition_penalties_is_one else None
+        return temperatures, cfg_scales, top_ks, top_ps, repetition_penalties
+    @torch.inference_mode()
+    def run_model(self, input_ids: torch.Tensor, positions: torch.Tensor, is_prefill: bool):
+        if is_prefill or self.enforce_eager or input_ids.size(0) > 512:
+            return self.model.compute_logits(self.model(input_ids, positions))
+        else:
+            bs = input_ids.size(0)
+            context = get_context()
+            # Check if block_tables size exceeds pre-allocated buffer size
+            # This can happen when conditional and unconditional sequences have different lengths
+            # in CFG mode, causing block_tables to have more columns than expected
+            max_num_blocks = self.graph_vars["block_tables"].size(1)
+            if context.block_tables.size(1) > max_num_blocks:
+                # Fall back to eager mode when block_tables is too large for CUDA graph
+                return self.model.compute_logits(self.model(input_ids, positions))
+            # Fix: Also check if block_tables row count matches batch size
+            # Dimension mismatch can cause CUDA illegal memory access during graph replay
+            if context.block_tables.size(0) != bs:
+                # Fall back to eager mode when block_tables row count doesn't match batch size
+                return self.model.compute_logits(self.model(input_ids, positions))
+            # Fix: Verify slot_mapping and context_lens dimensions match batch size
+            if context.slot_mapping.size(0) != bs or context.context_lens.size(0) != bs:
+                # Fall back to eager mode when dimensions don't match
+                return self.model.compute_logits(self.model(input_ids, positions))
+            graph = self.graphs[next(x for x in self.graph_bs if x >= bs)]
+            graph_vars = self.graph_vars
+            graph_vars["input_ids"][:bs] = input_ids
+            graph_vars["positions"][:bs] = positions
+            graph_vars["slot_mapping"].fill_(-1)
+            graph_vars["slot_mapping"][:bs] = context.slot_mapping
+            graph_vars["context_lens"].zero_()
+            graph_vars["context_lens"][:bs] = context.context_lens
+            # Clear block_tables first to ensure no stale data from previous runs
+            graph_vars["block_tables"][:bs].fill_(-1)
+            graph_vars["block_tables"][:bs, :context.block_tables.size(1)] = context.block_tables
+            graph.replay()
+            return self.model.compute_logits(graph_vars["outputs"][:bs])
+    def run(self, seqs: list[Sequence], is_prefill: bool) -> list[int]:
+        """Run model forward and sampling. For CFG sequences, batch is structured as:
+        [cond_seq1, cond_seq2, ..., uncond_seq1, uncond_seq2, ...]
+        where uncond_seqi is the paired unconditional sequence of cond_seqi."""
+        # Check if this is a CFG batch (contains paired conditional and unconditional sequences)
+        is_cfg_batch = seqs[0].cfg_scale > 1.0 and seqs[0].paired_seq is not None
+        if is_cfg_batch:
+            # CFG batch: seqs = [cond_seq1, cond_seq2, ..., uncond_seq1, uncond_seq2, ...]
+            num_cond = len(seqs) // 2
+            cond_seqs = seqs[:num_cond]
+            # uncond_seqs = seqs[num_cond:]
+            # Prepare inputs for both conditional and unconditional (they're already in the batch)
+            input_ids, positions = (self.prepare_prefill(seqs) if is_prefill else self.prepare_decode(seqs))
+            sample_params = self.prepare_sample(seqs, is_cfg_batch=True) if self.rank == 0 else None
+            if sample_params is not None:
+                temperatures, cfg_scales, top_ks, top_ps, repetition_penalties = sample_params
+            else:
+                temperatures = cfg_scales = top_ks = top_ps = repetition_penalties = None
+            # Run model forward (processes entire batch: cond + uncond)
+            logits_all = self.run_model(input_ids, positions, is_prefill)
+            reset_context()
+            if self.rank == 0:
+                # Split logits: first half is conditional, second half is unconditional
+                logits_cond = logits_all[:num_cond]
+                logits_uncond = logits_all[num_cond:]
+                # Apply repetition penalty to conditional logits (before CFG)
+                if repetition_penalties is not None:
+                    for i, seq in enumerate(cond_seqs):
+                        penalty = repetition_penalties[i].item()
+                        if penalty != 1.0:
+                            # Only penalize completion tokens (not prompt tokens)
+                            completion_tokens = torch.tensor(seq.completion_token_ids, device=logits_cond.device)
+                            if len(completion_tokens) > 0:
+                                # Create token mask: mark tokens that appeared in completion
+                                token_mask = torch.zeros(logits_cond.shape[1], dtype=torch.bool, device=logits_cond.device)
+                                token_mask[completion_tokens] = True
+                                # Apply standard repetition penalty formula (matching transformers implementation):
+                                # For tokens in completion: if score < 0 then score * penalty, else score / penalty
+                                penalty_scores = torch.where(
+                                    logits_cond[i] < 0,
+                                    logits_cond[i] * penalty,
+                                    logits_cond[i] / penalty
+                                )
+                                # Only apply penalty to tokens that appeared in completion
+                                logits_cond[i] = torch.where(token_mask, penalty_scores, logits_cond[i])
+                # Apply CFG formula: logits_cfg = logits_uncond + cfg_scale * (logits_cond - logits_uncond)
+                cfg_scales_tensor = cfg_scales.unsqueeze(1)  # [num_cond, 1]
+                logits_cfg = logits_uncond + cfg_scales_tensor * (logits_cond - logits_uncond)
+                # Apply logits processor for constrained decoding (if any sequence has one)
+                for i, seq in enumerate(cond_seqs):
+                    if seq.logits_processor is not None:
+                        # Create input_ids tensor for this sequence
+                        seq_input_ids = torch.tensor([seq.token_ids], device=logits_cfg.device)
+                        # Apply processor to this sequence's logits
+                        logits_cfg[i:i+1] = seq.logits_processor(seq_input_ids, logits_cfg[i:i+1])
+                # Prepare input_ids for sampler (for repetition penalty, though we already applied it)
+                # cond_input_ids = torch.tensor([seq.token_ids for seq in cond_seqs], device=logits_cfg.device)
+                # Sample from CFG logits
+                token_ids_cfg = self.sampler(
+                    logits_cfg,
+                    temperatures,
+                    top_ks=top_ks if top_ks is not None else None,
+                    top_ps=top_ps if top_ps is not None else None,
+                    repetition_penalties=None,  # Already applied above
+                    # input_ids=cond_input_ids,
+                ).tolist()
+                # Update logits processor state after sampling
+                # NOTE: Only update for the first sequence since all sequences share the same processor
+                # Updating multiple times would cause duplicate state updates (e.g., codes_count += N instead of += 1)
+                if cond_seqs and cond_seqs[0].logits_processor_update_state is not None:
+                    cond_seqs[0].logits_processor_update_state(token_ids_cfg[0])
+                # Return token_ids (will be applied to both conditional and unconditional sequences)
+                return token_ids_cfg
+            else:
+                return None
+        else:
+            # Normal batch (non-CFG)
+            input_ids, positions = (self.prepare_prefill(seqs) if is_prefill
+                                   else self.prepare_decode(seqs))
+            sample_params = self.prepare_sample(seqs, is_cfg_batch=False) if self.rank == 0 else None
+            if sample_params is not None:
+                temperatures, cfg_scales, top_ks, top_ps, repetition_penalties = sample_params
+            else:
+                temperatures = cfg_scales = top_ks = top_ps = repetition_penalties = None
+            logits = self.run_model(input_ids, positions, is_prefill)
+            reset_context()
+            if self.rank == 0:
+                # Apply repetition penalty to logits
+                if repetition_penalties is not None:
+                    for i, seq in enumerate(seqs):
+                        penalty = repetition_penalties[i].item()
+                        if penalty != 1.0:
+                            # Only penalize completion tokens (not prompt tokens)
+                            completion_tokens = torch.tensor(seq.completion_token_ids, device=logits.device)
+                            if len(completion_tokens) > 0:
+                                # Create token mask: mark tokens that appeared in completion
+                                token_mask = torch.zeros(logits.shape[1], dtype=torch.bool, device=logits.device)
+                                token_mask[completion_tokens] = True
+                                # Apply standard repetition penalty formula (matching transformers implementation):
+                                # For tokens in completion: if score < 0 then score * penalty, else score / penalty
+                                penalty_scores = torch.where(
+                                    logits[i] < 0,
+                                    logits[i] * penalty,
+                                    logits[i] / penalty
+                                )
+                                # Only apply penalty to tokens that appeared in completion
+                                logits[i] = torch.where(token_mask, penalty_scores, logits[i])
+                # Apply logits processor for constrained decoding (if any sequence has one)
+                # Clone logits to avoid in-place update issues in inference mode
+                logits = logits.clone()
+                for i, seq in enumerate(seqs):
+                    if seq.logits_processor is not None:
+                        # Create input_ids tensor for this sequence
+                        seq_input_ids = torch.tensor([seq.token_ids], device=logits.device)
+                        # Apply processor to this sequence's logits (clone to avoid inference mode issues)
+                        processed = seq.logits_processor(seq_input_ids, logits[i:i+1].clone())
+                        logits[i] = processed[0]
+                # Prepare input_ids for sampler
+                # seq_input_ids = torch.tensor([seq.token_ids for seq in seqs], device=logits.device)
+                token_ids = self.sampler(
+                    logits,
+                    temperatures,
+                    top_ks=top_ks if top_ks is not None else None,
+                    top_ps=top_ps if top_ps is not None else None,
+                    repetition_penalties=None,  # Already applied above
+                    # input_ids=seq_input_ids,
+                ).tolist()
+                # Update logits processor state after sampling
+                # NOTE: Only update for the first sequence since all sequences may share the same processor
+                # (when using a single SamplingParams for batch generation)
+                # Updating multiple times would cause duplicate state updates (e.g., codes_count += N instead of += 1)
+                if seqs and seqs[0].logits_processor_update_state is not None:
+                    seqs[0].logits_processor_update_state(token_ids[0])
+                return token_ids
+            else:
+                return None
+    @torch.inference_mode()
+    def capture_cudagraph(self):
+        """Capture one decode CUDA graph per batch size in ``self.graph_bs``.
+
+        All graphs read from and write to the same static tensors, which are
+        kept in ``self.graph_vars`` so callers can fill them before replay.
+        """
+        config = self.config
+        hf_config = config.hf_config
+        max_bs = min(self.config.max_num_seqs, 512)
+        max_num_blocks = (config.max_model_len + self.block_size - 1) // self.block_size
+        # Static input/output buffers, sized for the largest batch; smaller
+        # graphs use leading slices of them.
+        input_ids = torch.zeros(max_bs, dtype=torch.int64)
+        positions = torch.zeros(max_bs, dtype=torch.int64)
+        slot_mapping = torch.zeros(max_bs, dtype=torch.int32)
+        context_lens = torch.zeros(max_bs, dtype=torch.int32)
+        block_tables = torch.zeros(max_bs, max_num_blocks, dtype=torch.int32)
+        outputs = torch.zeros(max_bs, hf_config.hidden_size)
+        # Captured sizes: 1, 2, 4, 8, then multiples of 16 up to max_bs.
+        # max_bs itself is only included when it is one of those values.
+        self.graph_bs = [1, 2, 4, 8] + list(range(16, max_bs + 1, 16))
+        self.graphs = {}
+        self.graph_pool = None
+        # Capture from the largest size down; the pool of the first captured
+        # graph is passed to every later capture.
+        for bs in reversed(self.graph_bs):
+            graph = torch.cuda.CUDAGraph()
+            set_context(False, slot_mapping=slot_mapping[:bs], context_lens=context_lens[:bs], block_tables=block_tables[:bs])
+            outputs[:bs] = self.model(input_ids[:bs], positions[:bs])    # warmup
+            with torch.cuda.graph(graph, self.graph_pool):
+                outputs[:bs] = self.model(input_ids[:bs], positions[:bs])    # capture
+            if self.graph_pool is None:
+                self.graph_pool = graph.pool()
+            self.graphs[bs] = graph
+            torch.cuda.synchronize()
+            reset_context()
+        self.graph_vars = dict(
+            input_ids=input_ids,
+            positions=positions,
+            slot_mapping=slot_mapping,
+            context_lens=context_lens,
+            block_tables=block_tables,
+            outputs=outputs,
+        )

acestep/third_parts/nano-vllm/nanovllm/engine/scheduler.py ADDED Viewed

	@@ -0,0 +1,230 @@

+from collections import deque
+from nanovllm.config import Config
+from nanovllm.engine.sequence import Sequence, SequenceStatus
+from nanovllm.engine.block_manager import BlockManager
+class Scheduler:
+    def __init__(self, config: Config):
+        self.max_num_seqs = config.max_num_seqs
+        self.max_num_batched_tokens = config.max_num_batched_tokens
+        self.eos = config.eos
+        self.block_manager = BlockManager(config.num_kvcache_blocks, config.kvcache_block_size)
+        self.waiting: deque[Sequence] = deque()
+        self.running: deque[Sequence] = deque()
+    def is_finished(self):
+        return not self.waiting and not self.running
+    def add(self, seq: Sequence):
+        self.waiting.append(seq)
+    def schedule(self) -> tuple[list[Sequence], bool]:
+        # prefill
+        scheduled_seqs = []
+        num_seqs = 0
+        num_batched_tokens = 0
+        processed_seqs = set()  # Track processed sequences to handle CFG pairs
+        while self.waiting and num_seqs < self.max_num_seqs:
+            seq = self.waiting[0]
+            # For CFG sequences, ensure conditional and unconditional are scheduled together
+            if seq.cfg_scale > 1.0 and seq.paired_seq is not None and not seq.is_unconditional:
+                # This is a conditional sequence, need to schedule its paired unconditional sequence too
+                paired_seq = seq.paired_seq
+                if paired_seq.status != SequenceStatus.WAITING:
+                    # Paired sequence not in waiting, skip this conditional sequence for now
+                    break
+                # Calculate tokens for both sequences
+                total_tokens = (len(seq) - seq.num_cached_tokens) + (len(paired_seq) - paired_seq.num_cached_tokens)
+                # FIX: Check if we have enough blocks for BOTH sequences combined
+                # The old check was wrong: it checked each sequence independently,
+                # but didn't account for the total blocks needed by both
+                total_blocks_needed = seq.num_blocks + paired_seq.num_blocks
+                can_allocate_both = len(self.block_manager.free_block_ids) >= total_blocks_needed
+                if num_batched_tokens + total_tokens > self.max_num_batched_tokens or not can_allocate_both:
+                    break
+                # Schedule both sequences: conditional first, then unconditional
+                for s in [seq, paired_seq]:
+                    num_seqs += 1
+                    self.block_manager.allocate(s)
+                    num_batched_tokens += len(s) - s.num_cached_tokens
+                    s.status = SequenceStatus.RUNNING
+                    self.waiting.remove(s)
+                    self.running.append(s)
+                    scheduled_seqs.append(s)
+                    processed_seqs.add(s.seq_id)
+            else:
+                # Normal sequence or unconditional sequence (already processed with its conditional)
+                if seq.seq_id in processed_seqs:
+                    # Skip if already processed as part of a CFG pair
+                    self.waiting.popleft()
+                    continue
+                if num_batched_tokens + len(seq) > self.max_num_batched_tokens or not self.block_manager.can_allocate(seq):
+                    break
+                num_seqs += 1
+                self.block_manager.allocate(seq)
+                num_batched_tokens += len(seq) - seq.num_cached_tokens
+                seq.status = SequenceStatus.RUNNING
+                self.waiting.popleft()
+                self.running.append(seq)
+                scheduled_seqs.append(seq)
+        if scheduled_seqs:
+            # For CFG batches, ensure conditional sequences come before their unconditional pairs
+            cfg_cond_seqs = [s for s in scheduled_seqs if s.cfg_scale > 1.0 and not s.is_unconditional]
+            cfg_uncond_seqs = [s for s in scheduled_seqs if s.is_unconditional]
+            non_cfg_seqs = [s for s in scheduled_seqs if s.cfg_scale <= 1.0]
+            # Reorder: non-CFG, then CFG conditional, then CFG unconditional
+            scheduled_seqs = non_cfg_seqs + cfg_cond_seqs + cfg_uncond_seqs
+            return scheduled_seqs, True
+        # decode
+        processed_seqs = set()
+        temp_running = list(self.running)  # Work with a copy
+        while temp_running and num_seqs < self.max_num_seqs:
+            seq = temp_running.pop(0)
+            # For CFG sequences, ensure conditional and unconditional are scheduled together
+            if seq.cfg_scale > 1.0 and seq.paired_seq is not None and not seq.is_unconditional:
+                paired_seq = seq.paired_seq
+                if paired_seq not in temp_running:
+                    # Paired sequence not available, skip for now
+                    continue
+                # Remove paired_seq from temp_running
+                temp_running.remove(paired_seq)
+                # FIX: Check if we have enough blocks for BOTH sequences to append
+                # Each sequence needs 1 block when at block boundary (len % block_size == 1)
+                block_size = self.block_manager.block_size
+                blocks_needed_seq = 1 if len(seq) % block_size == 1 else 0
+                blocks_needed_paired = 1 if len(paired_seq) % block_size == 1 else 0
+                total_blocks_needed = blocks_needed_seq + blocks_needed_paired
+                can_append_both = len(self.block_manager.free_block_ids) >= total_blocks_needed
+                if not can_append_both:
+                    # Try preempting other sequences
+                    preempted = False
+                    while not can_append_both and temp_running:
+                        other_seq = temp_running.pop(0)
+                        if other_seq != seq and other_seq != paired_seq:
+                            self.preempt(other_seq)
+                            # Recalculate with the same correct logic
+                            can_append_both = len(self.block_manager.free_block_ids) >= total_blocks_needed
+                            preempted = True
+                        else:
+                            temp_running.append(other_seq)
+                            break
+                    if not can_append_both:
+                        # Can't schedule this pair right now
+                        temp_running.append(seq)
+                        temp_running.append(paired_seq)
+                        continue
+                # Schedule both sequences
+                for s in [seq, paired_seq]:
+                    num_seqs += 1
+                    self.block_manager.may_append(s)
+                    scheduled_seqs.append(s)
+                    processed_seqs.add(s.seq_id)
+                    # Remove from actual running list if scheduled
+                    if s in self.running:
+                        self.running.remove(s)
+            else:
+                # Normal sequence or unconditional (already processed)
+                if seq.seq_id in processed_seqs:
+                    continue
+                while not self.block_manager.can_append(seq):
+                    if temp_running:
+                        other_seq = temp_running.pop(0)
+                        if other_seq != seq:
+                            self.preempt(other_seq)
+                        else:
+                            temp_running.append(other_seq)
+                            break
+                    else:
+                        self.preempt(seq)
+                        if seq in self.running:
+                            self.running.remove(seq)
+                        break
+                else:
+                    num_seqs += 1
+                    self.block_manager.may_append(seq)
+                    scheduled_seqs.append(seq)
+                    if seq in self.running:
+                        self.running.remove(seq)
+        assert scheduled_seqs
+        # For CFG batches in decode, ensure conditional sequences come before unconditional
+        cfg_cond_seqs = [s for s in scheduled_seqs if s.cfg_scale > 1.0 and not s.is_unconditional]
+        cfg_uncond_seqs = [s for s in scheduled_seqs if s.is_unconditional]
+        non_cfg_seqs = [s for s in scheduled_seqs if s.cfg_scale <= 1.0]
+        scheduled_seqs = non_cfg_seqs + cfg_cond_seqs + cfg_uncond_seqs
+        self.running.extendleft(reversed(scheduled_seqs))
+        return scheduled_seqs, False
+    def preempt(self, seq: Sequence):
+        seq.status = SequenceStatus.WAITING
+        self.block_manager.deallocate(seq)
+        self.waiting.appendleft(seq)
+    def postprocess(self, seqs: list[Sequence], token_ids: list[int]) -> list[bool]:
+        # Check if this is a CFG batch
+        is_cfg_batch = False
+        if len(seqs) > 0 and seqs[0].cfg_scale > 1.0 and seqs[0].paired_seq is not None:
+            num_cond = len(seqs) // 2
+            is_cfg_batch = (num_cond > 0 and
+                           not seqs[0].is_unconditional and
+                           seqs[num_cond].is_unconditional)
+        if is_cfg_batch:
+            # CFG batch: seqs = [cond_seq1, cond_seq2, ..., uncond_seq1, uncond_seq2, ...]
+            # token_ids correspond to conditional sequences only (sampled from CFG logits)
+            num_cond = len(seqs) // 2
+            cond_seqs = seqs[:num_cond]
+            uncond_seqs = seqs[num_cond:]
+            # Apply the same sampled token to both conditional and unconditional sequences
+            for i, (cond_seq, uncond_seq, token_id) in enumerate(zip(cond_seqs, uncond_seqs, token_ids)):
+                cond_seq.append_token(token_id)
+                uncond_seq.append_token(token_id)  # Same token for unconditional
+                # Check if either sequence is finished
+                cond_finished = ((not cond_seq.ignore_eos and token_id == self.eos) or
+                                cond_seq.num_completion_tokens == cond_seq.max_tokens)
+                uncond_finished = ((not uncond_seq.ignore_eos and token_id == self.eos) or
+                                  uncond_seq.num_completion_tokens == uncond_seq.max_tokens)
+                if cond_finished or uncond_finished:
+                    # Mark both as finished
+                    cond_seq.status = SequenceStatus.FINISHED
+                    uncond_seq.status = SequenceStatus.FINISHED
+                    self.block_manager.deallocate(cond_seq)
+                    self.block_manager.deallocate(uncond_seq)
+                    if cond_seq in self.running:
+                        self.running.remove(cond_seq)
+                    if uncond_seq in self.running:
+                        self.running.remove(uncond_seq)
+        else:
+            # Normal batch
+            for seq, token_id in zip(seqs, token_ids):
+                seq.append_token(token_id)
+                if (not seq.ignore_eos and token_id == self.eos) or seq.num_completion_tokens == seq.max_tokens:
+                    seq.status = SequenceStatus.FINISHED
+                    self.block_manager.deallocate(seq)
+                    self.running.remove(seq)

acestep/third_parts/nano-vllm/nanovllm/engine/sequence.py ADDED Viewed

	@@ -0,0 +1,96 @@

+from copy import copy
+from enum import Enum, auto
+from itertools import count
+from typing import Optional, Callable, Any
+from nanovllm.sampling_params import SamplingParams
+class SequenceStatus(Enum):
+    WAITING = auto()
+    RUNNING = auto()
+    FINISHED = auto()
+class Sequence:
+    block_size = 256
+    counter = count()
+    def __init__(self, token_ids: list[int], sampling_params = SamplingParams(), is_unconditional: bool = False, conditional_seq = None):
+        self.seq_id = next(Sequence.counter)
+        self.status = SequenceStatus.WAITING
+        self.token_ids = copy(token_ids)
+        self.last_token = token_ids[-1]
+        self.num_tokens = len(self.token_ids)
+        self.num_prompt_tokens = len(token_ids)
+        self.num_cached_tokens = 0
+        self.block_table = []
+        self.temperature = sampling_params.temperature
+        self.max_tokens = sampling_params.max_tokens
+        self.ignore_eos = sampling_params.ignore_eos
+        self.cfg_scale = sampling_params.cfg_scale
+        self.top_k = sampling_params.top_k
+        self.top_p = sampling_params.top_p
+        self.repetition_penalty = sampling_params.repetition_penalty
+        # For CFG: mark if this is an unconditional sequence
+        self.is_unconditional = is_unconditional
+        # For CFG: reference to the corresponding conditional sequence (if this is unconditional)
+        # For conditional sequences, this points to the unconditional sequence
+        self.paired_seq = conditional_seq  # For conditional seq, points to uncond; for uncond seq, points to cond
+        # For constrained decoding: logits processor and state update callback
+        self.logits_processor: Optional[Any] = sampling_params.logits_processor
+        self.logits_processor_update_state: Optional[Callable[[int], None]] = sampling_params.logits_processor_update_state
+    def __len__(self):
+        return self.num_tokens
+    def __getitem__(self, key):
+        return self.token_ids[key]
+    @property
+    def is_finished(self):
+        return self.status == SequenceStatus.FINISHED
+    @property
+    def num_completion_tokens(self):
+        return self.num_tokens - self.num_prompt_tokens
+    @property
+    def prompt_token_ids(self):
+        return self.token_ids[:self.num_prompt_tokens]
+    @property
+    def completion_token_ids(self):
+        return self.token_ids[self.num_prompt_tokens:]
+    @property
+    def num_cached_blocks(self):
+        return self.num_cached_tokens // self.block_size
+    @property
+    def num_blocks(self):
+        return (self.num_tokens + self.block_size - 1) // self.block_size
+    @property
+    def last_block_num_tokens(self):
+        return self.num_tokens - (self.num_blocks - 1) * self.block_size
+    def block(self, i):
+        assert 0 <= i < self.num_blocks
+        return self.token_ids[i*self.block_size: (i+1)*self.block_size]
+    def append_token(self, token_id: int):
+        self.token_ids.append(token_id)
+        self.last_token = token_id
+        self.num_tokens += 1
+    def __getstate__(self):
+        return (self.num_tokens, self.num_prompt_tokens, self.num_cached_tokens, self.block_table,
+                self.token_ids if self.num_completion_tokens == 0 else self.last_token)
+    def __setstate__(self, state):
+        self.num_tokens, self.num_prompt_tokens, self.num_cached_tokens, self.block_table = state[:-1]
+        if self.num_completion_tokens == 0:
+            self.token_ids = state[-1]
+        else:
+            self.last_token = state[-1]

acestep/third_parts/nano-vllm/nanovllm/layers/activation.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import torch
+from torch import nn
+import torch.nn.functional as F
+class SiluAndMul(nn.Module):
+    def __init__(self):
+        super().__init__()
+    @torch.compile
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, y = x.chunk(2, -1)
+        return F.silu(x) * y

acestep/third_parts/nano-vllm/nanovllm/layers/attention.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import torch
+from torch import nn
+import triton
+import triton.language as tl
+from flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache
+from nanovllm.utils.context import get_context
+# Triton kernel: each program instance copies one row of D key elements and
+# one row of D value elements into the caches at the row's assigned slot.
+@triton.jit
+def store_kvcache_kernel(
+    key_ptr,
+    key_stride,
+    value_ptr,
+    value_stride,
+    k_cache_ptr,
+    v_cache_ptr,
+    slot_mapping_ptr,
+    D: tl.constexpr,
+):
+    # Program index along axis 0 selects the source row.
+    idx = tl.program_id(0)
+    slot = tl.load(slot_mapping_ptr + idx)
+    # A slot of -1 means this row is not written to the cache.
+    if slot == -1: return
+    # Source rows start at idx * stride and span D consecutive elements.
+    key_offsets = idx * key_stride + tl.arange(0, D)
+    value_offsets = idx * value_stride + tl.arange(0, D)
+    key = tl.load(key_ptr + key_offsets)
+    value = tl.load(value_ptr + value_offsets)
+    # Destination: each cache slot is addressed as D consecutive elements.
+    cache_offsets = slot * D + tl.arange(0, D)
+    tl.store(k_cache_ptr + cache_offsets, key)
+    tl.store(v_cache_ptr + cache_offsets, value)
+def store_kvcache(key: torch.Tensor, value: torch.Tensor, k_cache: torch.Tensor, v_cache: torch.Tensor, slot_mapping: torch.Tensor):
+    N, num_heads, head_dim = key.shape
+    D = num_heads * head_dim
+    assert key.stride(-1) == 1 and value.stride(-1) == 1
+    assert key.stride(1) == head_dim and value.stride(1) == head_dim
+    assert k_cache.stride(1) == D and v_cache.stride(1) == D
+    assert slot_mapping.numel() == N
+    store_kvcache_kernel[(N,)](key, key.stride(0), value, value.stride(0), k_cache, v_cache, slot_mapping, D)
+class Attention(nn.Module):
+    def __init__(
+        self,
+        num_heads,
+        head_dim,
+        scale,
+        num_kv_heads,
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.scale = scale
+        self.num_kv_heads = num_kv_heads
+        self.k_cache = self.v_cache = torch.tensor([])
+    def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
+        context = get_context()
+        k_cache, v_cache = self.k_cache, self.v_cache
+        if k_cache.numel() and v_cache.numel():
+            store_kvcache(k, v, k_cache, v_cache, context.slot_mapping)
+        if context.is_prefill:
+            if context.block_tables is not None:    # prefix cache
+                k, v = k_cache, v_cache
+            o = flash_attn_varlen_func(q, k, v,
+                                       max_seqlen_q=context.max_seqlen_q, cu_seqlens_q=context.cu_seqlens_q,
+                                       max_seqlen_k=context.max_seqlen_k, cu_seqlens_k=context.cu_seqlens_k,
+                                       softmax_scale=self.scale, causal=True, block_table=context.block_tables)
+        else:    # decode
+            o = flash_attn_with_kvcache(q.unsqueeze(1), k_cache, v_cache,
+                                        cache_seqlens=context.context_lens, block_table=context.block_tables,
+                                        softmax_scale=self.scale, causal=True)
+        return o

acestep/third_parts/nano-vllm/nanovllm/layers/embed_head.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import torch
+from torch import nn
+import torch.nn.functional as F
+import torch.distributed as dist
+from nanovllm.utils.context import get_context
+class VocabParallelEmbedding(nn.Module):
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+    ):
+        super().__init__()
+        self.tp_rank = dist.get_rank()
+        self.tp_size = dist.get_world_size()
+        assert num_embeddings % self.tp_size == 0
+        self.num_embeddings = num_embeddings
+        self.num_embeddings_per_partition = self.num_embeddings // self.tp_size
+        self.vocab_start_idx = self.num_embeddings_per_partition * self.tp_rank
+        self.vocab_end_idx = self.vocab_start_idx + self.num_embeddings_per_partition
+        self.weight = nn.Parameter(torch.empty(self.num_embeddings_per_partition, embedding_dim))
+        self.weight.weight_loader = self.weight_loader
+    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
+        param_data = param.data
+        shard_size = param_data.size(0)
+        start_idx = self.tp_rank * shard_size
+        loaded_weight = loaded_weight.narrow(0, start_idx, shard_size)
+        param_data.copy_(loaded_weight)
+    def forward(self, x: torch.Tensor):
+        if self.tp_size > 1:
+            mask = (x >= self.vocab_start_idx) & (x < self.vocab_end_idx)
+            x = mask * (x - self.vocab_start_idx)
+        y = F.embedding(x, self.weight)
+        if self.tp_size > 1:
+            y = mask.unsqueeze(1) * y
+            dist.all_reduce(y)
+        return y
+class ParallelLMHead(VocabParallelEmbedding):
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        bias: bool = False,
+    ):
+        assert not bias
+        super().__init__(num_embeddings, embedding_dim)
+    def forward(self, x: torch.Tensor):
+        context = get_context()
+        if context.is_prefill:
+            last_indices = context.cu_seqlens_q[1:] - 1
+            x = x[last_indices].contiguous()
+        logits = F.linear(x, self.weight)
+        if self.tp_size > 1:
+            all_logits = [torch.empty_like(logits) for _ in range(self.tp_size)] if self.tp_rank == 0 else None
+            dist.gather(logits, all_logits, 0)
+            logits = torch.cat(all_logits, -1) if self.tp_rank == 0 else None
+        return logits

acestep/third_parts/nano-vllm/nanovllm/layers/layernorm.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import torch
+from torch import nn
+class RMSNorm(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+    @torch.compile
+    def rms_forward(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        orig_dtype = x.dtype
+        x = x.float()
+        var = x.pow(2).mean(dim=-1, keepdim=True)
+        x.mul_(torch.rsqrt(var + self.eps))
+        x = x.to(orig_dtype).mul_(self.weight)
+        return x
+    @torch.compile
+    def add_rms_forward(
+        self,
+        x: torch.Tensor,
+        residual: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        orig_dtype = x.dtype
+        x = x.float().add_(residual.float())
+        residual = x.to(orig_dtype)
+        var = x.pow(2).mean(dim=-1, keepdim=True)
+        x.mul_(torch.rsqrt(var + self.eps))
+        x = x.to(orig_dtype).mul_(self.weight)
+        return x, residual
+    def forward(
+        self,
+        x: torch.Tensor,
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        if residual is None:
+            return self.rms_forward(x)
+        else:
+            return self.add_rms_forward(x, residual)

acestep/third_parts/nano-vllm/nanovllm/layers/linear.py ADDED Viewed

	@@ -0,0 +1,153 @@

+import torch
+from torch import nn
+import torch.nn.functional as F
+import torch.distributed as dist
+def divide(numerator, denominator):
+    assert numerator % denominator == 0
+    return numerator // denominator
+class LinearBase(nn.Module):
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        bias: bool = False,
+        tp_dim: int | None = None,
+    ):
+        super().__init__()
+        self.tp_dim = tp_dim
+        self.tp_rank = dist.get_rank()
+        self.tp_size = dist.get_world_size()
+        self.weight = nn.Parameter(torch.empty(output_size, input_size))
+        self.weight.weight_loader = self.weight_loader
+        if bias:
+            self.bias = nn.Parameter(torch.empty(output_size))
+            self.bias.weight_loader = self.weight_loader
+        else:
+            self.register_parameter("bias", None)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        raise NotImplementedError
+class ReplicatedLinear(LinearBase):
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        bias: bool = False,
+    ):
+        super().__init__(input_size, output_size, bias)
+    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
+        param.data.copy_(loaded_weight)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return F.linear(x, self.weight, self.bias)
+class ColumnParallelLinear(LinearBase):
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        bias: bool = False,
+    ):
+        tp_size = dist.get_world_size()
+        super().__init__(input_size, divide(output_size, tp_size), bias, 0)
+    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
+        param_data = param.data
+        shard_size = param_data.size(self.tp_dim)
+        start_idx = self.tp_rank * shard_size
+        loaded_weight = loaded_weight.narrow(self.tp_dim, start_idx, shard_size)
+        param_data.copy_(loaded_weight)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return F.linear(x, self.weight, self.bias)
+class MergedColumnParallelLinear(ColumnParallelLinear):
+    def __init__(
+        self,
+        input_size: int,
+        output_sizes: list[int],
+        bias: bool = False,
+    ):
+        self.output_sizes = output_sizes
+        super().__init__(input_size, sum(output_sizes), bias)
+    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, loaded_shard_id: int):
+        param_data = param.data
+        shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size
+        shard_size = self.output_sizes[loaded_shard_id] // self.tp_size
+        param_data = param_data.narrow(self.tp_dim, shard_offset, shard_size)
+        loaded_weight = loaded_weight.chunk(self.tp_size, self.tp_dim)[self.tp_rank]
+        param_data.copy_(loaded_weight)
+class QKVParallelLinear(ColumnParallelLinear):
+    def __init__(
+        self,
+        hidden_size: int,
+        head_size: int,
+        total_num_heads: int,
+        total_num_kv_heads: int | None = None,
+        bias: bool = False,
+    ):
+        tp_size = dist.get_world_size()
+        total_num_kv_heads = total_num_kv_heads or total_num_heads
+        self.head_size = head_size
+        self.num_heads = divide(total_num_heads, tp_size)
+        self.num_kv_heads = divide(total_num_kv_heads, tp_size)
+        output_size = (total_num_heads + 2 * total_num_kv_heads) * self.head_size
+        super().__init__(hidden_size, output_size, bias)
+    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, loaded_shard_id: str):
+        param_data = param.data
+        assert loaded_shard_id in ["q", "k", "v"]
+        if loaded_shard_id == "q":
+            shard_size = self.num_heads * self.head_size
+            shard_offset = 0
+        elif loaded_shard_id == "k":
+            shard_size = self.num_kv_heads * self.head_size
+            shard_offset = self.num_heads * self.head_size
+        else:
+            shard_size = self.num_kv_heads * self.head_size
+            shard_offset = self.num_heads * self.head_size + self.num_kv_heads * self.head_size
+        param_data = param_data.narrow(self.tp_dim, shard_offset, shard_size)
+        loaded_weight = loaded_weight.chunk(self.tp_size, self.tp_dim)[self.tp_rank]
+        param_data.copy_(loaded_weight)
+class RowParallelLinear(LinearBase):
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        bias: bool = False,
+    ):
+        tp_size = dist.get_world_size()
+        super().__init__(divide(input_size, tp_size), output_size, bias, 1)
+    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
+        param_data = param.data
+        shard_size = param_data.size(self.tp_dim)
+        start_idx = self.tp_rank * shard_size
+        loaded_weight = loaded_weight.narrow(self.tp_dim, start_idx, shard_size)
+        param_data.copy_(loaded_weight)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        y = F.linear(x, self.weight, self.bias if self.tp_rank == 0 else None)
+        if self.tp_size > 1:
+            dist.all_reduce(y)
+        return y

acestep/third_parts/nano-vllm/nanovllm/layers/rotary_embedding.py ADDED Viewed

	@@ -0,0 +1,61 @@

+from functools import lru_cache
+import torch
+from torch import nn
+def apply_rotary_emb(
+    x: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+) -> torch.Tensor:
+    x1, x2 = torch.chunk(x.float(), 2, dim=-1)
+    y1 = x1 * cos - x2 * sin
+    y2 = x2 * cos + x1 * sin
+    return torch.cat((y1, y2), dim=-1).to(x.dtype)
+class RotaryEmbedding(nn.Module):
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: float,
+    ) -> None:
+        super().__init__()
+        self.head_size = head_size
+        assert rotary_dim == head_size
+        inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2, dtype=torch.float) / rotary_dim))
+        t = torch.arange(max_position_embeddings, dtype=torch.float)
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        cos = freqs.cos()
+        sin = freqs.sin()
+        cache = torch.cat((cos, sin), dim=-1).unsqueeze_(1)
+        self.register_buffer("cos_sin_cache", cache, persistent=False)
+    @torch.compile
+    def forward(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        cos_sin = self.cos_sin_cache[positions]
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        query = apply_rotary_emb(query, cos, sin)
+        key = apply_rotary_emb(key, cos, sin)
+        return query, key
+@lru_cache(1)
+def get_rope(
+    head_size: int,
+    rotary_dim: int,
+    max_position: int,
+    base: float,
+    rope_scaling: dict | None = None,
+):
+    assert rope_scaling is None
+    rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base)
+    return rotary_emb

acestep/third_parts/nano-vllm/nanovllm/layers/sampler.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import torch
+from torch import nn
+from typing import Optional
+def apply_top_k_top_p(
+    logits: torch.Tensor,
+    k: Optional[torch.Tensor],
+    p: Optional[torch.Tensor],
+) -> torch.Tensor:
+    """Apply top-k and top-p masks to the logits (vLLM style).
+    The logits tensor is updated in-place.
+    """
+    if p is None:
+        if k is None:
+            return logits
+        # Avoid sorting vocab for top-k only case
+        return apply_top_k_only(logits, k)
+    # Need to sort for top-p
+    logits_sort, logits_idx = logits.sort(dim=-1, descending=False)
+    if k is not None:
+        # Apply top-k first
+        vocab_size = logits_sort.size(1)
+        # Clamp k to valid range
+        k_clamped = k.clamp(1, vocab_size).long()
+        top_k_mask_idx = vocab_size - k_clamped  # shape: [B]
+        # Get the threshold value for each batch
+        top_k_thresh = logits_sort.gather(1, top_k_mask_idx.unsqueeze(1))
+        top_k_mask = logits_sort < top_k_thresh
+        logits_sort.masked_fill_(top_k_mask, float('-inf'))
+    # Apply top-p
+    probs_sort = logits_sort.softmax(dim=-1)
+    probs_sum = torch.cumsum(probs_sort, dim=-1, out=probs_sort)  # reuse buffer
+    top_p_mask = probs_sum <= (1.0 - p.unsqueeze(1))
+    # Ensure at least one token is kept
+    top_p_mask[:, -1] = False
+    logits_sort.masked_fill_(top_p_mask, float('-inf'))
+    # Re-sort back to original positions
+    logits.scatter_(dim=-1, index=logits_idx, src=logits_sort)
+    return logits
+def apply_top_k_only(
+    logits: torch.Tensor,
+    k: torch.Tensor,
+) -> torch.Tensor:
+    """Apply top-k mask without sorting the entire vocab (vLLM style).
+    This is much faster than sorting for top-k only cases.
+    The logits tensor is updated in-place.
+    """
+    vocab_size = logits.shape[1]
+    # Handle cases where k >= vocab_size (no filtering needed)
+    no_top_k_mask = (k <= 0) | (k >= vocab_size)
+    # Set invalid k to 1 so we can still gather
+    k_safe = k.masked_fill(no_top_k_mask, 1).long()
+    # NOTE: This int() causes CPU-GPU sync, but torch.topk requires Python int
+    max_top_k = int(k_safe.max().clamp(max=vocab_size))
+    # Get top-k values for all batches
+    # topk.values has shape [batch_size, max_top_k]
+    topk_values = logits.topk(max_top_k, dim=1).values
+    # Convert k to 0-based index: we want the k-th largest value (index k-1)
+    # Clamp to valid range for gather
+    k_index = (k_safe - 1).clamp(0, max_top_k - 1).unsqueeze(1)  # shape: [B, 1]
+    # Gather the threshold value (the k-th largest)
+    top_k_thresh = topk_values.gather(1, k_index)
+    # For rows with no top-k filtering, set threshold to -inf so nothing gets masked
+    top_k_thresh.masked_fill_(no_top_k_mask.unsqueeze(1), float('-inf'))
+    # Mask all values below the threshold
+    logits.masked_fill_(logits < top_k_thresh, float('-inf'))
+    return logits
+class Sampler(nn.Module):
+    def __init__(self):
+        super().__init__()
+    @torch.compile
+    def forward(
+        self,
+        logits: torch.Tensor,
+        temperatures: torch.Tensor,
+        top_ks: Optional[torch.Tensor] = None,
+        top_ps: Optional[torch.Tensor] = None,
+        repetition_penalties: Optional[torch.Tensor] = None,
+        input_ids: Optional[torch.Tensor] = None,
+    ):
+        """
+        Sample tokens from logits with optional top-k and top-p filtering.
+        Condition checking is done OUTSIDE the compiled function to avoid
+        graph breaks from .any() calls.
+        """
+        # Apply temperature
+        logits = logits.float().div_(temperatures.unsqueeze(dim=1))
+        logits = apply_top_k_top_p(
+            logits,
+            top_ks,
+            top_ps,
+        )
+        probs = torch.softmax(logits, dim=-1)
+        sample_tokens = probs.div_(torch.empty_like(probs).exponential_(1).clamp_min_(1e-10)).argmax(dim=-1)
+        return sample_tokens

acestep/third_parts/nano-vllm/nanovllm/llm.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from nanovllm.engine.llm_engine import LLMEngine
+class LLM(LLMEngine):
+    pass