mrfakename committed on Jan 29

Commit

9f5c8f7

verified ·

1 Parent(s): f0bb57d

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +6 -0
code/.env.example +4 -0
code/.gitignore +228 -0
code/LICENSE +21 -0
code/README.md +221 -0
code/acestep/__init__.py +1 -0
code/acestep/acestep_v15_pipeline.py +298 -0
code/acestep/api_server.py +1725 -0
code/acestep/audio_utils.py +327 -0
code/acestep/constants.py +109 -0
code/acestep/constrained_logits_processor.py +0 -0
code/acestep/dataset_handler.py +37 -0
code/acestep/dit_alignment_score.py +870 -0
code/acestep/genres_vocab.txt +0 -0
code/acestep/gradio_ui/__init__.py +1 -0
code/acestep/gradio_ui/events/__init__.py +1129 -0
code/acestep/gradio_ui/events/generation_handlers.py +974 -0
code/acestep/gradio_ui/events/results_handlers.py +0 -0
code/acestep/gradio_ui/events/training_handlers.py +644 -0
code/acestep/gradio_ui/i18n.py +152 -0
code/acestep/gradio_ui/i18n/en.json +243 -0
code/acestep/gradio_ui/i18n/ja.json +243 -0
code/acestep/gradio_ui/i18n/zh.json +243 -0
code/acestep/gradio_ui/interfaces/__init__.py +90 -0
code/acestep/gradio_ui/interfaces/dataset.py +101 -0
code/acestep/gradio_ui/interfaces/generation.py +766 -0
code/acestep/gradio_ui/interfaces/result.py +552 -0
code/acestep/gradio_ui/interfaces/training.py +562 -0
code/acestep/handler.py +0 -0
code/acestep/inference.py +1164 -0
code/acestep/llm_inference.py +0 -0
code/acestep/local_cache.py +129 -0
code/acestep/test_time_scaling.py +410 -0
code/acestep/third_parts/nano-vllm/LICENSE +21 -0
code/acestep/third_parts/nano-vllm/README.md +66 -0
code/acestep/third_parts/nano-vllm/assets/logo.png +3 -0
code/acestep/third_parts/nano-vllm/bench.py +32 -0
code/acestep/third_parts/nano-vllm/example.py +33 -0
code/acestep/third_parts/nano-vllm/nanovllm/__init__.py +2 -0
code/acestep/third_parts/nano-vllm/nanovllm/config.py +26 -0
code/acestep/third_parts/nano-vllm/nanovllm/engine/block_manager.py +112 -0
code/acestep/third_parts/nano-vllm/nanovllm/engine/llm_engine.py +124 -0
code/acestep/third_parts/nano-vllm/nanovllm/engine/model_runner.py +529 -0
code/acestep/third_parts/nano-vllm/nanovllm/engine/scheduler.py +222 -0
code/acestep/third_parts/nano-vllm/nanovllm/engine/sequence.py +96 -0
code/acestep/third_parts/nano-vllm/nanovllm/layers/activation.py +14 -0
code/acestep/third_parts/nano-vllm/nanovllm/layers/attention.py +75 -0
code/acestep/third_parts/nano-vllm/nanovllm/layers/embed_head.py +66 -0
code/acestep/third_parts/nano-vllm/nanovllm/layers/layernorm.py +50 -0
code/acestep/third_parts/nano-vllm/nanovllm/layers/linear.py +153 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+code/acestep/third_parts/nano-vllm/assets/logo.png filter=lfs diff=lfs merge=lfs -text
+code/assets/ACE-Step_framework.png filter=lfs diff=lfs merge=lfs -text
+code/assets/acestudio_logo.png filter=lfs diff=lfs merge=lfs -text
+code/assets/application_map.png filter=lfs diff=lfs merge=lfs -text
+code/assets/model_zoo.png filter=lfs diff=lfs merge=lfs -text
+code/assets/orgnization_logos.png filter=lfs diff=lfs merge=lfs -text

code/.env.example ADDED Viewed

	@@ -0,0 +1,4 @@

+ACESTEP_CONFIG_PATH=acestep-v15-turbo
+ACESTEP_LM_MODEL_PATH=acestep-5Hz-lm-1.7B
+ACESTEP_DEVICE=auto
+ACESTEP_LM_BACKEND=vllm

code/.gitignore ADDED Viewed

	@@ -0,0 +1,228 @@

+data/
+*.mp3
+*.wav
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+uv.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+#poetry.toml
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+#pdm.lock
+#pdm.toml
+.pdm-python
+.pdm-build/
+# pixi
+#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+#pixi.lock
+#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+#   in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# Abstra
+# Abstra is an AI-powered process automation framework.
+# Ignore directories containing user credentials, local state, and settings.
+# Learn more at https://abstra.io/docs
+.abstra/
+# Visual Studio Code
+#  Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+#  that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#  and can be added to the global gitignore or merged into this file. However, if you prefer,
+#  you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+# Ruff stuff:
+.ruff_cache/
+# PyPI configuration file
+.pypirc
+# Cursor
+#  Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+#  exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+#  refer to https://docs.cursor.com/context/ignore-files
+.cursorignore
+.cursorindexingignore
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
+tests/
+checkpoints/
+playground.ipynb
+.history/
+upload_checkpoints.sh
+checkpoints.7z
+README_old.md
+discord_bot/
+feishu_bot/
+tmp*
+torchinductor_root/
+scripts/
+checkpoints_legacy/
+lora_output/
+datasets/
+python_embeded/
+checkpoints_pack/

code/LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2026 ACEStep
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

code/README.md ADDED Viewed

	@@ -0,0 +1,221 @@

+<h1 align="center">ACE-Step 1.5</h1>
+<h1 align="center">Pushing the Boundaries of Open-Source Music Generation</h1>
+<p align="center">
+    <a href="https://ace-step.github.io/ace-step-v1.5.github.io/">Project</a> |
+    <a href="https://huggingface.co/collections/ACE-Step/ace-step-15">Hugging Face</a> |
+    <a href="https://modelscope.cn/models/ACE-Step/ACE-Step-v1-5">ModelScope</a> |
+    <a href="https://huggingface.co/spaces/ACE-Step/Ace-Step-v1.5">Space Demo</a> |
+    <a href="https://discord.gg/PeWDxrkdj7">Discord</a> |
+    <a href="https://arxiv.org/abs/2506.00045">Technical Report</a>
+</p>
+<p align="center">
+    <img src="./assets/orgnization_logos.png" width="100%" alt="Organization Logos">
+</p>
+## Table of Contents
+- [✨ Features](#-features)
+- [📦 Installation](#-installation)
+- [🚀 Usage](#-usage)
+- [🔨 Train](#-train)
+- [🏗️ Architecture](#️-architecture)
+- [🦁 Model Zoo](#-model-zoo)
+## 📝 Abstract
+🚀 We present ACE-Step v1.5, a highly efficient open-source music foundation model that brings commercial-grade generation to consumer hardware. On commonly used evaluation metrics, ACE-Step v1.5 achieves quality beyond most commercial music models while remaining extremely fast—under 2 seconds per full song on an A100 and under 10 seconds on an RTX 3090. The model runs locally with less than 4GB of VRAM, and supports lightweight personalization: users can train a LoRA from just a few songs to capture their own style.
+🌉 At its core lies a novel hybrid architecture where the Language Model (LM) functions as an omni-capable planner: it transforms simple user queries into comprehensive song blueprints—scaling from short loops to 10-minute compositions—while synthesizing metadata, lyrics, and captions via Chain-of-Thought to guide the Diffusion Transformer (DiT). ⚡ Uniquely, this alignment is achieved through intrinsic reinforcement learning relying solely on the model's internal mechanisms, thereby eliminating the biases inherent in external reward models or human preferences. 🎚️
+🔮 Beyond standard synthesis, ACE-Step v1.5 unifies precise stylistic control with versatile editing capabilities—such as cover generation, repainting, and vocal-to-BGM conversion—while maintaining strict adherence to prompts across 50+ languages. This paves the way for powerful tools that seamlessly integrate into the creative workflows of music artists, producers, and content creators. 🎸
+## ✨ Features
+<p align="center">
+    <img src="./assets/application_map.png" width="100%" alt="ACE-Step Application Map">
+</p>
+### ⚡ Performance
+- ✅ **Ultra-Fast Generation** — Under 2s per full song on A100, under 10s on RTX 3090 (0.5s to 10s on A100 depending on think mode & diffusion steps)
+- ✅ **Flexible Duration** — Supports 10 seconds to 10 minutes (600s) audio generation
+- ✅ **Batch Generation** — Generate up to 8 songs simultaneously
+### 🎵 Generation Quality
+- ✅ **Commercial-Grade Output** — Quality beyond most commercial music models (between Suno v4.5 and Suno v5)
+- ✅ **Rich Style Support** — 1000+ instruments and styles with fine-grained timbre description
+- ✅ **Multi-Language Lyrics** — Supports 50+ languages with lyrics prompt for structure & style control
+### 🎛️ Versatility & Control
+| Feature | Description |
+|---------|-------------|
+| ✅ Reference Audio Input | Use reference audio to guide generation style |
+| ✅ Cover Generation | Create covers from existing audio |
+| ✅ Repaint & Edit | Selective local audio editing and regeneration |
+| ✅ Track Separation | Separate audio into individual stems |
+| ✅ Multi-Track Generation | Add layers like Suno Studio's "Add Layer" feature |
+| ✅ Vocal2BGM | Auto-generate accompaniment for vocal tracks |
+| ✅ Metadata Control | Control duration, BPM, key/scale, time signature |
+| ✅ Simple Mode | Generate full songs from simple descriptions |
+| ✅ Query Rewriting | Auto LM expansion of tags and lyrics |
+| ✅ Audio Understanding | Extract BPM, key/scale, time signature & caption from audio |
+| ✅ LRC Generation | Auto-generate lyric timestamps for generated music |
+| ✅ LoRA Training | One-click annotation & training in Gradio. 8 songs, 1 hour on 3090 (12GB VRAM) |
+| ✅ Quality Scoring | Automatic quality assessment for generated audio |
+## 📦 Installation
+> **Requirements:** Python 3.11, CUDA GPU recommended (works on CPU/MPS but slower)
+### 1. Install uv (Package Manager)
+```bash
+# macOS / Linux
+curl -LsSf https://astral.sh/uv/install.sh | sh
+# Windows (PowerShell)
+powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
+```
+### 2. Clone & Install
+```bash
+git clone https://github.com/ACE-Step/ACE-Step-1.5.git
+cd ACE-Step-1.5
+uv sync
+```
+### 3. Launch
+#### 🖥️ Gradio Web UI (Recommended)
+```bash
+uv run acestep
+```
+Open http://localhost:7860 in your browser. Models will be downloaded automatically on first run.
+#### 🌐 REST API Server
+```bash
+uv run acestep-api
+```
+API runs at http://localhost:8001. See [API Documentation](./docs/en/API.md) for endpoints.
+### Command Line Options
+**Gradio UI (`acestep`):**
+| Option | Default | Description |
+|--------|---------|-------------|
+| `--port` | 7860 | Server port |
+| `--server-name` | 127.0.0.1 | Server address (use `0.0.0.0` for network access) |
+| `--share` | false | Create public Gradio link |
+| `--language` | en | UI language: `en`, `zh`, `ja` |
+| `--init_service` | false | Auto-initialize models on startup |
+| `--config_path` | auto | DiT model (e.g., `acestep-v15-turbo`, `acestep-v15-turbo-shift3`) |
+| `--lm_model_path` | auto | LM model (e.g., `acestep-5Hz-lm-0.6B`, `acestep-5Hz-lm-1.7B`) |
+| `--offload_to_cpu` | auto | CPU offload (auto-enabled if VRAM < 16GB) |
+**Examples:**
+```bash
+# Public access with Chinese UI
+uv run acestep --server-name 0.0.0.0 --share --language zh
+# Pre-initialize models on startup
+uv run acestep --init_service true --config_path acestep-v15-turbo
+```
+### Development
+```bash
+# Add dependencies
+uv add package-name
+uv add --dev package-name
+# Update all dependencies
+uv sync --upgrade
+```
+## 🚀 Usage
+We provide multiple ways to use ACE-Step:
+| Method | Description | Documentation |
+|--------|-------------|---------------|
+| 🖥️ **Gradio Web UI** | Interactive web interface for music generation | [Gradio Guide](./docs/en/GRADIO_GUIDE.md) |
+| 🐍 **Python API** | Programmatic access for integration | [Inference API](./docs/en/INFERENCE.md) |
+| 🌐 **REST API** | HTTP-based async API for services | [REST API](./docs/en/API.md) |
+**📚 Documentation available in:** [English](./docs/en/) | [中文](./docs/zh/) | [日本語](./docs/ja/)
+## 🔨 Train
+See the **LoRA Training** tab in Gradio UI for one-click training, or check [Gradio Guide - LoRA Training](./docs/en/GRADIO_GUIDE.md#lora-training) for details.
+## 🏗️ Architecture
+<p align="center">
+    <img src="./assets/ACE-Step_framework.png" width="100%" alt="ACE-Step Framework">
+</p>
+## 🦁 Model Zoo
+<p align="center">
+    <img src="./assets/model_zoo.png" width="100%" alt="Model Zoo">
+</p>
+### DiT Models
+| DiT Model | Pre-Training | SFT | RL | CFG | Step | Refer audio | Text2Music | Cover | Repaint | Extract | Lego | Complete | Quality | Diversity | Fine-Tunability | Hugging Face |
+|-----------|:------------:|:---:|:--:|:---:|:----:|:-----------:|:----------:|:-----:|:-------:|:-------:|:----:|:--------:|:-------:|:---------:|:---------------:|--------------|
+| `acestep-v15-base` | ✅ | ❌ | ❌ | ✅ | 50 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | Medium | High | Easy | [Link](https://huggingface.co/ACE-Step/acestep-v15-base) |
+| `acestep-v15-sft` | ✅ | ✅ | ❌ | ✅ | 50 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | High | Medium | Easy | [Link](https://huggingface.co/ACE-Step/acestep-v15-sft) |
+| `acestep-v15-turbo` | ✅ | ✅ | ❌ | ❌ | 8 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | Very High | Medium | Medium | [Link](https://huggingface.co/ACE-Step/Ace-Step1.5) |
+| `acestep-v15-turbo-rl` | ✅ | ✅ | ✅ | ❌ | 8 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | Very High | Medium | Medium | To be released |
+### LM Models
+| LM Model | Pretrain from | Pre-Training | SFT | RL | CoT metas | Query rewrite | Audio Understanding | Composition Capability | Copy Melody | Hugging Face |
+|----------|---------------|:------------:|:---:|:--:|:---------:|:-------------:|:-------------------:|:----------------------:|:-----------:|--------------|
+| `acestep-5Hz-lm-0.6B` | Qwen3-0.6B | ✅ | ✅ | ✅ | ✅ | ✅ | Medium | Medium | Weak | ✅ |
+| `acestep-5Hz-lm-1.7B` | Qwen3-1.7B | ✅ | ✅ | ✅ | ✅ | ✅ | Medium | Medium | Medium | ✅ |
+| `acestep-5Hz-lm-4B` | Qwen3-4B | ✅ | ✅ | ✅ | ✅ | ✅ | Strong | Strong | Strong | To be released |
+## 📜 License & Disclaimer
+This project is licensed under [MIT](./LICENSE).
+ACE-Step enables original music generation across diverse genres, with applications in creative production, education, and entertainment. While designed to support positive and artistic use cases, we acknowledge potential risks such as unintentional copyright infringement due to stylistic similarity, inappropriate blending of cultural elements, and misuse for generating harmful content. To ensure responsible use, we encourage users to verify the originality of generated works, clearly disclose AI involvement, and obtain appropriate permissions when adapting protected styles or materials. By using ACE-Step, you agree to uphold these principles and respect artistic integrity, cultural diversity, and legal compliance. The authors are not responsible for any misuse of the model, including but not limited to copyright violations, cultural insensitivity, or the generation of harmful content.
+🔔 Important Notice
+The only official website for the ACE-Step project is our GitHub Pages site.
+We do not operate any other websites.
+🚫 Fake domains include but are not limited to:
+ac\*\*p.com, a\*\*p.org, a\*\*\*c.org
+⚠️ Please be cautious. Do not visit, trust, or make payments on any of those sites.
+## 🙏 Acknowledgements
+This project is co-led by ACE Studio and StepFun.
+## 📖 Citation
+If you find this project useful for your research, please consider citing:
+```BibTeX
+@misc{gong2026acestep,
+	title={ACE-Step 1.5: Pushing the Boundaries of Open-Source Music Generation},
+	author={Junmin Gong and Yulin Song and Wenxiao Zhao and Sen Wang and Shengyuan Xu and Jing Guo},
+	howpublished={\url{https://github.com/ace-step/ACE-Step-1.5}},
+	year={2026},
+	note={GitHub repository}
+}
+```

code/acestep/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


+"""ACE-Step package."""

code/acestep/acestep_v15_pipeline.py ADDED Viewed

	@@ -0,0 +1,298 @@

+"""
+ACE-Step V1.5 Pipeline
+Handler wrapper connecting model and UI
+"""
+import os
+import sys
+# Load environment variables from .env file in project root
+# This allows configuration without hardcoding values
+# Falls back to .env.example if .env is not found
+# This block runs at import time, before the handler imports further down.
+try:
+    from dotenv import load_dotenv
+    # Get project root directory (two levels up from this file, i.e. the parent of this file's directory)
+    _current_file = os.path.abspath(__file__)
+    _project_root = os.path.dirname(os.path.dirname(_current_file))
+    _env_path = os.path.join(_project_root, '.env')
+    _env_example_path = os.path.join(_project_root, '.env.example')
+    # At most one file is loaded: .env.example is only consulted when .env does not exist.
+    # NOTE(review): load_dotenv is called without `override`, so presumably variables already
+    # set in the process environment keep their values - confirm against python-dotenv docs.
+    if os.path.exists(_env_path):
+        load_dotenv(_env_path)
+        print(f"Loaded configuration from {_env_path}")
+    elif os.path.exists(_env_example_path):
+        load_dotenv(_env_example_path)
+        print(f"Loaded configuration from {_env_example_path} (fallback)")
+except ImportError:
+    # python-dotenv not installed, skip loading .env
+    pass
+# Clear proxy settings that may affect Gradio
+# The variables are removed from this process's environment only; a missing variable is ignored.
+# NOTE(review): lowercase 'all_proxy' is not in the list, so it is left untouched if set.
+for proxy_var in ['http_proxy', 'https_proxy', 'HTTP_PROXY', 'HTTPS_PROXY', 'ALL_PROXY']:
+    os.environ.pop(proxy_var, None)
+# Import the handlers and UI factory, trying package-relative imports first.
+# Any ImportError raised by the relative imports (including one raised inside the imported
+# modules) triggers the absolute-import fallback below.
+try:
+    # When executed as a module: `python -m acestep.acestep_v15_pipeline`
+    from .handler import AceStepHandler
+    from .llm_inference import LLMHandler
+    from .dataset_handler import DatasetHandler
+    from .gradio_ui import create_gradio_interface
+except ImportError:
+    # When executed as a script: `python acestep/acestep_v15_pipeline.py`
+    # The project root is put at the front of sys.path so the `acestep` package can be
+    # imported by its absolute name.
+    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    if project_root not in sys.path:
+        sys.path.insert(0, project_root)
+    from acestep.handler import AceStepHandler
+    from acestep.llm_inference import LLMHandler
+    from acestep.dataset_handler import DatasetHandler
+    from acestep.gradio_ui import create_gradio_interface
+def create_demo(init_params=None, language='en'):
+    """
+    Create Gradio demo interface
+    Args:
+        init_params: Dictionary containing initialization parameters and state.
+                    If None, service will not be pre-initialized.
+                    Keys: 'pre_initialized' (bool), 'checkpoint', 'config_path', 'device',
+                          'init_llm', 'lm_model_path', 'backend', 'use_flash_attention',
+                          'offload_to_cpu', 'offload_dit_to_cpu', 'init_status',
+                          'dit_handler', 'llm_handler' (initialized handlers if pre-initialized),
+                          'language' (UI language code)
+        language: UI language code ('en', 'zh', 'ja', default: 'en')
+    Returns:
+        Gradio Blocks instance
+    """
+    # Use pre-initialized handlers if available, otherwise create new ones
+    if init_params and init_params.get('pre_initialized') and 'dit_handler' in init_params:
+        dit_handler = init_params['dit_handler']
+        llm_handler = init_params['llm_handler']
+    else:
+        dit_handler = AceStepHandler()  # DiT handler
+        llm_handler = LLMHandler()      # LM handler
+    dataset_handler = DatasetHandler()  # Dataset handler
+    # Create Gradio interface with all handlers and initialization parameters
+    demo = create_gradio_interface(dit_handler, llm_handler, dataset_handler, init_params=init_params, language=language)
+    return demo
+def get_gpu_memory_gb():
+    """
+    Get GPU memory in GB. Returns 0 if no GPU is available.
+    """
+    try:
+        import torch
+        if torch.cuda.is_available():
+            # Get total memory of the first GPU in GB
+            total_memory = torch.cuda.get_device_properties(0).total_memory
+            memory_gb = total_memory / (1024**3)  # Convert bytes to GB
+            return memory_gb
+        else:
+            return 0
+    except Exception as e:
+        print(f"Warning: Failed to detect GPU memory: {e}", file=sys.stderr)
+        return 0
+def main():
+    """Main entry function"""
+    import argparse
+    # Detect GPU memory to auto-configure offload settings
+    gpu_memory_gb = get_gpu_memory_gb()
+    auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < 16
+    if auto_offload:
+        print(f"Detected GPU memory: {gpu_memory_gb:.2f} GB (< 16GB)")
+        print("Auto-enabling CPU offload to reduce GPU memory usage")
+    elif gpu_memory_gb > 0:
+        print(f"Detected GPU memory: {gpu_memory_gb:.2f} GB (>= 16GB)")
+        print("CPU offload disabled by default")
+    else:
+        print("No GPU detected, running on CPU")
+    parser = argparse.ArgumentParser(description="Gradio Demo for ACE-Step V1.5")
+    parser.add_argument("--port", type=int, default=7860, help="Port to run the gradio server on")
+    parser.add_argument("--share", action="store_true", help="Create a public link")
+    parser.add_argument("--debug", action="store_true", help="Enable debug mode")
+    parser.add_argument("--server-name", type=str, default="127.0.0.1", help="Server name (default: 127.0.0.1, use 0.0.0.0 for all interfaces)")
+    parser.add_argument("--language", type=str, default="en", choices=["en", "zh", "ja"], help="UI language: en (English), zh (中文), ja (日本語)")
+    # Service mode argument
+    parser.add_argument("--service_mode", type=lambda x: x.lower() in ['true', '1', 'yes'], default=False,
+                       help="Enable service mode (default: False). When enabled, uses preset models and restricts UI options.")
+    # Service initialization arguments
+    parser.add_argument("--init_service", type=lambda x: x.lower() in ['true', '1', 'yes'], default=False, help="Initialize service on startup (default: False)")
+    parser.add_argument("--checkpoint", type=str, default=None, help="Checkpoint file path (optional, for display purposes)")
+    parser.add_argument("--config_path", type=str, default=None, help="Main model path (e.g., 'acestep-v15-turbo')")
+    parser.add_argument("--device", type=str, default="auto", choices=["auto", "cuda", "cpu"], help="Processing device (default: auto)")
+    parser.add_argument("--init_llm", type=lambda x: x.lower() in ['true', '1', 'yes'], default=True, help="Initialize 5Hz LM (default: True)")
+    parser.add_argument("--lm_model_path", type=str, default=None, help="5Hz LM model path (e.g., 'acestep-5Hz-lm-0.6B')")
+    parser.add_argument("--backend", type=str, default="vllm", choices=["vllm", "pt"], help="5Hz LM backend (default: vllm)")
+    parser.add_argument("--use_flash_attention", type=lambda x: x.lower() in ['true', '1', 'yes'], default=None, help="Use flash attention (default: auto-detect)")
+    parser.add_argument("--offload_to_cpu", type=lambda x: x.lower() in ['true', '1', 'yes'], default=auto_offload, help=f"Offload models to CPU (default: {'True' if auto_offload else 'False'}, auto-detected based on GPU VRAM)")
+    parser.add_argument("--offload_dit_to_cpu", type=lambda x: x.lower() in ['true', '1', 'yes'], default=False, help="Offload DiT to CPU (default: False)")
+    args = parser.parse_args()
+    # Service mode defaults (can be configured via .env file)
+    if args.service_mode:
+        print("Service mode enabled - applying preset configurations...")
+        # Force init_service in service mode
+        args.init_service = True
+        # Default DiT model for service mode (from env or fallback)
+        if args.config_path is None:
+            args.config_path = os.environ.get(
+                "SERVICE_MODE_DIT_MODEL",
+                "acestep-v15-turbo-fix-inst-shift-dynamic"
+            )
+        # Default LM model for service mode (from env or fallback)
+        if args.lm_model_path is None:
+            args.lm_model_path = os.environ.get(
+                "SERVICE_MODE_LM_MODEL",
+                "acestep-5Hz-lm-1.7B-v4-fix"
+            )
+        # Backend for service mode (from env or fallback to vllm)
+        args.backend = os.environ.get("SERVICE_MODE_BACKEND", "vllm")
+        print(f"  DiT model: {args.config_path}")
+        print(f"  LM model: {args.lm_model_path}")
+        print(f"  Backend: {args.backend}")
+    try:
+        init_params = None
+        # If init_service is True, perform initialization before creating UI
+        if args.init_service:
+            print("Initializing service from command line...")
+            # Create handler instances for initialization
+            dit_handler = AceStepHandler()
+            llm_handler = LLMHandler()
+            # Auto-select config_path if not provided
+            if args.config_path is None:
+                available_models = dit_handler.get_available_acestep_v15_models()
+                if available_models:
+                    args.config_path = "acestep-v15-turbo" if "acestep-v15-turbo" in available_models else available_models[0]
+                    print(f"Auto-selected config_path: {args.config_path}")
+                else:
+                    print("Error: No available models found. Please specify --config_path", file=sys.stderr)
+                    sys.exit(1)
+            # Get project root (same logic as in handler)
+            current_file = os.path.abspath(__file__)
+            project_root = os.path.dirname(os.path.dirname(current_file))
+            # Determine flash attention setting
+            use_flash_attention = args.use_flash_attention
+            if use_flash_attention is None:
+                use_flash_attention = dit_handler.is_flash_attention_available()
+            # Initialize DiT handler
+            print(f"Initializing DiT model: {args.config_path} on {args.device}...")
+            init_status, enable_generate = dit_handler.initialize_service(
+                project_root=project_root,
+                config_path=args.config_path,
+                device=args.device,
+                use_flash_attention=use_flash_attention,
+                compile_model=False,
+                offload_to_cpu=args.offload_to_cpu,
+                offload_dit_to_cpu=args.offload_dit_to_cpu
+            )
+            if not enable_generate:
+                print(f"Error initializing DiT model: {init_status}", file=sys.stderr)
+                sys.exit(1)
+            print(f"DiT model initialized successfully")
+            # Initialize LM handler if requested
+            lm_status = ""
+            if args.init_llm:
+                if args.lm_model_path is None:
+                    # Try to get default LM model
+                    available_lm_models = llm_handler.get_available_5hz_lm_models()
+                    if available_lm_models:
+                        args.lm_model_path = available_lm_models[0]
+                        print(f"Using default LM model: {args.lm_model_path}")
+                    else:
+                        print("Warning: No LM models available, skipping LM initialization", file=sys.stderr)
+                        args.init_llm = False
+                if args.init_llm and args.lm_model_path:
+                    checkpoint_dir = os.path.join(project_root, "checkpoints")
+                    print(f"Initializing 5Hz LM: {args.lm_model_path} on {args.device}...")
+                    lm_status, lm_success = llm_handler.initialize(
+                        checkpoint_dir=checkpoint_dir,
+                        lm_model_path=args.lm_model_path,
+                        backend=args.backend,
+                        device=args.device,
+                        offload_to_cpu=args.offload_to_cpu,
+                        dtype=dit_handler.dtype
+                    )
+                    if lm_success:
+                        print(f"5Hz LM initialized successfully")
+                        init_status += f"\n{lm_status}"
+                    else:
+                        print(f"Warning: 5Hz LM initialization failed: {lm_status}", file=sys.stderr)
+                        init_status += f"\n{lm_status}"
+            # Prepare initialization parameters for UI
+            init_params = {
+                'pre_initialized': True,
+                'service_mode': args.service_mode,
+                'checkpoint': args.checkpoint,
+                'config_path': args.config_path,
+                'device': args.device,
+                'init_llm': args.init_llm,
+                'lm_model_path': args.lm_model_path,
+                'backend': args.backend,
+                'use_flash_attention': use_flash_attention,
+                'offload_to_cpu': args.offload_to_cpu,
+                'offload_dit_to_cpu': args.offload_dit_to_cpu,
+                'init_status': init_status,
+                'enable_generate': enable_generate,
+                'dit_handler': dit_handler,
+                'llm_handler': llm_handler,
+                'language': args.language
+            }
+            print("Service initialization completed successfully!")
+        # Create and launch demo
+        print(f"Creating Gradio interface with language: {args.language}...")
+        demo = create_demo(init_params=init_params, language=args.language)
+        # Enable queue for multi-user support
+        # This ensures proper request queuing and prevents concurrent generation conflicts
+        print("Enabling queue for multi-user support...")
+        demo.queue(
+            max_size=20,  # Maximum queue size (adjust based on your needs)
+            status_update_rate="auto",  # Update rate for queue status
+        )
+        print(f"Launching server on {args.server_name}:{args.port}...")
+        demo.launch(
+            server_name=args.server_name,
+            server_port=args.port,
+            share=args.share,
+            debug=args.debug,
+            show_error=True,
+            prevent_thread_lock=False,  # Keep thread locked to maintain server running
+            inbrowser=False,  # Don't auto-open browser
+        )
+    except Exception as e:
+        print(f"Error launching Gradio: {e}", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

code/acestep/api_server.py ADDED Viewed

	@@ -0,0 +1,1725 @@

+"""FastAPI server for ACE-Step V1.5.
+Endpoints:
+- POST /release_task     Create music generation task
+- POST /query_result     Batch query task results
+- GET  /v1/models        List available models
+- GET  /v1/audio         Download audio file
+- GET  /health           Health check
+NOTE:
+- In-memory queue and job store -> run uvicorn with workers=1.
+"""
+from __future__ import annotations
+import asyncio
+import json
+import os
+import sys
+import time
+import traceback
+import tempfile
+import urllib.parse
+from collections import deque
+from concurrent.futures import ThreadPoolExecutor
+from contextlib import asynccontextmanager
+from dataclasses import dataclass
+from pathlib import Path
+from threading import Lock
+from typing import Any, Dict, List, Literal, Optional
+from uuid import uuid4
+try:
+    from dotenv import load_dotenv
+except ImportError:  # Optional dependency
+    load_dotenv = None  # type: ignore
+from fastapi import FastAPI, HTTPException, Request
+from pydantic import BaseModel, Field
+from starlette.datastructures import UploadFile as StarletteUploadFile
+from acestep.handler import AceStepHandler
+from acestep.llm_inference import LLMHandler
+from acestep.constants import (
+    DEFAULT_DIT_INSTRUCTION,
+    DEFAULT_LM_INSTRUCTION,
+    TASK_INSTRUCTIONS,
+)
+from acestep.inference import (
+    GenerationParams,
+    GenerationConfig,
+    generate_music,
+    create_sample,
+    format_sample,
+)
+from acestep.gradio_ui.events.results_handlers import _build_generation_info
+# =============================================================================
+# Constants
+# =============================================================================
+RESULT_KEY_PREFIX = "ace_step_v1.5_"
+RESULT_EXPIRE_SECONDS = 7 * 24 * 60 * 60  # 7 days
+TASK_TIMEOUT_SECONDS = 3600  # 1 hour
+JOB_STORE_CLEANUP_INTERVAL = 300  # 5 minutes - interval for cleaning up old jobs
+JOB_STORE_MAX_AGE_SECONDS = 86400  # 24 hours - completed jobs older than this will be cleaned
+STATUS_MAP = {"queued": 0, "running": 0, "succeeded": 1, "failed": 2}
+LM_DEFAULT_TEMPERATURE = 0.85
+LM_DEFAULT_CFG_SCALE = 2.5
+LM_DEFAULT_TOP_P = 0.9
+# Parameter aliases for request parsing
+PARAM_ALIASES = {
+    "prompt": ["prompt"],
+    "sample_mode": ["sample_mode", "sampleMode"],
+    "sample_query": ["sample_query", "sampleQuery", "description", "desc"],
+    "use_format": ["use_format", "useFormat", "format"],
+    "model": ["model", "dit_model", "ditModel"],
+    "key_scale": ["key_scale", "keyscale", "keyScale"],
+    "time_signature": ["time_signature", "timesignature", "timeSignature"],
+    "audio_duration": ["audio_duration", "duration", "audioDuration", "target_duration", "targetDuration"],
+    "vocal_language": ["vocal_language", "vocalLanguage"],
+    "inference_steps": ["inference_steps", "inferenceSteps"],
+    "guidance_scale": ["guidance_scale", "guidanceScale"],
+    "use_random_seed": ["use_random_seed", "useRandomSeed"],
+    "audio_code_string": ["audio_code_string", "audioCodeString"],
+    "audio_cover_strength": ["audio_cover_strength", "audioCoverStrength"],
+    "task_type": ["task_type", "taskType"],
+    "infer_method": ["infer_method", "inferMethod"],
+    "use_tiled_decode": ["use_tiled_decode", "useTiledDecode"],
+    "constrained_decoding": ["constrained_decoding", "constrainedDecoding", "constrained"],
+    "constrained_decoding_debug": ["constrained_decoding_debug", "constrainedDecodingDebug"],
+    "use_cot_caption": ["use_cot_caption", "cot_caption", "cot-caption"],
+    "use_cot_language": ["use_cot_language", "cot_language", "cot-language"],
+    "is_format_caption": ["is_format_caption", "isFormatCaption"],
+}
# Language hints recognised in free-text descriptions, mapped to ISO codes.
# Insertion order is significant: the first entry that matches wins.
_DESCRIPTION_LANGUAGE_HINTS = {
    'english': 'en', 'en': 'en',
    'chinese': 'zh', '中文': 'zh', 'zh': 'zh', 'mandarin': 'zh',
    'japanese': 'ja', '日本語': 'ja', 'ja': 'ja',
    'korean': 'ko', '한국어': 'ko', 'ko': 'ko',
    'spanish': 'es', 'español': 'es', 'es': 'es',
    'french': 'fr', 'français': 'fr', 'fr': 'fr',
    'german': 'de', 'deutsch': 'de', 'de': 'de',
    'italian': 'it', 'italiano': 'it', 'it': 'it',
    'portuguese': 'pt', 'português': 'pt', 'pt': 'pt',
    'russian': 'ru', 'русский': 'ru', 'ru': 'ru',
    'bengali': 'bn', 'bn': 'bn',
    'hindi': 'hi', 'hi': 'hi',
    'arabic': 'ar', 'ar': 'ar',
    'thai': 'th', 'th': 'th',
    'vietnamese': 'vi', 'vi': 'vi',
    'indonesian': 'id', 'id': 'id',
    'turkish': 'tr', 'tr': 'tr',
    'dutch': 'nl', 'nl': 'nl',
    'polish': 'pl', 'pl': 'pl',
}


def _parse_description_hints(description: str) -> tuple[Optional[str], bool]:
    """
    Parse a description string to extract language code and instrumental flag.

    This function analyzes user descriptions like "Pop rock. English" or
    "piano solo" to detect:
    - Language: maps language names to ISO codes (e.g., "English" -> "en")
    - Instrumental: detects patterns indicating instrumental/no-vocal music

    Matching rules (case-insensitive):
    - Names written entirely in a non-ASCII script (e.g. "中文", "日本語") match
      as plain substrings, because such text usually has no spaces or
      punctuation around the word for a boundary-based pattern to anchor on.
    - Names of one or two characters must be delimited by start/end of string,
      whitespace, or one of ``.,;:!?``.
    - All other names must match as whole words.

    NOTE(review): two-letter codes such as "it", "hi" or "id" are also ordinary
    English words, so "make it loud" is reported as Italian. That pre-existing
    behavior is left unchanged here.

    Args:
        description: User's natural language music description
    Returns:
        (language_code, is_instrumental) tuple:
        - language_code: ISO language code (e.g., "en", "zh") or None if not detected
        - is_instrumental: True if description indicates instrumental music
    """
    import re

    if not description:
        return None, False
    description_lower = description.lower().strip()

    # Detect language: first matching hint in table order wins.
    detected_language = None
    for lang_name, lang_code in _DESCRIPTION_LANGUAGE_HINTS.items():
        if not any(ch.isascii() for ch in lang_name):
            # Native-script name: `\b` / delimiter anchors fail inside
            # unspaced text such as "中文歌曲", so fall back to containment.
            matched = lang_name in description_lower
        elif len(lang_name) <= 2:
            pattern = r'(?:^|\s|[.,;:!?])' + re.escape(lang_name) + r'(?:$|\s|[.,;:!?])'
            matched = re.search(pattern, description_lower) is not None
        else:
            pattern = r'\b' + re.escape(lang_name) + r'\b'
            matched = re.search(pattern, description_lower) is not None
        if matched:
            detected_language = lang_code
            break

    # Detect instrumental: explicit keywords, or a description that is / ends
    # with the word "solo" (e.g. "piano solo").
    is_instrumental = (
        'instrumental' in description_lower
        or 'pure music' in description_lower
        or 'pure instrument' in description_lower
        or description_lower == 'solo'
        or description_lower.endswith(' solo')
    )
    return detected_language, is_instrumental
+JobStatus = Literal["queued", "running", "succeeded", "failed"]
# Request schema for a music generation task. Defaults declared here apply to
# any field the client omits.
class GenerateMusicRequest(BaseModel):
    prompt: str = Field(default="", description="Text prompt describing the music")
    lyrics: str = Field(default="", description="Lyric text")
    # New API semantics:
    # - thinking=True: use 5Hz LM to generate audio codes (lm-dit behavior)
    # - thinking=False: do not use LM to generate codes (dit behavior)
    # Regardless of thinking, if some metas are missing, server may use LM to fill them.
    thinking: bool = False
    # Sample-mode requests auto-generate caption/lyrics/metas via LM (no user prompt).
    sample_mode: bool = False
    # Description for sample mode: auto-generate caption/lyrics from description query
    sample_query: str = Field(default="", description="Query/description for sample mode (use create_sample)")
    # Whether to use format_sample() to enhance input caption/lyrics
    use_format: bool = Field(default=False, description="Use format_sample() to enhance input (default: False)")
    # Model name for multi-model support (select which DiT model to use)
    model: Optional[str] = Field(default=None, description="Model name to use (e.g., 'acestep-v15-turbo')")
    bpm: Optional[int] = None
    # Accept common client keys via manual parsing (see RequestParser).
    key_scale: str = ""
    time_signature: str = ""
    vocal_language: str = "en"
    inference_steps: int = 8
    guidance_scale: float = 7.0
    use_random_seed: bool = True
    seed: int = -1
    reference_audio_path: Optional[str] = None
    src_audio_path: Optional[str] = None
    audio_duration: Optional[float] = None
    batch_size: Optional[int] = None
    audio_code_string: str = ""
    repainting_start: float = 0.0
    repainting_end: Optional[float] = None
    instruction: str = DEFAULT_DIT_INSTRUCTION
    audio_cover_strength: float = 1.0
    task_type: str = "text2music"
    use_adg: bool = False
    cfg_interval_start: float = 0.0
    cfg_interval_end: float = 1.0
    infer_method: str = "ode"  # "ode" or "sde" - diffusion inference method
    shift: float = Field(
        default=3.0,
        description="Timestep shift factor (range 1.0~5.0, default 3.0). Only effective for base models, not turbo models."
    )
    timesteps: Optional[str] = Field(
        default=None,
        description="Custom timesteps (comma-separated, e.g., '0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0'). Overrides inference_steps and shift."
    )
    audio_format: str = "mp3"
    use_tiled_decode: bool = True
    # 5Hz LM (server-side): used for metadata completion and (when thinking=True) codes generation.
    lm_model_path: Optional[str] = None  # e.g. "acestep-5Hz-lm-0.6B"
    lm_backend: Literal["vllm", "pt"] = "vllm"
    constrained_decoding: bool = True
    constrained_decoding_debug: bool = False
    use_cot_caption: bool = True
    use_cot_language: bool = True
    is_format_caption: bool = False
    # LM sampling defaults come from the module-level LM_DEFAULT_* constants so
    # the schema and the constants cannot drift apart (values are unchanged).
    lm_temperature: float = LM_DEFAULT_TEMPERATURE
    lm_cfg_scale: float = LM_DEFAULT_CFG_SCALE
    lm_top_k: Optional[int] = None
    lm_top_p: Optional[float] = LM_DEFAULT_TOP_P
    lm_repetition_penalty: float = 1.0
    lm_negative_prompt: str = "NO USER INPUT"

    # NOTE(review): no field above declares an alias, so these population flags
    # have no visible effect in this class. `allow_population_by_alias` looks
    # like an older spelling of the same option - confirm against the pinned
    # pydantic version before relying on it.
    class Config:
        allow_population_by_field_name = True
        allow_population_by_alias = True
# Response schema carrying the id and initial status of a newly created task.
class CreateJobResponse(BaseModel):
    # Identifier the client uses to refer to the task afterwards.
    task_id: str
    # One of the JobStatus literals: "queued", "running", "succeeded", "failed".
    status: JobStatus
    queue_position: int = 0  # 1-based best-effort position when queued
# Result payload of a finished generation job. Every field has a default, so a
# partially populated result is still a valid instance.
class JobResult(BaseModel):
    # Convenience slots for the first two outputs; audio_paths holds the full list.
    first_audio_path: Optional[str] = None
    second_audio_path: Optional[str] = None
    audio_paths: list[str] = Field(default_factory=list)
    # Free-form text fields describing the run.
    generation_info: str = ""
    status_message: str = ""
    seed_value: str = ""
    # Metadata dictionary, plus selected metadata values as typed top-level fields.
    metas: Dict[str, Any] = Field(default_factory=dict)
    bpm: Optional[int] = None
    duration: Optional[float] = None
    genres: Optional[str] = None
    keyscale: Optional[str] = None
    timesignature: Optional[str] = None
    # Model information
    lm_model: Optional[str] = None
    dit_model: Optional[str] = None
# Status view of a single job: lifecycle timestamps, queue metrics, and either
# a result or an error message.
class JobResponse(BaseModel):
    job_id: str
    status: JobStatus
    # Timestamps are floats; the job store in this file fills its equivalents
    # with time.time() (seconds since the epoch).
    created_at: float
    started_at: Optional[float] = None
    finished_at: Optional[float] = None
    # queue observability
    queue_position: int = 0
    eta_seconds: Optional[float] = None
    avg_job_seconds: Optional[float] = None
    # Outcome: `result` for a successful job, `error` text for a failed one
    # (both optional, so neither is required while the job is pending).
    result: Optional[JobResult] = None
    error: Optional[str] = None
@dataclass
class _JobRecord:
    """Mutable in-memory record of one job, as kept by `_JobStore`."""

    job_id: str
    status: JobStatus
    # Creation time; `_JobStore` sets it from time.time().
    created_at: float
    # Set by `_JobStore.mark_running`; None until the job starts.
    started_at: Optional[float] = None
    # Set by `_JobStore.mark_succeeded` / `mark_failed`; None until the job ends.
    finished_at: Optional[float] = None
    # Exactly one of `result` / `error` is populated once the job has finished.
    result: Optional[Dict[str, Any]] = None
    error: Optional[str] = None
    # Environment tag supplied by the caller of `_JobStore.create_with_id`.
    env: str = "development"
class _JobStore:
    """In-memory registry of `_JobRecord` objects keyed by job id.

    Every read and write of the internal dictionary happens while holding a
    `threading.Lock`.
    """

    def __init__(self, max_age_seconds: int = JOB_STORE_MAX_AGE_SECONDS) -> None:
        self._lock = Lock()
        self._jobs: Dict[str, _JobRecord] = {}
        self._max_age = max_age_seconds

    def create(self) -> _JobRecord:
        """Register and return a new queued job with a freshly generated UUID4 id."""
        new_id = str(uuid4())
        record = _JobRecord(job_id=new_id, status="queued", created_at=time.time())
        with self._lock:
            self._jobs[new_id] = record
        return record

    def create_with_id(self, job_id: str, env: str = "development") -> _JobRecord:
        """Register and return a new queued job under the caller-supplied id.

        An existing record stored under the same id is replaced.
        """
        record = _JobRecord(job_id=job_id, status="queued", created_at=time.time(), env=env)
        with self._lock:
            self._jobs[job_id] = record
        return record

    def get(self, job_id: str) -> Optional[_JobRecord]:
        """Return the record for `job_id`, or None when the id is unknown."""
        with self._lock:
            found = self._jobs.get(job_id)
        return found

    def mark_running(self, job_id: str) -> None:
        """Flag the job as running and stamp its start time.

        Raises:
            KeyError: if `job_id` is not in the store.
        """
        with self._lock:
            record = self._jobs[job_id]
            record.status = "running"
            record.started_at = time.time()

    def mark_succeeded(self, job_id: str, result: Dict[str, Any]) -> None:
        """Flag the job as succeeded, store its result and clear any error.

        Raises:
            KeyError: if `job_id` is not in the store.
        """
        with self._lock:
            record = self._jobs[job_id]
            record.status = "succeeded"
            record.finished_at = time.time()
            record.result = result
            record.error = None

    def mark_failed(self, job_id: str, error: str) -> None:
        """Flag the job as failed, store the error text and clear any result.

        Raises:
            KeyError: if `job_id` is not in the store.
        """
        with self._lock:
            record = self._jobs[job_id]
            record.status = "failed"
            record.finished_at = time.time()
            record.result = None
            record.error = error

    def cleanup_old_jobs(self, max_age_seconds: Optional[int] = None) -> int:
        """
        Drop finished jobs whose age exceeds the retention window.

        Only jobs with status 'succeeded' or 'failed' are eligible; 'queued'
        and 'running' jobs are never removed. Age is measured from
        `finished_at`, falling back to `created_at` when that is unset.
        When `max_age_seconds` is None the store-wide default is used.
        Returns the number of jobs removed.
        """
        threshold = self._max_age if max_age_seconds is None else max_age_seconds
        current = time.time()
        with self._lock:
            expired = [
                jid
                for jid, rec in self._jobs.items()
                if rec.status in ("succeeded", "failed")
                and current - (rec.finished_at or rec.created_at) > threshold
            ]
            for jid in expired:
                del self._jobs[jid]
        return len(expired)

    def get_stats(self) -> Dict[str, int]:
        """Return the total job count plus a per-status breakdown."""
        with self._lock:
            counts = {
                "total": len(self._jobs),
                "queued": 0,
                "running": 0,
                "succeeded": 0,
                "failed": 0,
            }
            for rec in self._jobs.values():
                state = rec.status
                if state in counts:
                    counts[state] += 1
            return counts
def _env_bool(name: str, default: bool) -> bool:
    """Read a boolean flag from the environment.

    Args:
        name: Environment variable name.
        default: Value returned when the variable is unset or blank.
    Returns:
        True when the value (trimmed, case-insensitive) is one of
        "1", "true", "yes", "y", "on"; False for any other non-blank value.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    normalized = raw.strip().lower()
    if not normalized:
        # A blank value (e.g. `FLAG=` in a .env file) is treated as "not set"
        # rather than as an explicit False, matching how _to_bool handles "".
        return default
    return normalized in {"1", "true", "yes", "y", "on"}
def _get_project_root() -> str:
    """Return the absolute path of the directory two levels above this file."""
    containing_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.dirname(containing_dir)
def _get_model_name(config_path: str) -> str:
    """
    Extract model name from config_path.

    Both "/" and "\\" are treated as path separators on every platform, so a
    Windows-style path yields the same name when the server runs on POSIX.

    Args:
        config_path: Path like "acestep-v15-turbo" or "/path/to/acestep-v15-turbo"
    Returns:
        Model name (last directory name from config_path), or "" when
        config_path is empty.
    """
    if not config_path:
        return ""
    # Normalise separators first: os.path.basename only splits on "\\" when
    # running on Windows, while trailing separators of either kind must go.
    normalized = config_path.replace("\\", "/").rstrip("/")
    return os.path.basename(normalized)
def _load_project_env() -> None:
    """Load `.env` from the project root if python-dotenv is installed.

    Variables already present in the environment are kept (`override=False`).
    Does nothing when the optional dependency is missing or the file does not
    exist; any error raised while loading is deliberately ignored.
    """
    if load_dotenv is None:
        return
    try:
        dotenv_file = os.path.join(_get_project_root(), ".env")
        if not os.path.exists(dotenv_file):
            return
        load_dotenv(dotenv_file, override=False)
    except Exception:
        # Optional best-effort: continue even if .env loading fails.
        return
+_load_project_env()
def _to_int(v: Any, default: Optional[int] = None) -> Optional[int]:
    """Coerce a loosely typed request value to int.

    Accepts ints, bools (normalised to 0/1), floats and numeric strings whose
    value is integral (8.0, "120.0"), and plain integer strings. Anything else
    - None, blank text, non-numeric text, non-integral or non-finite numbers -
    yields `default`.

    Args:
        v: Raw value taken from JSON, form data or query parameters.
        default: Value returned when `v` cannot be interpreted as an integer.
    Returns:
        The parsed integer, or `default`.
    """
    if v is None:
        return default
    if isinstance(v, bool):
        # bool is a subclass of int; return a real int rather than True/False.
        return int(v)
    if isinstance(v, int):
        return v
    if isinstance(v, float):
        # JSON clients often send 8.0 for 8. Non-integral values are rejected
        # instead of being silently truncated; nan/inf are never integral.
        return int(v) if v.is_integer() else default
    s = str(v).strip()
    if s == "":
        return default
    try:
        return int(s)
    except ValueError:
        pass
    # Fall back to float syntax so "120.0" is accepted like the float 120.0.
    try:
        as_float = float(s)
    except ValueError:
        return default
    return int(as_float) if as_float.is_integer() else default
def _to_float(v: Any, default: Optional[float] = None) -> Optional[float]:
    """Coerce a loosely typed request value to a finite float.

    Args:
        v: Raw value taken from JSON, form data or query parameters.
        default: Value returned when `v` is None, blank, not parseable as a
            float, or not finite.
    Returns:
        The parsed float, or `default`.
    """
    import math

    if v is None:
        return default
    if isinstance(v, float):
        parsed = v
    else:
        s = str(v).strip()
        if s == "":
            return default
        try:
            parsed = float(s)
        except ValueError:
            return default
    # float() accepts "nan", "inf" and "infinity"; none of them is a usable
    # numeric parameter, so treat them like any other invalid input.
    return parsed if math.isfinite(parsed) else default
def _to_bool(v: Any, default: bool = False) -> bool:
    """Coerce a loosely typed request value to bool.

    Real booleans pass through unchanged. None and blank text yield `default`.
    Any other value is stringified, trimmed and lower-cased; it is True only
    when it equals one of "1", "true", "yes", "y", "on".
    """
    if isinstance(v, bool):
        return v
    if v is None:
        return default
    text = str(v).strip().lower()
    if not text:
        return default
    truthy = {"1", "true", "yes", "y", "on"}
    return text in truthy
def _map_status(status: str) -> int:
    """Translate a job status string into its STATUS_MAP integer code.

    A status that is not in STATUS_MAP maps to 2, the same code as "failed".
    """
    fallback_code = 2
    return STATUS_MAP[status] if status in STATUS_MAP else fallback_code
def _parse_timesteps(s: Optional[str]) -> Optional[List[float]]:
    """Parse comma-separated timesteps string to list of floats.

    Empty items (as in "0.9,,0.5" or a trailing comma) are skipped.

    Args:
        s: Text such as "0.97,0.76,0.5,0", or None.
    Returns:
        The parsed values in input order, or None when `s` is None/blank,
        contains no values at all (e.g. ",,"), or any item is not a number.
    """
    if not s or not s.strip():
        return None
    try:
        values = [float(item) for item in (part.strip() for part in s.split(",")) if item]
    except ValueError:
        return None
    # Separators only: report "no custom timesteps" instead of an empty
    # schedule, so callers see the same None as for a blank string.
    return values or None
def _is_instrumental(lyrics: str) -> bool:
    """
    Decide from the lyrics alone whether the track is instrumental.

    True when the lyrics are missing, empty or whitespace-only, or when they
    consist solely of the marker "[inst]" or "[instrumental]" (compared after
    trimming and lower-casing). False for any other text.
    """
    normalized = lyrics.strip().lower() if lyrics else ""
    return normalized in ("", "[inst]", "[instrumental]")
class RequestParser:
    """Parse request parameters from multiple sources with alias support.

    Three sources are consulted, in this priority order:
    1. the raw top-level request mapping,
    2. the mapping found under its "param_obj" key,
    3. the mapping found under the first non-empty of "metas", "meta",
       "metadata", "user_metadata", "userMetadata".
    Sources 2 and 3 may be given either as a dict or as a JSON-encoded string.
    """

    def __init__(self, raw: dict):
        self._raw = dict(raw) if raw else {}
        self._param_obj = self._parse_json(self._raw.get("param_obj"))
        self._metas = self._find_metas()

    def _parse_json(self, v) -> dict:
        """Return `v` as a dict, decoding it from JSON text when necessary.

        Always returns a dict: invalid JSON, and valid JSON whose top-level
        value is not an object (a list, number, string, ...), both yield {}
        so that `get()` can rely on every source supporting `.get`.
        """
        if isinstance(v, dict):
            return v
        if isinstance(v, str) and v.strip():
            try:
                parsed = json.loads(v)
            except (ValueError, RecursionError):
                # Malformed or pathologically nested JSON: treat as absent.
                return {}
            if isinstance(parsed, dict):
                return parsed
        return {}

    def _find_metas(self) -> dict:
        """Return the first non-empty metadata value of the raw request as a dict."""
        for key in ("metas", "meta", "metadata", "user_metadata", "userMetadata"):
            v = self._raw.get(key)
            if v:
                return self._parse_json(v)
        return {}

    def get(self, name: str, default=None):
        """Get parameter by canonical name from all sources.

        Every alias listed for `name` in PARAM_ALIASES is tried (just `name`
        itself when it has no entry). The first value that is not None wins;
        `default` is returned when no source provides one.
        """
        aliases = PARAM_ALIASES.get(name, [name])
        for source in (self._raw, self._param_obj, self._metas):
            for alias in aliases:
                v = source.get(alias)
                if v is not None:
                    return v
        return default

    # The typed accessors below intentionally carry the names of builtins; the
    # names are part of the public interface. Inside method bodies `str`,
    # `int`, ... still resolve to the builtins, because class attributes are
    # not visible from function scope.
    def str(self, name: str, default: str = "") -> str:
        """Return the parameter converted with str(), or `default` when absent."""
        v = self.get(name)
        return str(v) if v is not None else default

    def int(self, name: str, default: Optional[int] = None) -> Optional[int]:
        """Return the parameter coerced by `_to_int`, or `default`."""
        return _to_int(self.get(name), default)

    def float(self, name: str, default: Optional[float] = None) -> Optional[float]:
        """Return the parameter coerced by `_to_float`, or `default`."""
        return _to_float(self.get(name), default)

    def bool(self, name: str, default: bool = False) -> bool:
        """Return the parameter coerced by `_to_bool`, or `default`."""
        return _to_bool(self.get(name), default)
async def _save_upload_to_temp(upload: StarletteUploadFile, *, prefix: str) -> str:
    """Stream an uploaded file into a new temporary file and return its path.

    The temp file is named "<prefix>_<random><suffix>", where the suffix is
    the extension of the uploaded filename (if any). Data is copied in 1 MiB
    chunks. The upload is always closed, whether or not the copy succeeds.

    Args:
        upload: The uploaded file object to read from.
        prefix: Prefix for the temporary file name.
    Returns:
        Path of the written temporary file; the caller owns its deletion.
    Raises:
        Whatever the read/write raised. The partial temp file is removed
        before the error propagates, including on task cancellation.
    """
    suffix = Path(upload.filename or "").suffix
    fd, path = tempfile.mkstemp(prefix=f"{prefix}_", suffix=suffix)
    # mkstemp hands back an open descriptor; close it and reopen by path below.
    os.close(fd)
    try:
        with open(path, "wb") as f:
            while True:
                chunk = await upload.read(1024 * 1024)
                if not chunk:
                    break
                f.write(chunk)
    except BaseException:
        # BaseException rather than Exception: asyncio.CancelledError derives
        # from BaseException, and a cancelled request must not leak the
        # partially written file. The error is always re-raised.
        try:
            os.remove(path)
        except OSError:
            pass
        raise
    finally:
        try:
            await upload.close()
        except Exception:
            # Best-effort close: a failure here must not mask the real outcome.
            pass
    return path
+def create_app() -> FastAPI:
+    store = _JobStore()
+    QUEUE_MAXSIZE = int(os.getenv("ACESTEP_QUEUE_MAXSIZE", "200"))
+    WORKER_COUNT = int(os.getenv("ACESTEP_QUEUE_WORKERS", "1"))  # Single GPU recommended
+    INITIAL_AVG_JOB_SECONDS = float(os.getenv("ACESTEP_AVG_JOB_SECONDS", "5.0"))
+    AVG_WINDOW = int(os.getenv("ACESTEP_AVG_WINDOW", "50"))
+    def _path_to_audio_url(path: str) -> str:
+        """Convert local file path to downloadable relative URL"""
+        if not path:
+            return path
+        if path.startswith("http://") or path.startswith("https://"):
+            return path
+        encoded_path = urllib.parse.quote(path, safe="")
+        return f"/v1/audio?path={encoded_path}"
+    @asynccontextmanager
+    async def lifespan(app: FastAPI):
+        # Clear proxy env that may affect downstream libs
+        for proxy_var in ["http_proxy", "https_proxy", "HTTP_PROXY", "HTTPS_PROXY", "ALL_PROXY"]:
+            os.environ.pop(proxy_var, None)
+        # Ensure compilation/temp caches do not fill up small default /tmp.
+        # Triton/Inductor (and the system compiler) can create large temporary files.
+        project_root = _get_project_root()
+        cache_root = os.path.join(project_root, ".cache", "acestep")
+        tmp_root = (os.getenv("ACESTEP_TMPDIR") or os.path.join(cache_root, "tmp")).strip()
+        triton_cache_root = (os.getenv("TRITON_CACHE_DIR") or os.path.join(cache_root, "triton")).strip()
+        inductor_cache_root = (os.getenv("TORCHINDUCTOR_CACHE_DIR") or os.path.join(cache_root, "torchinductor")).strip()
+        for p in [cache_root, tmp_root, triton_cache_root, inductor_cache_root]:
+            try:
+                os.makedirs(p, exist_ok=True)
+            except Exception:
+                # Best-effort: do not block startup if directory creation fails.
+                pass
+        # Respect explicit user overrides; if ACESTEP_TMPDIR is set, it should win.
+        if os.getenv("ACESTEP_TMPDIR"):
+            os.environ["TMPDIR"] = tmp_root
+            os.environ["TEMP"] = tmp_root
+            os.environ["TMP"] = tmp_root
+        else:
+            os.environ.setdefault("TMPDIR", tmp_root)
+            os.environ.setdefault("TEMP", tmp_root)
+            os.environ.setdefault("TMP", tmp_root)
+        os.environ.setdefault("TRITON_CACHE_DIR", triton_cache_root)
+        os.environ.setdefault("TORCHINDUCTOR_CACHE_DIR", inductor_cache_root)
+        handler = AceStepHandler()
+        llm_handler = LLMHandler()
+        init_lock = asyncio.Lock()
+        app.state._initialized = False
+        app.state._init_error = None
+        app.state._init_lock = init_lock
+        app.state.llm_handler = llm_handler
+        app.state._llm_initialized = False
+        app.state._llm_init_error = None
+        app.state._llm_init_lock = Lock()
+        # Multi-model support: secondary DiT handlers
+        handler2 = None
+        handler3 = None
+        config_path2 = os.getenv("ACESTEP_CONFIG_PATH2", "").strip()
+        config_path3 = os.getenv("ACESTEP_CONFIG_PATH3", "").strip()
+        if config_path2:
+            handler2 = AceStepHandler()
+        if config_path3:
+            handler3 = AceStepHandler()
+        app.state.handler2 = handler2
+        app.state.handler3 = handler3
+        app.state._initialized2 = False
+        app.state._initialized3 = False
+        app.state._config_path = os.getenv("ACESTEP_CONFIG_PATH", "acestep-v15-turbo")
+        app.state._config_path2 = config_path2
+        app.state._config_path3 = config_path3
+        max_workers = int(os.getenv("ACESTEP_API_WORKERS", "1"))
+        executor = ThreadPoolExecutor(max_workers=max_workers)
+        # Queue & observability
+        app.state.job_queue = asyncio.Queue(maxsize=QUEUE_MAXSIZE)  # (job_id, req)
+        app.state.pending_ids = deque()  # queued job_ids
+        app.state.pending_lock = asyncio.Lock()
+        # temp files per job (from multipart uploads)
+        app.state.job_temp_files = {}  # job_id -> list[path]
+        app.state.job_temp_files_lock = asyncio.Lock()
+        # stats
+        app.state.stats_lock = asyncio.Lock()
+        app.state.recent_durations = deque(maxlen=AVG_WINDOW)
+        app.state.avg_job_seconds = INITIAL_AVG_JOB_SECONDS
+        app.state.handler = handler
+        app.state.executor = executor
+        app.state.job_store = store
+        app.state._python_executable = sys.executable
+        # Temporary directory for saving generated audio files
+        app.state.temp_audio_dir = os.path.join(tmp_root, "api_audio")
+        os.makedirs(app.state.temp_audio_dir, exist_ok=True)
+        # Initialize local cache
+        try:
+            from acestep.local_cache import get_local_cache
+            local_cache_dir = os.path.join(cache_root, "local_redis")
+            app.state.local_cache = get_local_cache(local_cache_dir)
+        except ImportError:
+            app.state.local_cache = None
+        async def _ensure_initialized() -> None:
+            h: AceStepHandler = app.state.handler
+            if getattr(app.state, "_initialized", False):
+                return
+            if getattr(app.state, "_init_error", None):
+                raise RuntimeError(app.state._init_error)
+            async with app.state._init_lock:
+                if getattr(app.state, "_initialized", False):
+                    return
+                if getattr(app.state, "_init_error", None):
+                    raise RuntimeError(app.state._init_error)
+                project_root = _get_project_root()
+                config_path = os.getenv("ACESTEP_CONFIG_PATH", "acestep-v15-turbo")
+                device = os.getenv("ACESTEP_DEVICE", "auto")
+                use_flash_attention = _env_bool("ACESTEP_USE_FLASH_ATTENTION", True)
+                offload_to_cpu = _env_bool("ACESTEP_OFFLOAD_TO_CPU", False)
+                offload_dit_to_cpu = _env_bool("ACESTEP_OFFLOAD_DIT_TO_CPU", False)
+                # Initialize primary model
+                status_msg, ok = h.initialize_service(
+                    project_root=project_root,
+                    config_path=config_path,
+                    device=device,
+                    use_flash_attention=use_flash_attention,
+                    compile_model=False,
+                    offload_to_cpu=offload_to_cpu,
+                    offload_dit_to_cpu=offload_dit_to_cpu,
+                )
+                if not ok:
+                    app.state._init_error = status_msg
+                    raise RuntimeError(status_msg)
+                app.state._initialized = True
+                # Initialize secondary model if configured
+                if app.state.handler2 and app.state._config_path2:
+                    try:
+                        status_msg2, ok2 = app.state.handler2.initialize_service(
+                            project_root=project_root,
+                            config_path=app.state._config_path2,
+                            device=device,
+                            use_flash_attention=use_flash_attention,
+                            compile_model=False,
+                            offload_to_cpu=offload_to_cpu,
+                            offload_dit_to_cpu=offload_dit_to_cpu,
+                        )
+                        app.state._initialized2 = ok2
+                        if ok2:
+                            print(f"[API Server] Secondary model loaded: {_get_model_name(app.state._config_path2)}")
+                        else:
+                            print(f"[API Server] Warning: Secondary model failed to load: {status_msg2}")
+                    except Exception as e:
+                        print(f"[API Server] Warning: Failed to initialize secondary model: {e}")
+                        app.state._initialized2 = False
+                # Initialize third model if configured
+                if app.state.handler3 and app.state._config_path3:
+                    try:
+                        status_msg3, ok3 = app.state.handler3.initialize_service(
+                            project_root=project_root,
+                            config_path=app.state._config_path3,
+                            device=device,
+                            use_flash_attention=use_flash_attention,
+                            compile_model=False,
+                            offload_to_cpu=offload_to_cpu,
+                            offload_dit_to_cpu=offload_dit_to_cpu,
+                        )
+                        app.state._initialized3 = ok3
+                        if ok3:
+                            print(f"[API Server] Third model loaded: {_get_model_name(app.state._config_path3)}")
+                        else:
+                            print(f"[API Server] Warning: Third model failed to load: {status_msg3}")
+                    except Exception as e:
+                        print(f"[API Server] Warning: Failed to initialize third model: {e}")
+                        app.state._initialized3 = False
+        async def _cleanup_job_temp_files(job_id: str) -> None:
+            async with app.state.job_temp_files_lock:
+                paths = app.state.job_temp_files.pop(job_id, [])
+            for p in paths:
+                try:
+                    os.remove(p)
+                except Exception:
+                    pass
+        def _update_local_cache(job_id: str, result: Optional[Dict], status: str) -> None:
+            """Update local cache with job result"""
+            local_cache = getattr(app.state, 'local_cache', None)
+            if not local_cache:
+                return
+            rec = store.get(job_id)
+            env = getattr(rec, 'env', 'development') if rec else 'development'
+            create_time = rec.created_at if rec else time.time()
+            status_int = _map_status(status)
+            if status == "succeeded" and result:
+                audio_paths = result.get("audio_paths", [])
+                # Final prompt/lyrics (may be modified by thinking/format)
+                final_prompt = result.get("prompt", "")
+                final_lyrics = result.get("lyrics", "")
+                # Original user input from metas
+                metas_raw = result.get("metas", {}) or {}
+                original_prompt = metas_raw.get("prompt", "")
+                original_lyrics = metas_raw.get("lyrics", "")
+                # metas contains original input + other metadata
+                metas = {
+                    "bpm": metas_raw.get("bpm"),
+                    "duration": metas_raw.get("duration"),
+                    "genres": metas_raw.get("genres", ""),
+                    "keyscale": metas_raw.get("keyscale", ""),
+                    "timesignature": metas_raw.get("timesignature", ""),
+                    "prompt": original_prompt,
+                    "lyrics": original_lyrics,
+                }
+                # Extra fields for Discord bot
+                generation_info = result.get("generation_info", "")
+                seed_value = result.get("seed_value", "")
+                lm_model = result.get("lm_model", "")
+                dit_model = result.get("dit_model", "")
+                if audio_paths:
+                    result_data = [
+                        {
+                            "file": p,
+                            "wave": "",
+                            "status": status_int,
+                            "create_time": int(create_time),
+                            "env": env,
+                            "prompt": final_prompt,
+                            "lyrics": final_lyrics,
+                            "metas": metas,
+                            "generation_info": generation_info,
+                            "seed_value": seed_value,
+                            "lm_model": lm_model,
+                            "dit_model": dit_model,
+                        }
+                        for p in audio_paths
+                    ]
+                else:
+                    result_data = [{
+                        "file": "",
+                        "wave": "",
+                        "status": status_int,
+                        "create_time": int(create_time),
+                        "env": env,
+                        "prompt": final_prompt,
+                        "lyrics": final_lyrics,
+                        "metas": metas,
+                        "generation_info": generation_info,
+                        "seed_value": seed_value,
+                        "lm_model": lm_model,
+                        "dit_model": dit_model,
+                    }]
+            else:
+                result_data = [{"file": "", "wave": "", "status": status_int, "create_time": int(create_time), "env": env}]
+            result_key = f"{RESULT_KEY_PREFIX}{job_id}"
+            local_cache.set(result_key, result_data, ex=RESULT_EXPIRE_SECONDS)
+        async def _run_one_job(job_id: str, req: GenerateMusicRequest) -> None:
+            """Run a single queued generation job from start to finish.
+
+            Steps:
+            1. Await ``_ensure_initialized()`` and mark the job as running.
+            2. Pick the DiT handler: the primary one by default, or the second/third
+               handler when ``req.model`` matches its model name and it is initialized.
+               An unknown model name falls back to the primary handler with a log line.
+            3. Run ``_blocking_generate`` on ``app.state.executor``.
+            4. Mark the job succeeded/failed in the job store and mirror the outcome
+               into the local cache.
+            5. Always record the elapsed time and refresh ``app.state.avg_job_seconds``.
+
+            Exceptions (``Exception`` subclasses) raised during generation are not
+            propagated; the formatted traceback is stored via ``mark_failed``.
+            """
+            job_store: _JobStore = app.state.job_store
+            llm: LLMHandler = app.state.llm_handler
+            executor: ThreadPoolExecutor = app.state.executor
+            await _ensure_initialized()
+            job_store.mark_running(job_id)
+            # Select DiT handler based on user's model choice
+            # Default: use primary handler
+            selected_handler: AceStepHandler = app.state.handler
+            selected_model_name = _get_model_name(app.state._config_path)
+            if req.model:
+                model_matched = False
+                # Check if it matches the second model
+                if app.state.handler2 and getattr(app.state, "_initialized2", False):
+                    model2_name = _get_model_name(app.state._config_path2)
+                    if req.model == model2_name:
+                        selected_handler = app.state.handler2
+                        selected_model_name = model2_name
+                        model_matched = True
+                        print(f"[API Server] Job {job_id}: Using second model: {model2_name}")
+                # Check if it matches the third model
+                if not model_matched and app.state.handler3 and getattr(app.state, "_initialized3", False):
+                    model3_name = _get_model_name(app.state._config_path3)
+                    if req.model == model3_name:
+                        selected_handler = app.state.handler3
+                        selected_model_name = model3_name
+                        model_matched = True
+                        print(f"[API Server] Job {job_id}: Using third model: {model3_name}")
+                # No match (this also covers a request naming the primary model):
+                # keep the primary handler and log the models that are available.
+                if not model_matched:
+                    available_models = [_get_model_name(app.state._config_path)]
+                    if app.state.handler2 and getattr(app.state, "_initialized2", False):
+                        available_models.append(_get_model_name(app.state._config_path2))
+                    if app.state.handler3 and getattr(app.state, "_initialized3", False):
+                        available_models.append(_get_model_name(app.state._config_path3))
+                    print(f"[API Server] Job {job_id}: Model '{req.model}' not found in {available_models}, using primary: {selected_model_name}")
+            # Use selected handler for generation
+            h: AceStepHandler = selected_handler
+            def _blocking_generate() -> Dict[str, Any]:
+                """Generate music using unified inference logic from acestep.inference.
+
+                Runs synchronously (it is submitted to the executor by the caller).
+                Returns the response dict stored as the job result; raises
+                ``RuntimeError`` when LM initialization, ``create_sample`` or
+                ``generate_music`` reports a failure.
+                """
+                def _ensure_llm_ready() -> None:
+                    """Ensure LLM handler is initialized when needed.
+
+                    Guarded by ``app.state._llm_init_lock``. Returns immediately when
+                    the LM is already initialized OR a previous attempt recorded an
+                    error, i.e. a failed initialization is not retried.
+                    """
+                    with app.state._llm_init_lock:
+                        initialized = getattr(app.state, "_llm_initialized", False)
+                        had_error = getattr(app.state, "_llm_init_error", None)
+                        if initialized or had_error is not None:
+                            return
+                        project_root = _get_project_root()
+                        checkpoint_dir = os.path.join(project_root, "checkpoints")
+                        # Precedence: request field, then environment variable, then default.
+                        lm_model_path = (req.lm_model_path or os.getenv("ACESTEP_LM_MODEL_PATH") or "acestep-5Hz-lm-0.6B").strip()
+                        backend = (req.lm_backend or os.getenv("ACESTEP_LM_BACKEND") or "vllm").strip().lower()
+                        # Unknown backends silently fall back to "vllm".
+                        if backend not in {"vllm", "pt"}:
+                            backend = "vllm"
+                        lm_device = os.getenv("ACESTEP_LM_DEVICE", os.getenv("ACESTEP_DEVICE", "auto"))
+                        lm_offload = _env_bool("ACESTEP_LM_OFFLOAD_TO_CPU", False)
+                        status, ok = llm.initialize(
+                            checkpoint_dir=checkpoint_dir,
+                            lm_model_path=lm_model_path,
+                            backend=backend,
+                            device=lm_device,
+                            offload_to_cpu=lm_offload,
+                            dtype=h.dtype,
+                        )
+                        if not ok:
+                            app.state._llm_init_error = status
+                        else:
+                            app.state._llm_initialized = True
+                def _normalize_metas(meta: Dict[str, Any]) -> Dict[str, Any]:
+                    """Ensure a stable `metas` dict (keys always present).
+
+                    Returns a shallow copy of ``meta`` (``None`` is treated as empty)
+                    in which ``key_scale``/``time_signature`` are aliased to
+                    ``keyscale``/``timesignature`` and missing or empty required keys
+                    are filled with the string ``"N/A"``.
+                    """
+                    meta = meta or {}
+                    out: Dict[str, Any] = dict(meta)
+                    # Normalize key aliases
+                    if "keyscale" not in out and "key_scale" in out:
+                        out["keyscale"] = out.get("key_scale")
+                    if "timesignature" not in out and "time_signature" in out:
+                        out["timesignature"] = out.get("time_signature")
+                    # Ensure required keys exist
+                    for k in ["bpm", "duration", "genres", "keyscale", "timesignature"]:
+                        if out.get(k) in (None, ""):
+                            out[k] = "N/A"
+                    return out
+                # Normalize LM sampling parameters
+                # lm_top_k: missing or non-positive values become 0.
+                lm_top_k = req.lm_top_k if req.lm_top_k and req.lm_top_k > 0 else 0
+                # lm_top_p: missing values and values >= 1.0 become 0.9.
+                # NOTE(review): after this line lm_top_p is always < 1.0 (unless the
+                # request carried a negative/odd value), so the later
+                # `lm_top_p if lm_top_p < 1.0 else None` guards never yield None.
+                lm_top_p = req.lm_top_p if req.lm_top_p and req.lm_top_p < 1.0 else 0.9
+                # Determine if LLM is needed
+                thinking = bool(req.thinking)
+                sample_mode = bool(req.sample_mode)
+                has_sample_query = bool(req.sample_query and req.sample_query.strip())
+                use_format = bool(req.use_format)
+                use_cot_caption = bool(req.use_cot_caption)
+                use_cot_language = bool(req.use_cot_language)
+                # LLM is needed for:
+                # - thinking mode (LM generates audio codes)
+                # - sample_mode (LM generates random caption/lyrics/metas)
+                # - sample_query/description (LM generates from description)
+                # - use_format (LM enhances caption/lyrics)
+                # - use_cot_caption or use_cot_language (LM enhances metadata)
+                need_llm = thinking or sample_mode or has_sample_query or use_format or use_cot_caption or use_cot_language
+                # Ensure LLM is ready if needed
+                if need_llm:
+                    _ensure_llm_ready()
+                    if getattr(app.state, "_llm_init_error", None):
+                        raise RuntimeError(f"5Hz LM init failed: {app.state._llm_init_error}")
+                # Handle sample mode or description: generate caption/lyrics/metas via LM
+                # Working copies of the request fields; they may be overwritten below
+                # by create_sample() and/or format_sample() output.
+                caption = req.prompt
+                lyrics = req.lyrics
+                bpm = req.bpm
+                key_scale = req.key_scale
+                time_signature = req.time_signature
+                audio_duration = req.audio_duration
+                # Save original user input for metas
+                original_prompt = req.prompt or ""
+                original_lyrics = req.lyrics or ""
+                if sample_mode or has_sample_query:
+                    # Parse description hints from sample_query (if provided)
+                    sample_query = req.sample_query if has_sample_query else "NO USER INPUT"
+                    parsed_language, parsed_instrumental = _parse_description_hints(sample_query)
+                    # Determine vocal_language with priority:
+                    # 1. User-specified vocal_language (if not default "en")
+                    # 2. Language parsed from description
+                    # 3. None (no constraint)
+                    if req.vocal_language and req.vocal_language not in ("en", "unknown", ""):
+                        sample_language = req.vocal_language
+                    else:
+                        sample_language = parsed_language
+                    sample_result = create_sample(
+                        llm_handler=llm,
+                        query=sample_query,
+                        instrumental=parsed_instrumental,
+                        vocal_language=sample_language,
+                        temperature=req.lm_temperature,
+                        top_k=lm_top_k if lm_top_k > 0 else None,
+                        top_p=lm_top_p if lm_top_p < 1.0 else None,
+                        use_constrained_decoding=True,
+                    )
+                    if not sample_result.success:
+                        raise RuntimeError(f"create_sample failed: {sample_result.error or sample_result.status_message}")
+                    # Use generated sample data
+                    caption = sample_result.caption
+                    lyrics = sample_result.lyrics
+                    bpm = sample_result.bpm
+                    key_scale = sample_result.keyscale
+                    time_signature = sample_result.timesignature
+                    audio_duration = sample_result.duration
+                # Apply format_sample() if use_format is True and caption/lyrics are provided
+                format_has_duration = False
+                if req.use_format and (caption or lyrics):
+                    _ensure_llm_ready()
+                    if getattr(app.state, "_llm_init_error", None):
+                        raise RuntimeError(f"5Hz LM init failed (needed for format): {app.state._llm_init_error}")
+                    # Build user_metadata from request params (matching bot.py behavior)
+                    user_metadata_for_format = {}
+                    if bpm is not None:
+                        user_metadata_for_format['bpm'] = bpm
+                    if audio_duration is not None and float(audio_duration) > 0:
+                        user_metadata_for_format['duration'] = float(audio_duration)
+                    if key_scale:
+                        user_metadata_for_format['keyscale'] = key_scale
+                    if time_signature:
+                        user_metadata_for_format['timesignature'] = time_signature
+                    if req.vocal_language and req.vocal_language != "unknown":
+                        user_metadata_for_format['language'] = req.vocal_language
+                    format_result = format_sample(
+                        llm_handler=llm,
+                        caption=caption,
+                        lyrics=lyrics,
+                        user_metadata=user_metadata_for_format if user_metadata_for_format else None,
+                        temperature=req.lm_temperature,
+                        top_k=lm_top_k if lm_top_k > 0 else None,
+                        top_p=lm_top_p if lm_top_p < 1.0 else None,
+                        use_constrained_decoding=True,
+                    )
+                    # A failed format is not an error: the unformatted values are kept.
+                    if format_result.success:
+                        # Extract all formatted data (matching bot.py behavior)
+                        caption = format_result.caption or caption
+                        lyrics = format_result.lyrics or lyrics
+                        if format_result.duration:
+                            audio_duration = format_result.duration
+                            format_has_duration = True
+                        if format_result.bpm:
+                            bpm = format_result.bpm
+                        if format_result.keyscale:
+                            key_scale = format_result.keyscale
+                        if format_result.timesignature:
+                            time_signature = format_result.timesignature
+                # Parse timesteps string to list of floats if provided
+                parsed_timesteps = _parse_timesteps(req.timesteps)
+                # Determine actual inference steps (timesteps override inference_steps)
+                # NOTE(review): actual_inference_steps is never read below; both
+                # GenerationParams and _build_generation_info receive
+                # req.inference_steps instead. Confirm whether that is intended.
+                actual_inference_steps = len(parsed_timesteps) if parsed_timesteps else req.inference_steps
+                # Auto-select instruction based on task_type if user didn't provide custom instruction
+                # This matches gradio behavior which uses TASK_INSTRUCTIONS for each task type
+                instruction_to_use = req.instruction
+                if instruction_to_use == DEFAULT_DIT_INSTRUCTION and req.task_type in TASK_INSTRUCTIONS:
+                    instruction_to_use = TASK_INSTRUCTIONS[req.task_type]
+                # Build GenerationParams using unified interface
+                # Note: thinking controls LM code generation, sample_mode only affects CoT metas
+                params = GenerationParams(
+                    task_type=req.task_type,
+                    instruction=instruction_to_use,
+                    reference_audio=req.reference_audio_path,
+                    src_audio=req.src_audio_path,
+                    audio_codes=req.audio_code_string,
+                    caption=caption,
+                    lyrics=lyrics,
+                    instrumental=_is_instrumental(lyrics),
+                    vocal_language=req.vocal_language,
+                    bpm=bpm,
+                    keyscale=key_scale,
+                    timesignature=time_signature,
+                    duration=audio_duration if audio_duration else -1.0,
+                    inference_steps=req.inference_steps,
+                    seed=req.seed,
+                    guidance_scale=req.guidance_scale,
+                    use_adg=req.use_adg,
+                    cfg_interval_start=req.cfg_interval_start,
+                    cfg_interval_end=req.cfg_interval_end,
+                    shift=req.shift,
+                    infer_method=req.infer_method,
+                    timesteps=parsed_timesteps,
+                    repainting_start=req.repainting_start,
+                    repainting_end=req.repainting_end if req.repainting_end else -1,
+                    audio_cover_strength=req.audio_cover_strength,
+                    # LM parameters
+                    thinking=thinking,  # Use LM for code generation when thinking=True
+                    lm_temperature=req.lm_temperature,
+                    lm_cfg_scale=req.lm_cfg_scale,
+                    lm_top_k=lm_top_k,
+                    lm_top_p=lm_top_p,
+                    lm_negative_prompt=req.lm_negative_prompt,
+                    # use_cot_metas logic:
+                    # - sample_mode: metas already generated, skip Phase 1
+                    # - format with duration: metas already generated, skip Phase 1
+                    # - format without duration: need Phase 1 to generate duration
+                    # - no format: need Phase 1 to generate all metas
+                    use_cot_metas=not sample_mode and not format_has_duration,
+                    use_cot_caption=req.use_cot_caption,
+                    use_cot_language=req.use_cot_language,
+                    use_constrained_decoding=True,
+                )
+                # Build GenerationConfig - default to 2 audios like gradio_ui
+                batch_size = req.batch_size if req.batch_size is not None else 2
+                config = GenerationConfig(
+                    batch_size=batch_size,
+                    use_random_seed=req.use_random_seed,
+                    seeds=None,  # Let unified logic handle seed generation
+                    audio_format=req.audio_format,
+                    constrained_decoding_debug=req.constrained_decoding_debug,
+                )
+                # Check LLM initialization status
+                # The LM handler is only handed to generate_music() once it has been
+                # successfully initialized; otherwise None is passed.
+                llm_is_initialized = getattr(app.state, "_llm_initialized", False)
+                llm_to_pass = llm if llm_is_initialized else None
+                # Generate music using unified interface
+                result = generate_music(
+                    dit_handler=h,
+                    llm_handler=llm_to_pass,
+                    params=params,
+                    config=config,
+                    save_dir=app.state.temp_audio_dir,
+                    progress=None,
+                )
+                if not result.success:
+                    raise RuntimeError(f"Music generation failed: {result.error or result.status_message}")
+                # Extract results
+                audio_paths = [audio["path"] for audio in result.audios if audio.get("path")]
+                first_audio = audio_paths[0] if len(audio_paths) > 0 else None
+                second_audio = audio_paths[1] if len(audio_paths) > 1 else None
+                # Get metadata from LM or CoT results
+                lm_metadata = result.extra_outputs.get("lm_metadata", {})
+                metas_out = _normalize_metas(lm_metadata)
+                # Update metas with actual values used
+                # For each field the params.cot_* value wins over the locally tracked value.
+                if params.cot_bpm:
+                    metas_out["bpm"] = params.cot_bpm
+                elif bpm:
+                    metas_out["bpm"] = bpm
+                if params.cot_duration:
+                    metas_out["duration"] = params.cot_duration
+                elif audio_duration:
+                    metas_out["duration"] = audio_duration
+                if params.cot_keyscale:
+                    metas_out["keyscale"] = params.cot_keyscale
+                elif key_scale:
+                    metas_out["keyscale"] = key_scale
+                if params.cot_timesignature:
+                    metas_out["timesignature"] = params.cot_timesignature
+                elif time_signature:
+                    metas_out["timesignature"] = time_signature
+                # Store original user input in metas (not the final/modified values)
+                metas_out["prompt"] = original_prompt
+                metas_out["lyrics"] = original_lyrics
+                # Extract seed values for response (comma-separated for multiple audios)
+                seed_values = []
+                for audio in result.audios:
+                    audio_params = audio.get("params", {})
+                    seed = audio_params.get("seed")
+                    if seed is not None:
+                        seed_values.append(str(seed))
+                seed_value = ",".join(seed_values) if seed_values else ""
+                # Build generation_info using the helper function (like gradio_ui)
+                time_costs = result.extra_outputs.get("time_costs", {})
+                generation_info = _build_generation_info(
+                    lm_metadata=lm_metadata,
+                    time_costs=time_costs,
+                    seed_value=seed_value,
+                    inference_steps=req.inference_steps,
+                    num_audios=len(result.audios),
+                )
+                def _none_if_na_str(v: Any) -> Optional[str]:
+                    """Return ``str(v).strip()``, or None for None, "" and "N/A"."""
+                    if v is None:
+                        return None
+                    s = str(v).strip()
+                    if s in {"", "N/A"}:
+                        return None
+                    return s
+                # Get model information
+                # NOTE(review): this reports the environment/default LM name only and
+                # ignores req.lm_model_path, which _ensure_llm_ready() prefers when
+                # loading the LM - the reported name may differ from the loaded model.
+                lm_model_name = os.getenv("ACESTEP_LM_MODEL_PATH", "acestep-5Hz-lm-0.6B")
+                # Use selected_model_name (set at the beginning of _run_one_job)
+                dit_model_name = selected_model_name
+                return {
+                    "first_audio_path": _path_to_audio_url(first_audio) if first_audio else None,
+                    "second_audio_path": _path_to_audio_url(second_audio) if second_audio else None,
+                    "audio_paths": [_path_to_audio_url(p) for p in audio_paths],
+                    "generation_info": generation_info,
+                    "status_message": result.status_message,
+                    "seed_value": seed_value,
+                    # Final prompt/lyrics (may be modified by thinking/format)
+                    "prompt": caption or "",
+                    "lyrics": lyrics or "",
+                    # metas contains original user input + other metadata
+                    "metas": metas_out,
+                    # Top-level bpm/duration are only exposed when numeric; the "N/A"
+                    # placeholder written by _normalize_metas() maps to None.
+                    "bpm": metas_out.get("bpm") if isinstance(metas_out.get("bpm"), int) else None,
+                    "duration": metas_out.get("duration") if isinstance(metas_out.get("duration"), (int, float)) else None,
+                    "genres": _none_if_na_str(metas_out.get("genres")),
+                    "keyscale": _none_if_na_str(metas_out.get("keyscale")),
+                    "timesignature": _none_if_na_str(metas_out.get("timesignature")),
+                    "lm_model": lm_model_name,
+                    "dit_model": dit_model_name,
+                }
+            t0 = time.time()
+            try:
+                # Run the blocking generation on the executor so the event loop stays free.
+                loop = asyncio.get_running_loop()
+                result = await loop.run_in_executor(executor, _blocking_generate)
+                job_store.mark_succeeded(job_id, result)
+                # Update local cache
+                _update_local_cache(job_id, result, "succeeded")
+            except Exception:
+                job_store.mark_failed(job_id, traceback.format_exc())
+                # Update local cache
+                _update_local_cache(job_id, None, "failed")
+            finally:
+                # Record the job duration (success or failure) and refresh the average.
+                dt = max(0.0, time.time() - t0)
+                async with app.state.stats_lock:
+                    app.state.recent_durations.append(dt)
+                    if app.state.recent_durations:
+                        app.state.avg_job_seconds = sum(app.state.recent_durations) / len(app.state.recent_durations)
+        async def _queue_worker(worker_idx: int) -> None:
+            while True:
+                job_id, req = await app.state.job_queue.get()
+                try:
+                    async with app.state.pending_lock:
+                        try:
+                            app.state.pending_ids.remove(job_id)
+                        except ValueError:
+                            pass
+                    await _run_one_job(job_id, req)
+                finally:
+                    await _cleanup_job_temp_files(job_id)
+                    app.state.job_queue.task_done()
+        async def _job_store_cleanup_worker() -> None:
+            """Background task to periodically clean up old completed jobs."""
+            while True:
+                try:
+                    await asyncio.sleep(JOB_STORE_CLEANUP_INTERVAL)
+                    removed = store.cleanup_old_jobs()
+                    if removed > 0:
+                        stats = store.get_stats()
+                        print(f"[API Server] Cleaned up {removed} old jobs. Current stats: {stats}")
+                except asyncio.CancelledError:
+                    break
+                except Exception as e:
+                    print(f"[API Server] Job cleanup error: {e}")
+        worker_count = max(1, WORKER_COUNT)
+        workers = [asyncio.create_task(_queue_worker(i)) for i in range(worker_count)]
+        cleanup_task = asyncio.create_task(_job_store_cleanup_worker())
+        app.state.worker_tasks = workers
+        app.state.cleanup_task = cleanup_task
+        try:
+            yield
+        finally:
+            cleanup_task.cancel()
+            for t in workers:
+                t.cancel()
+            executor.shutdown(wait=False, cancel_futures=True)
+    app = FastAPI(title="ACE-Step API", version="1.0", lifespan=lifespan)
+    async def _queue_position(job_id: str) -> int:
+        async with app.state.pending_lock:
+            try:
+                return list(app.state.pending_ids).index(job_id) + 1
+            except ValueError:
+                return 0
+    async def _eta_seconds_for_position(pos: int) -> Optional[float]:
+        if pos <= 0:
+            return None
+        async with app.state.stats_lock:
+            avg = float(getattr(app.state, "avg_job_seconds", INITIAL_AVG_JOB_SECONDS))
+        return pos * avg
+    @app.post("/release_task", response_model=CreateJobResponse)
+    async def create_music_generate_job(request: Request) -> CreateJobResponse:
+        content_type = (request.headers.get("content-type") or "").lower()
+        temp_files: list[str] = []
+        def _build_request(p: RequestParser, **kwargs) -> GenerateMusicRequest:
+            """Build GenerateMusicRequest from parsed parameters.
+
+            Each field is read from ``p`` through its typed accessors (``str``,
+            ``int``, ``float``, ``bool``); the second accessor argument, where
+            given, is passed as the default (presumably used when the field is
+            absent - confirm against RequestParser). ``kwargs`` are forwarded
+            unchanged to the model constructor; the callers in this endpoint use
+            them for ``reference_audio_path`` and ``src_audio_path``.
+            """
+            return GenerateMusicRequest(
+                prompt=p.str("prompt"),
+                lyrics=p.str("lyrics"),
+                thinking=p.bool("thinking"),
+                sample_mode=p.bool("sample_mode"),
+                sample_query=p.str("sample_query"),
+                use_format=p.bool("use_format"),
+                # Falsy model / lm_model_path values are normalized to None.
+                model=p.str("model") or None,
+                bpm=p.int("bpm"),
+                key_scale=p.str("key_scale"),
+                time_signature=p.str("time_signature"),
+                audio_duration=p.float("audio_duration"),
+                vocal_language=p.str("vocal_language", "en"),
+                inference_steps=p.int("inference_steps", 8),
+                guidance_scale=p.float("guidance_scale", 7.0),
+                use_random_seed=p.bool("use_random_seed", True),
+                seed=p.int("seed", -1),
+                batch_size=p.int("batch_size"),
+                audio_code_string=p.str("audio_code_string"),
+                repainting_start=p.float("repainting_start", 0.0),
+                repainting_end=p.float("repainting_end"),
+                instruction=p.str("instruction", DEFAULT_DIT_INSTRUCTION),
+                audio_cover_strength=p.float("audio_cover_strength", 1.0),
+                task_type=p.str("task_type", "text2music"),
+                use_adg=p.bool("use_adg"),
+                cfg_interval_start=p.float("cfg_interval_start", 0.0),
+                cfg_interval_end=p.float("cfg_interval_end", 1.0),
+                infer_method=p.str("infer_method", "ode"),
+                shift=p.float("shift", 3.0),
+                audio_format=p.str("audio_format", "mp3"),
+                use_tiled_decode=p.bool("use_tiled_decode", True),
+                lm_model_path=p.str("lm_model_path") or None,
+                lm_backend=p.str("lm_backend", "vllm"),
+                lm_temperature=p.float("lm_temperature", LM_DEFAULT_TEMPERATURE),
+                lm_cfg_scale=p.float("lm_cfg_scale", LM_DEFAULT_CFG_SCALE),
+                lm_top_k=p.int("lm_top_k"),
+                lm_top_p=p.float("lm_top_p", LM_DEFAULT_TOP_P),
+                lm_repetition_penalty=p.float("lm_repetition_penalty", 1.0),
+                lm_negative_prompt=p.str("lm_negative_prompt", "NO USER INPUT"),
+                constrained_decoding=p.bool("constrained_decoding", True),
+                constrained_decoding_debug=p.bool("constrained_decoding_debug"),
+                use_cot_caption=p.bool("use_cot_caption", True),
+                use_cot_language=p.bool("use_cot_language", True),
+                is_format_caption=p.bool("is_format_caption"),
+                **kwargs,
+            )
+        if content_type.startswith("application/json"):
+            body = await request.json()
+            if not isinstance(body, dict):
+                raise HTTPException(status_code=400, detail="JSON payload must be an object")
+            req = _build_request(RequestParser(body))
+        elif content_type.endswith("+json"):
+            body = await request.json()
+            if not isinstance(body, dict):
+                raise HTTPException(status_code=400, detail="JSON payload must be an object")
+            req = _build_request(RequestParser(body))
+        elif content_type.startswith("multipart/form-data"):
+            form = await request.form()
+            ref_up = form.get("reference_audio")
+            src_up = form.get("src_audio")
+            reference_audio_path = None
+            src_audio_path = None
+            if isinstance(ref_up, StarletteUploadFile):
+                reference_audio_path = await _save_upload_to_temp(ref_up, prefix="reference_audio")
+                temp_files.append(reference_audio_path)
+            else:
+                reference_audio_path = str(form.get("reference_audio_path") or "").strip() or None
+            if isinstance(src_up, StarletteUploadFile):
+                src_audio_path = await _save_upload_to_temp(src_up, prefix="src_audio")
+                temp_files.append(src_audio_path)
+            else:
+                src_audio_path = str(form.get("src_audio_path") or "").strip() or None
+            req = _build_request(
+                RequestParser(dict(form)),
+                reference_audio_path=reference_audio_path,
+                src_audio_path=src_audio_path,
+            )
+        elif content_type.startswith("application/x-www-form-urlencoded"):
+            form = await request.form()
+            reference_audio_path = str(form.get("reference_audio_path") or "").strip() or None
+            src_audio_path = str(form.get("src_audio_path") or "").strip() or None
+            req = _build_request(
+                RequestParser(dict(form)),
+                reference_audio_path=reference_audio_path,
+                src_audio_path=src_audio_path,
+            )
+        else:
+            raw = await request.body()
+            raw_stripped = raw.lstrip()
+            # Best-effort: accept missing/incorrect Content-Type if payload is valid JSON.
+            if raw_stripped.startswith(b"{") or raw_stripped.startswith(b"["):
+                try:
+                    body = json.loads(raw.decode("utf-8"))
+                    if isinstance(body, dict):
+                        req = _build_request(RequestParser(body))
+                    else:
+                        raise HTTPException(status_code=400, detail="JSON payload must be an object")
+                except HTTPException:
+                    raise
+                except Exception:
+                    raise HTTPException(
+                        status_code=400,
+                        detail="Invalid JSON body (hint: set 'Content-Type: application/json')",
+                    )
+            # Best-effort: parse key=value bodies even if Content-Type is missing.
+            elif raw_stripped and b"=" in raw:
+                parsed = urllib.parse.parse_qs(raw.decode("utf-8"), keep_blank_values=True)
+                flat = {k: (v[0] if isinstance(v, list) and v else v) for k, v in parsed.items()}
+                reference_audio_path = str(flat.get("reference_audio_path") or "").strip() or None
+                src_audio_path = str(flat.get("src_audio_path") or "").strip() or None
+                req = _build_request(
+                    RequestParser(flat),
+                    reference_audio_path=reference_audio_path,
+                    src_audio_path=src_audio_path,
+                )
+            else:
+                raise HTTPException(
+                    status_code=415,
+                    detail=(
+                        f"Unsupported Content-Type: {content_type or '(missing)'}; "
+                        "use application/json, application/x-www-form-urlencoded, or multipart/form-data"
+                    ),
+                )
+        rec = store.create()
+        q: asyncio.Queue = app.state.job_queue
+        if q.full():
+            for p in temp_files:
+                try:
+                    os.remove(p)
+                except Exception:
+                    pass
+            raise HTTPException(status_code=429, detail="Server busy: queue is full")
+        if temp_files:
+            async with app.state.job_temp_files_lock:
+                app.state.job_temp_files[rec.job_id] = temp_files
+        async with app.state.pending_lock:
+            app.state.pending_ids.append(rec.job_id)
+            position = len(app.state.pending_ids)
+        await q.put((rec.job_id, req))
+        return CreateJobResponse(task_id=rec.job_id, status="queued", queue_position=position)
+    @app.post("/query_result")
+    async def query_result(request: Request) -> List[Dict[str, Any]]:
+        """Batch query job results"""
+        content_type = (request.headers.get("content-type") or "").lower()
+        if "json" in content_type:
+            body = await request.json()
+        else:
+            form = await request.form()
+            body = {k: v for k, v in form.items()}
+        task_id_list_str = body.get("task_id_list", "[]")
+        # Parse task ID list
+        if isinstance(task_id_list_str, list):
+            task_id_list = task_id_list_str
+        else:
+            try:
+                task_id_list = json.loads(task_id_list_str)
+            except Exception:
+                task_id_list = []
+        local_cache = getattr(app.state, 'local_cache', None)
+        data_list = []
+        current_time = time.time()
+        for task_id in task_id_list:
+            result_key = f"{RESULT_KEY_PREFIX}{task_id}"
+            # Read from local cache first
+            if local_cache:
+                data = local_cache.get(result_key)
+                if data:
+                    try:
+                        data_json = json.loads(data)
+                    except Exception:
+                        data_json = []
+                    if len(data_json) <= 0:
+                        data_list.append({"task_id": task_id, "result": data, "status": 2})
+                    else:
+                        status = data_json[0].get("status")
+                        create_time = data_json[0].get("create_time", 0)
+                        if status == 0 and (current_time - create_time) > TASK_TIMEOUT_SECONDS:
+                            data_list.append({"task_id": task_id, "result": data, "status": 2})
+                        else:
+                            data_list.append({
+                                "task_id": task_id,
+                                "result": data,
+                                "status": int(status) if status is not None else 1,
+                            })
+                    continue
+            # Fallback to job_store query
+            rec = store.get(task_id)
+            if rec:
+                env = getattr(rec, 'env', 'development')
+                create_time = rec.created_at
+                status_int = _map_status(rec.status)
+                if rec.result and rec.status == "succeeded":
+                    audio_paths = rec.result.get("audio_paths", [])
+                    metas = rec.result.get("metas", {}) or {}
+                    result_data = [
+                        {
+                            "file": p, "wave": "", "status": status_int,
+                            "create_time": int(create_time), "env": env,
+                            "prompt": metas.get("caption", ""),
+                            "lyrics": metas.get("lyrics", ""),
+                            "metas": {
+                                "bpm": metas.get("bpm"),
+                                "duration": metas.get("duration"),
+                                "genres": metas.get("genres", ""),
+                                "keyscale": metas.get("keyscale", ""),
+                                "timesignature": metas.get("timesignature", ""),
+                            }
+                        }
+                        for p in audio_paths
+                    ] if audio_paths else [{
+                        "file": "", "wave": "", "status": status_int,
+                        "create_time": int(create_time), "env": env,
+                        "prompt": metas.get("caption", ""),
+                        "lyrics": metas.get("lyrics", ""),
+                        "metas": {
+                            "bpm": metas.get("bpm"),
+                            "duration": metas.get("duration"),
+                            "genres": metas.get("genres", ""),
+                            "keyscale": metas.get("keyscale", ""),
+                            "timesignature": metas.get("timesignature", ""),
+                        }
+                    }]
+                else:
+                    result_data = [{
+                        "file": "", "wave": "", "status": status_int,
+                        "create_time": int(create_time), "env": env,
+                        "prompt": "", "lyrics": "",
+                        "metas": {}
+                    }]
+                data_list.append({
+                    "task_id": task_id,
+                    "result": json.dumps(result_data, ensure_ascii=False),
+                    "status": status_int,
+                })
+            else:
+                data_list.append({"task_id": task_id, "result": "[]", "status": 0})
+        return data_list
+    @app.get("/health")
+    async def health_check():
+        """Health check endpoint for service status."""
+        return {
+            "status": "ok",
+            "service": "ACE-Step API",
+            "version": "1.0",
+        }
+    @app.get("/v1/stats")
+    async def get_stats():
+        """Get server statistics including job store stats."""
+        job_stats = store.get_stats()
+        async with app.state.stats_lock:
+            avg_job_seconds = getattr(app.state, "avg_job_seconds", INITIAL_AVG_JOB_SECONDS)
+        return {
+            "jobs": job_stats,
+            "queue_size": app.state.job_queue.qsize(),
+            "queue_maxsize": QUEUE_MAXSIZE,
+            "avg_job_seconds": avg_job_seconds,
+        }
+    @app.get("/v1/models")
+    async def list_models():
+        """List available DiT models."""
+        models = []
+        # Primary model (always available if initialized)
+        if getattr(app.state, "_initialized", False):
+            primary_model = _get_model_name(app.state._config_path)
+            if primary_model:
+                models.append({
+                    "name": primary_model,
+                    "is_default": True,
+                })
+        # Secondary model
+        if getattr(app.state, "_initialized2", False) and app.state._config_path2:
+            secondary_model = _get_model_name(app.state._config_path2)
+            if secondary_model:
+                models.append({
+                    "name": secondary_model,
+                    "is_default": False,
+                })
+        # Third model
+        if getattr(app.state, "_initialized3", False) and app.state._config_path3:
+            third_model = _get_model_name(app.state._config_path3)
+            if third_model:
+                models.append({
+                    "name": third_model,
+                    "is_default": False,
+                })
+        return {
+            "models": models,
+            "default_model": models[0]["name"] if models else None,
+        }
+    @app.get("/v1/audio")
+    async def get_audio(path: str):
+        """Serve audio file by path."""
+        from fastapi.responses import FileResponse
+        if not os.path.exists(path):
+            raise HTTPException(status_code=404, detail=f"Audio file not found: {path}")
+        ext = os.path.splitext(path)[1].lower()
+        media_types = {
+            ".mp3": "audio/mpeg",
+            ".wav": "audio/wav",
+            ".flac": "audio/flac",
+            ".ogg": "audio/ogg",
+        }
+        media_type = media_types.get(ext, "audio/mpeg")
+        return FileResponse(path, media_type=media_type)
+    return app
+# Module-level ASGI application; main() hands it to uvicorn as "acestep.api_server:app".
+app = create_app()
+def main() -> None:
+    import argparse
+    import uvicorn
+    parser = argparse.ArgumentParser(description="ACE-Step API server")
+    parser.add_argument(
+        "--host",
+        default=os.getenv("ACESTEP_API_HOST", "127.0.0.1"),
+        help="Bind host (default from ACESTEP_API_HOST or 127.0.0.1)",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=int(os.getenv("ACESTEP_API_PORT", "8001")),
+        help="Bind port (default from ACESTEP_API_PORT or 8001)",
+    )
+    args = parser.parse_args()
+    # IMPORTANT: in-memory queue/store -> workers MUST be 1
+    uvicorn.run(
+        "acestep.api_server:app",
+        host=str(args.host),
+        port=int(args.port),
+        reload=False,
+        workers=1,
+    )
+if __name__ == "__main__":
+    main()

code/acestep/audio_utils.py ADDED Viewed

	@@ -0,0 +1,327 @@

+"""
+Audio saving and transcoding utility module
+Independent audio file operations outside of handler, supporting:
+- Save audio tensor/numpy to files (default FLAC format, fast)
+- Format conversion (FLAC/WAV/MP3)
+- Batch processing
+"""
+import os
+import hashlib
+import json
+from pathlib import Path
+from typing import Union, Optional, List, Tuple
+import torch
+import numpy as np
+import torchaudio
+from loguru import logger
+class AudioSaver:
+    """Audio saving and transcoding utility class"""
+    def __init__(self, default_format: str = "flac"):
+        """
+        Initialize audio saver
+        Args:
+            default_format: Default save format ('flac', 'wav', 'mp3')
+        """
+        self.default_format = default_format.lower()
+        if self.default_format not in ["flac", "wav", "mp3"]:
+            logger.warning(f"Unsupported format {default_format}, using 'flac'")
+            self.default_format = "flac"
+    def save_audio(
+        self,
+        audio_data: Union[torch.Tensor, np.ndarray],
+        output_path: Union[str, Path],
+        sample_rate: int = 48000,
+        format: Optional[str] = None,
+        channels_first: bool = True,
+    ) -> str:
+        """
+        Save audio data to file
+        Args:
+            audio_data: Audio data, torch.Tensor [channels, samples] or numpy.ndarray
+            output_path: Output file path (extension can be omitted)
+            sample_rate: Sample rate
+            format: Audio format ('flac', 'wav', 'mp3'), defaults to default_format
+            channels_first: If True, tensor format is [channels, samples], else [samples, channels]
+        Returns:
+            Actual saved file path
+        """
+        format = (format or self.default_format).lower()
+        if format not in ["flac", "wav", "mp3"]:
+            logger.warning(f"Unsupported format {format}, using {self.default_format}")
+            format = self.default_format
+        # Ensure output path has correct extension
+        output_path = Path(output_path)
+        if output_path.suffix.lower() not in ['.flac', '.wav', '.mp3']:
+            output_path = output_path.with_suffix(f'.{format}')
+        # Convert to torch tensor
+        if isinstance(audio_data, np.ndarray):
+            if channels_first:
+                # numpy [samples, channels] -> tensor [channels, samples]
+                audio_tensor = torch.from_numpy(audio_data.T).float()
+            else:
+                # numpy [samples, channels] -> tensor [samples, channels] -> [channels, samples]
+                audio_tensor = torch.from_numpy(audio_data).float()
+                if audio_tensor.dim() == 2 and audio_tensor.shape[0] < audio_tensor.shape[1]:
+                    audio_tensor = audio_tensor.T
+        else:
+            # torch tensor
+            audio_tensor = audio_data.cpu().float()
+            if not channels_first and audio_tensor.dim() == 2:
+                # [samples, channels] -> [channels, samples]
+                if audio_tensor.shape[0] > audio_tensor.shape[1]:
+                    audio_tensor = audio_tensor.T
+        # Ensure memory is contiguous
+        audio_tensor = audio_tensor.contiguous()
+        # Select backend and save
+        try:
+            if format == "mp3":
+                # MP3 uses ffmpeg backend
+                torchaudio.save(
+                    str(output_path),
+                    audio_tensor,
+                    sample_rate,
+                    channels_first=True,
+                    backend='ffmpeg',
+                )
+            elif format in ["flac", "wav"]:
+                # FLAC and WAV use soundfile backend (fastest)
+                torchaudio.save(
+                    str(output_path),
+                    audio_tensor,
+                    sample_rate,
+                    channels_first=True,
+                    backend='soundfile',
+                )
+            else:
+                # Other formats use default backend
+                torchaudio.save(
+                    str(output_path),
+                    audio_tensor,
+                    sample_rate,
+                    channels_first=True,
+                )
+            logger.debug(f"[AudioSaver] Saved audio to {output_path} ({format}, {sample_rate}Hz)")
+            return str(output_path)
+        except Exception as e:
+            try:
+                import soundfile as sf
+                audio_np = audio_tensor.transpose(0, 1).numpy()  # -> [samples, channels]
+                sf.write(str(output_path), audio_np, sample_rate, format=format.upper())
+                logger.debug(f"[AudioSaver] Fallback soundfile Saved audio to {output_path} ({format}, {sample_rate}Hz)")
+                return str(output_path)
+            except Exception as e:
+                logger.error(f"[AudioSaver] Failed to save audio: {e}")
+                raise
+    def convert_audio(
+        self,
+        input_path: Union[str, Path],
+        output_path: Union[str, Path],
+        output_format: str,
+        remove_input: bool = False,
+    ) -> str:
+        """
+        Convert audio format
+        Args:
+            input_path: Input audio file path
+            output_path: Output audio file path
+            output_format: Target format ('flac', 'wav', 'mp3')
+            remove_input: Whether to delete input file
+        Returns:
+            Output file path
+        """
+        input_path = Path(input_path)
+        output_path = Path(output_path)
+        if not input_path.exists():
+            raise FileNotFoundError(f"Input file not found: {input_path}")
+        # Load audio
+        audio_tensor, sample_rate = torchaudio.load(str(input_path))
+        # Save as new format
+        output_path = self.save_audio(
+            audio_tensor,
+            output_path,
+            sample_rate=sample_rate,
+            format=output_format,
+            channels_first=True
+        )
+        # Delete input file if needed
+        if remove_input:
+            input_path.unlink()
+            logger.debug(f"[AudioSaver] Removed input file: {input_path}")
+        return output_path
+    def save_batch(
+        self,
+        audio_batch: Union[List[torch.Tensor], torch.Tensor],
+        output_dir: Union[str, Path],
+        file_prefix: str = "audio",
+        sample_rate: int = 48000,
+        format: Optional[str] = None,
+        channels_first: bool = True,
+    ) -> List[str]:
+        """
+        Save audio batch
+        Args:
+            audio_batch: Audio batch, List[tensor] or tensor [batch, channels, samples]
+            output_dir: Output directory
+            file_prefix: File prefix
+            sample_rate: Sample rate
+            format: Audio format
+            channels_first: Tensor format flag
+        Returns:
+            List of saved file paths
+        """
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        # Process batch
+        if isinstance(audio_batch, torch.Tensor) and audio_batch.dim() == 3:
+            # [batch, channels, samples]
+            audio_list = [audio_batch[i] for i in range(audio_batch.shape[0])]
+        elif isinstance(audio_batch, list):
+            audio_list = audio_batch
+        else:
+            audio_list = [audio_batch]
+        saved_paths = []
+        for i, audio in enumerate(audio_list):
+            output_path = output_dir / f"{file_prefix}_{i:04d}"
+            saved_path = self.save_audio(
+                audio,
+                output_path,
+                sample_rate=sample_rate,
+                format=format,
+                channels_first=channels_first
+            )
+            saved_paths.append(saved_path)
+        return saved_paths
+def get_audio_file_hash(audio_file) -> str:
+    """
+    Get hash identifier for an audio file.
+    Args:
+        audio_file: Path to audio file (str) or file-like object
+    Returns:
+        Hash string or empty string
+    """
+    if audio_file is None:
+        return ""
+    try:
+        if isinstance(audio_file, str):
+            if os.path.exists(audio_file):
+                with open(audio_file, 'rb') as f:
+                    return hashlib.md5(f.read()).hexdigest()
+            return hashlib.md5(audio_file.encode('utf-8')).hexdigest()
+        elif hasattr(audio_file, 'name'):
+            return hashlib.md5(str(audio_file.name).encode('utf-8')).hexdigest()
+        return hashlib.md5(str(audio_file).encode('utf-8')).hexdigest()
+    except Exception:
+        return hashlib.md5(str(audio_file).encode('utf-8')).hexdigest()
+def generate_uuid_from_params(params_dict) -> str:
+    """
+    Generate deterministic UUID from generation parameters.
+    Same parameters will always generate the same UUID.
+    Args:
+        params_dict: Dictionary of parameters
+    Returns:
+        UUID string
+    """
+    params_json = json.dumps(params_dict, sort_keys=True, ensure_ascii=False)
+    hash_obj = hashlib.sha256(params_json.encode('utf-8'))
+    hash_hex = hash_obj.hexdigest()
+    uuid_str = f"{hash_hex[0:8]}-{hash_hex[8:12]}-{hash_hex[12:16]}-{hash_hex[16:20]}-{hash_hex[20:32]}"
+    return uuid_str
+def generate_uuid_from_audio_data(
+    audio_data: Union[torch.Tensor, np.ndarray],
+    seed: Optional[int] = None
+) -> str:
+    """
+    Generate UUID from audio data (for caching/deduplication)
+    Args:
+        audio_data: Audio data
+        seed: Optional seed value
+    Returns:
+        UUID string
+    """
+    if isinstance(audio_data, torch.Tensor):
+        # Convert to numpy and calculate hash
+        audio_np = audio_data.cpu().numpy()
+    else:
+        audio_np = audio_data
+    # Calculate data hash
+    data_hash = hashlib.md5(audio_np.tobytes()).hexdigest()
+    if seed is not None:
+        combined = f"{data_hash}_{seed}"
+        return hashlib.md5(combined.encode()).hexdigest()
+    return data_hash
+# Global default instance
+_default_saver = AudioSaver(default_format="flac")
+def save_audio(
+    audio_data: Union[torch.Tensor, np.ndarray],
+    output_path: Union[str, Path],
+    sample_rate: int = 48000,
+    format: Optional[str] = None,
+    channels_first: bool = True,
+) -> str:
+    """
+    Convenience function: save audio (using default configuration)
+    Args:
+        audio_data: Audio data
+        output_path: Output path
+        sample_rate: Sample rate
+        format: Format (default flac)
+        channels_first: Tensor format flag
+    Returns:
+        Saved file path
+    """
+    return _default_saver.save_audio(
+        audio_data, output_path, sample_rate, format, channels_first
+    )

code/acestep/constants.py ADDED Viewed

	@@ -0,0 +1,109 @@

+"""
+Constants for ACE-Step
+Centralized constants used across the codebase
+"""
+# ==============================================================================
+# Language Constants
+# ==============================================================================
+VALID_LANGUAGES = [
+    'ar', 'az', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en',
+    'es', 'fa', 'fi', 'fr', 'he', 'hi', 'hr', 'ht', 'hu', 'id',
+    'is', 'it', 'ja', 'ko', 'la', 'lt', 'ms', 'ne', 'nl', 'no',
+    'pa', 'pl', 'pt', 'ro', 'ru', 'sa', 'sk', 'sr', 'sv', 'sw',
+    'ta', 'te', 'th', 'tl', 'tr', 'uk', 'ur', 'vi', 'yue', 'zh',
+    'unknown'
+]
+# ==============================================================================
+# Keyscale Constants
+# ==============================================================================
+KEYSCALE_NOTES = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
+KEYSCALE_ACCIDENTALS = ['', '#', 'b', '♯', '♭']  # empty + ASCII sharp/flat + Unicode sharp/flat
+KEYSCALE_MODES = ['major', 'minor']
+# Generate all valid keyscales: 7 notes × 5 accidentals × 2 modes = 70 combinations
+VALID_KEYSCALES = set()
+for note in KEYSCALE_NOTES:
+    for acc in KEYSCALE_ACCIDENTALS:
+        for mode in KEYSCALE_MODES:
+            VALID_KEYSCALES.add(f"{note}{acc} {mode}")
+# ==============================================================================
+# Metadata Range Constants
+# ==============================================================================
+# BPM (Beats Per Minute) range
+BPM_MIN = 30
+BPM_MAX = 300
+# Duration range (in seconds)
+DURATION_MIN = 10
+DURATION_MAX = 600
+# Valid time signatures
+VALID_TIME_SIGNATURES = [2, 3, 4, 6]
+# ==============================================================================
+# Task Type Constants
+# ==============================================================================
+TASK_TYPES = ["text2music", "repaint", "cover", "extract", "lego", "complete"]
+# Task types available for turbo models (subset)
+TASK_TYPES_TURBO = ["text2music", "repaint", "cover"]
+# Task types available for base models (full set)
+TASK_TYPES_BASE = ["text2music", "repaint", "cover", "extract", "lego", "complete"]
+# ==============================================================================
+# Instruction Constants
+# ==============================================================================
+# Default instructions
+DEFAULT_DIT_INSTRUCTION = "Fill the audio semantic mask based on the given conditions:"
+DEFAULT_LM_INSTRUCTION = "Generate audio semantic tokens based on the given conditions:"
+DEFAULT_LM_UNDERSTAND_INSTRUCTION = "Understand the given musical conditions and describe the audio semantics accordingly:"
+DEFAULT_LM_INSPIRED_INSTRUCTION = "Expand the user's input into a more detailed and specific musical description:"
+DEFAULT_LM_REWRITE_INSTRUCTION = "Format the user's input into a more detailed and specific musical description:"
+# Instruction templates for each task type
+# Note: Some instructions use placeholders like {TRACK_NAME} or {TRACK_CLASSES}
+# These should be formatted using .format() or f-strings when used
+TASK_INSTRUCTIONS = {
+    "text2music": "Fill the audio semantic mask based on the given conditions:",
+    "repaint": "Repaint the mask area based on the given conditions:",
+    "cover": "Generate audio semantic tokens based on the given conditions:",
+    "extract": "Extract the {TRACK_NAME} track from the audio:",
+    "extract_default": "Extract the track from the audio:",
+    "lego": "Generate the {TRACK_NAME} track based on the audio context:",
+    "lego_default": "Generate the track based on the audio context:",
+    "complete": "Complete the input track with {TRACK_CLASSES}:",
+    "complete_default": "Complete the input track:",
+}
+# ==============================================================================
+# Track/Instrument Constants
+# ==============================================================================
+TRACK_NAMES = [
+    "woodwinds", "brass", "fx", "synth", "strings", "percussion",
+    "keyboard", "guitar", "bass", "drums", "backing_vocals", "vocals"
+]
+SFT_GEN_PROMPT = """# Instruction
+{}
+# Caption
+{}
+# Metas
+{}<|endoftext|>
+"""

code/acestep/constrained_logits_processor.py ADDED Viewed

The diff for this file is too large to render. See raw diff

code/acestep/dataset_handler.py ADDED Viewed

	@@ -0,0 +1,37 @@

+"""
+Dataset Handler
+Handles dataset import and exploration functionality
+"""
+from typing import Optional, Tuple, Any, Dict
+class DatasetHandler:
+    """Dataset Handler for Dataset Explorer functionality"""
+    def __init__(self):
+        """Initialize dataset handler"""
+        self.dataset = None
+        self.dataset_imported = False
+    def import_dataset(self, dataset_type: str) -> str:
+        """
+        Import dataset (temporarily disabled)
+        Args:
+            dataset_type: Type of dataset to import (e.g., "train", "test")
+        Returns:
+            Status message string
+        """
+        self.dataset_imported = False
+        return f"⚠️ Dataset import is currently disabled. Text2MusicDataset dependency not available."
+    def get_item_data(self, *args, **kwargs) -> Tuple:
+        """
+        Get dataset item (temporarily disabled)
+        Returns:
+            Tuple of placeholder values matching the expected return format
+        """
+        return "", "", "", "", "", None, None, None, "❌ Dataset not available", "", 0, "", None, None, None, {}, "text2music"

code/acestep/dit_alignment_score.py ADDED Viewed

	@@ -0,0 +1,870 @@

+"""
+DiT Alignment Score Module
+This module provides lyrics-to-audio alignment using cross-attention matrices
+from DiT model for generating LRC timestamps.
+Refactored from lyrics_alignment_infos.py for integration with ACE-Step.
+"""
+import numba
+import torch
+import numpy as np
+import torch.nn.functional as F
+from dataclasses import dataclass, asdict
+from typing import List, Dict, Any, Optional, Tuple, Union
+# ================= Data Classes =================
+@dataclass
+class TokenTimestamp:
+    """Stores per-token timing information."""
+    token_id: int  # presumably the tokenizer's ID for this token — confirm against caller
+    text: str  # text associated with the token
+    start: float  # start time of the token (unit presumably seconds — TODO confirm)
+    end: float  # end time of the token (same unit as start)
+    probability: float  # score attached to the token; how it is computed is not shown here
+@dataclass
+class SentenceTimestamp:
+    """Stores per-sentence timing information with token list."""
+    text: str  # text of the sentence
+    start: float  # start time of the sentence (unit presumably seconds — TODO confirm)
+    end: float  # end time of the sentence (same unit as start)
+    tokens: List[TokenTimestamp]  # per-token timestamps belonging to this sentence
+    confidence: float  # sentence-level score; how it is computed is not shown here
+# ================= DTW Algorithm (Numba Optimized) =================
+@numba.jit(nopython=True)
+def dtw_cpu(x: np.ndarray):
+    """
+    Dynamic Time Warping algorithm optimized with Numba.
+    Fills an accumulated-cost table and a step-direction table over the cost
+    matrix, then delegates path recovery to ``_backtrace``.
+    Args:
+        x: Cost matrix of shape [N, M]
+    Returns:
+        The array produced by ``_backtrace``: shape (2, path_len), row 0 holding
+        the row (text) indices and row 1 the column (time) indices, so it can be
+        unpacked as (text_indices, time_indices)
+    """
+    N, M = x.shape
+    # Use float32 for memory efficiency
+    # Both tables carry one extra leading row/column for the start state.
+    cost = np.ones((N + 1, M + 1), dtype=np.float32) * np.inf
+    trace = -np.ones((N + 1, M + 1), dtype=np.float32)
+    cost[0, 0] = 0
+    for j in range(1, M + 1):
+        for i in range(1, N + 1):
+            c0 = cost[i - 1, j - 1]  # diagonal predecessor
+            c1 = cost[i - 1, j]  # predecessor in the previous row
+            c2 = cost[i, j - 1]  # predecessor in the previous column
+            # Pick the strictly smallest predecessor; any tie falls through to
+            # the last branch (step code 2).
+            if c0 < c1 and c0 < c2:
+                c, t = c0, 0
+            elif c1 < c0 and c1 < c2:
+                c, t = c1, 1
+            else:
+                c, t = c2, 2
+            cost[i, j] = x[i - 1, j - 1] + c
+            trace[i, j] = t  # step code consumed by _backtrace
+    return _backtrace(trace, N, M)
+@numba.jit(nopython=True)
+def _backtrace(trace: np.ndarray, N: int, M: int):
+    """
+    Optimized backtrace function for DTW.
+    Args:
+        trace: Trace matrix of shape (N+1, M+1)
+        N, M: Original matrix dimensions
+    Returns:
+        Path array of shape (2, path_len) - first row is text indices, second is time indices
+    """
+    # Boundary handling
+    trace[0, :] = 2
+    trace[:, 0] = 1
+    # Pre-allocate array, max path length is N+M
+    max_path_len = N + M
+    path = np.zeros((2, max_path_len), dtype=np.int32)
+    i, j = N, M
+    path_idx = max_path_len - 1
+    while i > 0 or j > 0:
+        path[0, path_idx] = i - 1  # text index
+        path[1, path_idx] = j - 1  # time index
+        path_idx -= 1
+        t = trace[i, j]
+        if t == 0:
+            i -= 1
+            j -= 1
+        elif t == 1:
+            i -= 1
+        elif t == 2:
+            j -= 1
+        else:
+            break
+    actual_len = max_path_len - path_idx - 1
+    return path[:, path_idx + 1:max_path_len]
+# ================= Utility Functions =================
+def median_filter(x: torch.Tensor, filter_width: int) -> torch.Tensor:
+    """
+    Apply median filter to tensor.
+    Args:
+        x: Input tensor
+        filter_width: Width of median filter
+    Returns:
+        Filtered tensor
+    """
+    pad_width = filter_width // 2
+    if x.shape[-1] <= pad_width:
+        return x
+    if x.ndim == 2:
+        x = x[None, :]
+    x = F.pad(x, (filter_width // 2, filter_width // 2, 0, 0), mode="reflect")
+    result = x.unfold(-1, filter_width, 1).sort()[0][..., filter_width // 2]
+    if result.ndim > 2:
+        result = result.squeeze(0)
+    return result
+# ================= Main Aligner Class =================
+class MusicStampsAligner:
+    """
+    Aligner class for generating lyrics timestamps from cross-attention matrices.
+    Uses bidirectional consensus denoising and DTW for alignment.
+    """
+    def __init__(self, tokenizer):
+        """
+        Initialize the aligner.
+        Args:
+            tokenizer: Text tokenizer for decoding tokens
+        """
+        self.tokenizer = tokenizer
+    def _apply_bidirectional_consensus(
+        self,
+        weights_stack: torch.Tensor,
+        violence_level: float,
+        medfilt_width: int
+    ) -> tuple:
+        """
+        Core denoising logic using bidirectional consensus.
+        Args:
+            weights_stack: Attention weights [Heads, Tokens, Frames]
+            violence_level: Denoising strength coefficient
+            medfilt_width: Median filter width
+        Returns:
+            Tuple of (calc_matrix, energy_matrix) as numpy arrays
+        """
+        # A. Bidirectional Consensus
+        row_prob = F.softmax(weights_stack, dim=-1)  # Token -> Frame
+        col_prob = F.softmax(weights_stack, dim=-2)  # Frame -> Token
+        processed = row_prob * col_prob
+        # 1. Row suppression (kill horizontal crossing lines)
+        row_medians = torch.quantile(processed, 0.5, dim=-1, keepdim=True)
+        processed = processed - (violence_level * row_medians)
+        processed = torch.relu(processed)
+        # 2. Column suppression (kill vertical crossing lines)
+        col_medians = torch.quantile(processed, 0.5, dim=-2, keepdim=True)
+        processed = processed - (violence_level * col_medians)
+        processed = torch.relu(processed)
+        # C. Power sharpening
+        processed = processed ** 2
+        # Energy matrix for confidence
+        energy_matrix = processed.mean(dim=0).cpu().numpy()
+        # D. Z-Score normalization
+        std, mean = torch.std_mean(processed, unbiased=False)
+        weights_processed = (processed - mean) / (std + 1e-9)
+        # E. Median filtering
+        weights_processed = median_filter(weights_processed, filter_width=medfilt_width)
+        calc_matrix = weights_processed.mean(dim=0).numpy()
+        return calc_matrix, energy_matrix
+    def _preprocess_attention(
+        self,
+        attention_matrix: torch.Tensor,
+        custom_config: Dict[int, List[int]],
+        violence_level: float,
+        medfilt_width: int = 7
+    ) -> tuple:
+        """
+        Preprocess attention matrix for alignment.
+        Args:
+            attention_matrix: Attention tensor [Layers, Heads, Tokens, Frames]
+            custom_config: Dict mapping layer indices to head indices
+            violence_level: Denoising strength
+            medfilt_width: Median filter width
+        Returns:
+            Tuple of (calc_matrix, energy_matrix, visual_matrix)
+        """
+        if not isinstance(attention_matrix, torch.Tensor):
+            weights = torch.tensor(attention_matrix)
+        else:
+            weights = attention_matrix.clone()
+        weights = weights.cpu().float()
+        selected_tensors = []
+        for layer_idx, head_indices in custom_config.items():
+            for head_idx in head_indices:
+                if layer_idx < weights.shape[0] and head_idx < weights.shape[1]:
+                    head_matrix = weights[layer_idx, head_idx]
+                    selected_tensors.append(head_matrix)
+        if not selected_tensors:
+            return None, None, None
+        # Stack selected heads: [Heads, Tokens, Frames]
+        weights_stack = torch.stack(selected_tensors, dim=0)
+        visual_matrix = weights_stack.mean(dim=0).numpy()
+        calc_matrix, energy_matrix = self._apply_bidirectional_consensus(
+            weights_stack, violence_level, medfilt_width
+        )
+        return calc_matrix, energy_matrix, visual_matrix
+    def stamps_align_info(
+        self,
+        attention_matrix: torch.Tensor,
+        lyrics_tokens: List[int],
+        total_duration_seconds: float,
+        custom_config: Dict[int, List[int]],
+        return_matrices: bool = False,
+        violence_level: float = 2.0,
+        medfilt_width: int = 1
+    ) -> Dict[str, Any]:
+        """
+        Get alignment information from attention matrix.
+        Args:
+            attention_matrix: Cross-attention tensor [Layers, Heads, Tokens, Frames]
+            lyrics_tokens: List of lyrics token IDs
+            total_duration_seconds: Total audio duration in seconds
+            custom_config: Dict mapping layer indices to head indices
+            return_matrices: Whether to return intermediate matrices
+            violence_level: Denoising strength
+            medfilt_width: Median filter width
+        Returns:
+            Dict containing calc_matrix, lyrics_tokens, total_duration_seconds,
+            and optionally energy_matrix and vis_matrix
+        """
+        calc_matrix, energy_matrix, visual_matrix = self._preprocess_attention(
+            attention_matrix, custom_config, violence_level, medfilt_width
+        )
+        if calc_matrix is None:
+            return {
+                "calc_matrix": None,
+                "lyrics_tokens": lyrics_tokens,
+                "total_duration_seconds": total_duration_seconds,
+                "error": "No valid attention heads found"
+            }
+        return_dict = {
+            "calc_matrix": calc_matrix,
+            "lyrics_tokens": lyrics_tokens,
+            "total_duration_seconds": total_duration_seconds
+        }
+        if return_matrices:
+            return_dict['energy_matrix'] = energy_matrix
+            return_dict['vis_matrix'] = visual_matrix
+        return return_dict
+    def _decode_tokens_incrementally(self, token_ids: List[int]) -> List[str]:
+        """
+        Decode tokens incrementally to properly handle multi-byte UTF-8 characters.
+        For Chinese and other multi-byte characters, the tokenizer may split them
+        into multiple byte-level tokens. Decoding each token individually produces
+        invalid UTF-8 sequences (showing as �). This method uses byte-level comparison
+        to correctly track which characters each token contributes.
+        Args:
+            token_ids: List of token IDs
+        Returns:
+            List of decoded text for each token position
+        """
+        decoded_tokens = []
+        prev_bytes = b""
+        for i in range(len(token_ids)):
+            # Decode tokens from start to current position
+            current_text = self.tokenizer.decode(token_ids[:i+1], skip_special_tokens=False)
+            current_bytes = current_text.encode('utf-8', errors='surrogatepass')
+            # The contribution of current token is the new bytes added
+            if len(current_bytes) >= len(prev_bytes):
+                new_bytes = current_bytes[len(prev_bytes):]
+                # Try to decode the new bytes; if incomplete, use empty string
+                try:
+                    token_text = new_bytes.decode('utf-8')
+                except UnicodeDecodeError:
+                    # Incomplete UTF-8 sequence, this token doesn't complete a character
+                    token_text = ""
+            else:
+                # Edge case: current decode is shorter (shouldn't happen normally)
+                token_text = ""
+            decoded_tokens.append(token_text)
+            prev_bytes = current_bytes
+        return decoded_tokens
+    def token_timestamps(
+        self,
+        calc_matrix: np.ndarray,
+        lyrics_tokens: List[int],
+        total_duration_seconds: float
+    ) -> List[TokenTimestamp]:
+        """
+        Generate per-token timestamps using DTW.
+        Args:
+            calc_matrix: Processed attention matrix [Tokens, Frames]
+            lyrics_tokens: List of token IDs
+            total_duration_seconds: Total audio duration
+        Returns:
+            List of TokenTimestamp objects
+        """
+        n_frames = calc_matrix.shape[-1]
+        text_indices, time_indices = dtw_cpu(-calc_matrix.astype(np.float64))
+        seconds_per_frame = total_duration_seconds / n_frames
+        alignment_results = []
+        # Use incremental decoding to properly handle multi-byte UTF-8 characters
+        decoded_tokens = self._decode_tokens_incrementally(lyrics_tokens)
+        for i in range(len(lyrics_tokens)):
+            mask = (text_indices == i)
+            if not np.any(mask):
+                start = alignment_results[-1].end if alignment_results else 0.0
+                end = start
+                token_conf = 0.0
+            else:
+                times = time_indices[mask] * seconds_per_frame
+                start = times[0]
+                end = times[-1]
+                token_conf = 0.0
+            if end < start:
+                end = start
+            alignment_results.append(TokenTimestamp(
+                token_id=lyrics_tokens[i],
+                text=decoded_tokens[i],
+                start=float(start),
+                end=float(end),
+                probability=token_conf
+            ))
+        return alignment_results
+    def _decode_sentence_from_tokens(self, tokens: List[TokenTimestamp]) -> str:
+        """
+        Decode a sentence by decoding all token IDs together.
+        This avoids UTF-8 encoding issues from joining individual token texts.
+        Args:
+            tokens: List of TokenTimestamp objects
+        Returns:
+            Properly decoded sentence text
+        """
+        token_ids = [t.token_id for t in tokens]
+        return self.tokenizer.decode(token_ids, skip_special_tokens=False)
+    def sentence_timestamps(
+        self,
+        token_alignment: List[TokenTimestamp]
+    ) -> List[SentenceTimestamp]:
+        """
+        Group token timestamps into sentence timestamps.
+        Args:
+            token_alignment: List of TokenTimestamp objects
+        Returns:
+            List of SentenceTimestamp objects
+        """
+        results = []
+        current_tokens = []
+        for token in token_alignment:
+            current_tokens.append(token)
+            if '\n' in token.text:
+                # Decode all token IDs together to avoid UTF-8 issues
+                full_text = self._decode_sentence_from_tokens(current_tokens)
+                if full_text.strip():
+                    valid_scores = [t.probability for t in current_tokens if t.probability > 0]
+                    sent_conf = sum(valid_scores) / len(valid_scores) if valid_scores else 0.0
+                    results.append(SentenceTimestamp(
+                        text=full_text.strip(),
+                        start=round(current_tokens[0].start, 3),
+                        end=round(current_tokens[-1].end, 3),
+                        tokens=list(current_tokens),
+                        confidence=sent_conf
+                    ))
+                current_tokens = []
+        # Handle last sentence
+        if current_tokens:
+            # Decode all token IDs together to avoid UTF-8 issues
+            full_text = self._decode_sentence_from_tokens(current_tokens)
+            if full_text.strip():
+                valid_scores = [t.probability for t in current_tokens if t.probability > 0]
+                sent_conf = sum(valid_scores) / len(valid_scores) if valid_scores else 0.0
+                results.append(SentenceTimestamp(
+                    text=full_text.strip(),
+                    start=round(current_tokens[0].start, 3),
+                    end=round(current_tokens[-1].end, 3),
+                    tokens=list(current_tokens),
+                    confidence=sent_conf
+                ))
+        # Normalize confidence scores
+        if results:
+            all_scores = [s.confidence for s in results]
+            min_score = min(all_scores)
+            max_score = max(all_scores)
+            score_range = max_score - min_score
+            if score_range > 1e-9:
+                for s in results:
+                    normalized_score = (s.confidence - min_score) / score_range
+                    s.confidence = round(normalized_score, 2)
+            else:
+                for s in results:
+                    s.confidence = round(s.confidence, 2)
+        return results
+    def format_lrc(
+        self,
+        sentence_timestamps: List[SentenceTimestamp],
+        include_end_time: bool = False
+    ) -> str:
+        """
+        Format sentence timestamps as LRC lyrics format.
+        Args:
+            sentence_timestamps: List of SentenceTimestamp objects
+            include_end_time: Whether to include end time (enhanced LRC format)
+        Returns:
+            LRC formatted string
+        """
+        lines = []
+        for sentence in sentence_timestamps:
+            # Convert seconds to mm:ss.xx format
+            start_minutes = int(sentence.start // 60)
+            start_seconds = sentence.start % 60
+            if include_end_time:
+                end_minutes = int(sentence.end // 60)
+                end_seconds = sentence.end % 60
+                timestamp = f"[{start_minutes:02d}:{start_seconds:05.2f}][{end_minutes:02d}:{end_seconds:05.2f}]"
+            else:
+                timestamp = f"[{start_minutes:02d}:{start_seconds:05.2f}]"
+            # Clean the text (remove structural tags like [verse], [chorus])
+            text = sentence.text
+            lines.append(f"{timestamp}{text}")
+        return "\n".join(lines)
+    def get_timestamps_and_lrc(
+        self,
+        calc_matrix: np.ndarray,
+        lyrics_tokens: List[int],
+        total_duration_seconds: float
+    ) -> Dict[str, Any]:
+        """
+        Convenience method to get both timestamps and LRC in one call.
+        Args:
+            calc_matrix: Processed attention matrix
+            lyrics_tokens: List of token IDs
+            total_duration_seconds: Total audio duration
+        Returns:
+            Dict containing token_timestamps, sentence_timestamps, and lrc_text
+        """
+        token_stamps = self.token_timestamps(
+            calc_matrix=calc_matrix,
+            lyrics_tokens=lyrics_tokens,
+            total_duration_seconds=total_duration_seconds
+        )
+        sentence_stamps = self.sentence_timestamps(token_stamps)
+        lrc_text = self.format_lrc(sentence_stamps)
+        return {
+            "token_timestamps": token_stamps,
+            "sentence_timestamps": sentence_stamps,
+            "lrc_text": lrc_text
+        }
+class MusicLyricScorer:
+    """
+    Scorer class for evaluating lyrics-to-audio alignment quality.
+    Focuses on calculating alignment quality metrics (Coverage, Monotonicity, Confidence)
+    using tensor operations for potential differentiability or GPU acceleration.
+    """
+    def __init__(self, tokenizer: Any):
+        """
+        Initialize the aligner.
+        Args:
+            tokenizer: Tokenizer instance (must implement .decode()).
+        """
+        self.tokenizer = tokenizer
+    def _generate_token_type_mask(self, token_ids: List[int]) -> np.ndarray:
+        """
+        Generate a mask distinguishing lyrics (1) from structural tags (0).
+        Uses self.tokenizer to decode tokens.
+        Args:
+            token_ids: List of token IDs.
+        Returns:
+            Numpy array of shape [len(token_ids)] with 1 or 0.
+        """
+        decoded_tokens = [self.tokenizer.decode([tid]) for tid in token_ids]
+        mask = np.ones(len(token_ids), dtype=np.int32)
+        in_bracket = False
+        for i, token_str in enumerate(decoded_tokens):
+            if '[' in token_str:
+                in_bracket = True
+            if in_bracket:
+                mask[i] = 0
+            if ']' in token_str:
+                in_bracket = False
+                mask[i] = 0
+        return mask
+    def _preprocess_attention(
+            self,
+            attention_matrix: Union[torch.Tensor, np.ndarray],
+            custom_config: Dict[int, List[int]],
+            medfilt_width: int = 1
+    ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[torch.Tensor]]:
+        """
+        Extracts and normalizes the attention matrix.
+        Logic V4: Uses Min-Max normalization to highlight energy differences.
+        Args:
+            attention_matrix: Raw attention tensor [Layers, Heads, Tokens, Frames].
+            custom_config: Config mapping layers to heads.
+            medfilt_width: Width for median filtering.
+        Returns:
+            Tuple of (calc_matrix, energy_matrix, avg_weights_tensor).
+        """
+        # 1. Prepare Tensor
+        if not isinstance(attention_matrix, torch.Tensor):
+            weights = torch.tensor(attention_matrix)
+        else:
+            weights = attention_matrix.clone()
+        weights = weights.cpu().float()
+        # 2. Select Heads based on config
+        selected_tensors = []
+        for layer_idx, head_indices in custom_config.items():
+            for head_idx in head_indices:
+                if layer_idx < weights.shape[0] and head_idx < weights.shape[1]:
+                    selected_tensors.append(weights[layer_idx, head_idx])
+        if not selected_tensors:
+            return None, None, None
+        weights_stack = torch.stack(selected_tensors, dim=0)
+        # 3. Average Heads
+        avg_weights = weights_stack.mean(dim=0)  # [Tokens, Frames]
+        # 4. Preprocessing Logic
+        # Min-Max normalization preserving energy distribution
+        # Median filter is applied to the energy matrix
+        energy_tensor = median_filter(avg_weights, filter_width=medfilt_width)
+        energy_matrix = energy_tensor.numpy()
+        e_min, e_max = energy_matrix.min(), energy_matrix.max()
+        if e_max - e_min > 1e-9:
+            energy_matrix = (energy_matrix - e_min) / (e_max - e_min)
+        else:
+            energy_matrix = np.zeros_like(energy_matrix)
+        # Contrast enhancement for DTW pathfinding
+        # calc_matrix is used for pathfinding, energy_matrix for scoring
+        calc_matrix = energy_matrix ** 2
+        return calc_matrix, energy_matrix, avg_weights
+    def _compute_alignment_metrics(
+            self,
+            energy_matrix: torch.Tensor,
+            path_coords: torch.Tensor,
+            type_mask: torch.Tensor,
+            time_weight: float = 0.01,
+            overlap_frames: float = 9.0,
+            instrumental_weight: float = 1.0
+    ) -> Tuple[float, float, float]:
+        """
+        Core metric calculation logic using high-precision Tensor operations.
+        Args:
+            energy_matrix: Normalized energy [Rows, Cols].
+            path_coords: DTW path coordinates [Steps, 2].
+            type_mask: Token type mask [Rows] (1=Lyrics, 0=Tags).
+            time_weight: Minimum energy threshold for monotonicity.
+            overlap_frames: Allowed overlap for monotonicity check.
+            instrumental_weight: Weight for non-lyric tokens in confidence calc.
+        Returns:
+            Tuple of (coverage, monotonicity, confidence).
+        """
+        # Ensure high precision for internal calculation
+        energy_matrix = energy_matrix.to(dtype=torch.float64)
+        path_coords = path_coords.long()
+        type_mask = type_mask.long()
+        device = energy_matrix.device
+        rows, cols = energy_matrix.shape
+        is_lyrics_row = (type_mask == 1)
+        # ================= A. Coverage Score =================
+        # Ratio of lyric lines that have significant energy peak
+        row_max_energies = energy_matrix.max(dim=1).values
+        total_sung_rows = is_lyrics_row.sum().double()
+        coverage_threshold = 0.1
+        valid_sung_mask = is_lyrics_row & (row_max_energies > coverage_threshold)
+        valid_sung_rows = valid_sung_mask.sum().double()
+        if total_sung_rows > 0:
+            coverage_score = valid_sung_rows / total_sung_rows
+        else:
+            coverage_score = torch.tensor(1.0, device=device, dtype=torch.float64)
+        # ================= B. Monotonicity Score =================
+        # Check if the "center of mass" of lyric lines moves forward in time
+        col_indices = torch.arange(cols, device=device, dtype=torch.float64)
+        # Zero out low energy noise
+        weights = torch.where(
+            energy_matrix > time_weight,
+            energy_matrix,
+            torch.zeros_like(energy_matrix)
+        )
+        sum_w = weights.sum(dim=1)
+        sum_t = (weights * col_indices).sum(dim=1)
+        # Calculate centroids
+        centroids = torch.full((rows,), -1.0, device=device, dtype=torch.float64)
+        valid_w_mask = sum_w > 1e-9
+        centroids[valid_w_mask] = sum_t[valid_w_mask] / sum_w[valid_w_mask]
+        # Extract sequence of valid lyrics centroids
+        valid_sequence_mask = is_lyrics_row & (centroids >= 0)
+        sung_centroids = centroids[valid_sequence_mask]
+        cnt = sung_centroids.shape[0]
+        if cnt > 1:
+            curr_c = sung_centroids[:-1]
+            next_c = sung_centroids[1:]
+            # Check non-decreasing order with overlap tolerance
+            non_decreasing = (next_c >= (curr_c - overlap_frames)).double().sum()
+            pairs = torch.tensor(cnt - 1, device=device, dtype=torch.float64)
+            monotonicity_score = non_decreasing / pairs
+        else:
+            monotonicity_score = torch.tensor(1.0, device=device, dtype=torch.float64)
+        # ================= C. Path Confidence =================
+        # Average energy along the optimal path
+        if path_coords.shape[0] > 0:
+            p_rows = path_coords[:, 0]
+            p_cols = path_coords[:, 1]
+            path_energies = energy_matrix[p_rows, p_cols]
+            step_weights = torch.ones_like(path_energies)
+            # Lower weight for instrumental/tag steps
+            is_inst_step = (type_mask[p_rows] == 0)
+            step_weights[is_inst_step] = instrumental_weight
+            total_energy = (path_energies * step_weights).sum()
+            total_steps = step_weights.sum()
+            if total_steps > 0:
+                path_confidence = total_energy / total_steps
+            else:
+                path_confidence = torch.tensor(0.0, device=device, dtype=torch.float64)
+        else:
+            path_confidence = torch.tensor(0.0, device=device, dtype=torch.float64)
+        return coverage_score.item(), monotonicity_score.item(), path_confidence.item()
+    def lyrics_alignment_info(
+            self,
+            attention_matrix: Union[torch.Tensor, np.ndarray],
+            token_ids: List[int],
+            custom_config: Dict[int, List[int]],
+            return_matrices: bool = False,
+            medfilt_width: int = 1
+    ) -> Dict[str, Any]:
+        """
+        Generates alignment path and processed matrices.
+        Args:
+            attention_matrix: Input attention tensor.
+            token_ids: Corresponding token IDs.
+            custom_config: Layer/Head configuration.
+            return_matrices: If True, returns matrices in the output.
+            medfilt_width: Median filter width.
+        Returns:
+            Dict or AlignmentInfo object containing path and masks.
+        """
+        calc_matrix, energy_matrix, vis_matrix = self._preprocess_attention(
+            attention_matrix, custom_config, medfilt_width
+        )
+        if calc_matrix is None:
+            return {
+                "calc_matrix": None,
+                "error": "No valid attention heads found"
+            }
+        # 1. Generate Semantic Mask (1=Lyrics, 0=Tags)
+        # Uses self.tokenizer internally
+        type_mask = self._generate_token_type_mask(token_ids)
+        # Safety check for shape mismatch
+        if len(type_mask) != energy_matrix.shape[0]:
+            # Fallback to all lyrics if shapes don't align
+            type_mask = np.ones(energy_matrix.shape[0], dtype=np.int32)
+        # 2. DTW Pathfinding
+        # Using negative calc_matrix because DTW minimizes cost
+        text_indices, time_indices = dtw_cpu(-calc_matrix.astype(np.float32))
+        path_coords = np.stack([text_indices, time_indices], axis=1)
+        return_dict = {
+            "path_coords": path_coords,
+            "type_mask": type_mask,
+            "energy_matrix": energy_matrix
+        }
+        if return_matrices:
+            return_dict['calc_matrix'] = calc_matrix
+            return_dict['vis_matrix'] = vis_matrix
+        return return_dict
+    def calculate_score(
+            self,
+            energy_matrix: Union[torch.Tensor, np.ndarray],
+            type_mask: Union[torch.Tensor, np.ndarray],
+            path_coords: Union[torch.Tensor, np.ndarray],
+            time_weight: float = 0.01,
+            overlap_frames: float = 9.0,
+            instrumental_weight: float = 1.0
+    ) -> Dict[str, Any]:
+        """
+        Calculates the final alignment score based on pre-computed components.
+        Args:
+            energy_matrix: Processed energy matrix.
+            type_mask: Token type mask.
+            path_coords: DTW path coordinates.
+            time_weight: Minimum energy threshold for monotonicity.
+            overlap_frames: Allowed backward movement frames.
+            instrumental_weight: Weight for non-lyric path steps.
+        Returns:
+            AlignmentScore object containing individual metrics and final score.
+        """
+        # Ensure Inputs are Tensors on the correct device
+        if not isinstance(energy_matrix, torch.Tensor):
+            energy_matrix = torch.tensor(energy_matrix, device='cuda', dtype=torch.float32)
+        device = energy_matrix.device
+        if not isinstance(type_mask, torch.Tensor):
+            type_mask = torch.tensor(type_mask, device=device, dtype=torch.long)
+        else:
+            type_mask = type_mask.to(device=device, dtype=torch.long)
+        if not isinstance(path_coords, torch.Tensor):
+            path_coords = torch.tensor(path_coords, device=device, dtype=torch.long)
+        else:
+            path_coords = path_coords.to(device=device, dtype=torch.long)
+        # Compute Metrics
+        coverage, monotonicity, confidence = self._compute_alignment_metrics(
+            energy_matrix=energy_matrix,
+            path_coords=path_coords,
+            type_mask=type_mask,
+            time_weight=time_weight,
+            overlap_frames=overlap_frames,
+            instrumental_weight=instrumental_weight
+        )
+        # Final Score Calculation
+        # (Cov^2 * Mono^2 * Conf)
+        final_score = (coverage ** 2) * (monotonicity ** 2) * confidence
+        final_score = float(np.clip(final_score, 0.0, 1.0))
+        return {
+            "lyrics_score": round(final_score, 4)
+        }

code/acestep/genres_vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

code/acestep/gradio_ui/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from acestep.gradio_ui.interfaces import create_gradio_interface

code/acestep/gradio_ui/events/__init__.py ADDED Viewed

	@@ -0,0 +1,1129 @@

+"""
+Gradio UI Event Handlers Module
+Main entry point for setting up all event handlers
+"""
+import gradio as gr
+from typing import Optional
+# Import handler modules
+from . import generation_handlers as gen_h
+from . import results_handlers as res_h
+from . import training_handlers as train_h
+from acestep.gradio_ui.i18n import t
+def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, dataset_section, generation_section, results_section):
+    """Setup event handlers connecting UI components and business logic"""
+    # ========== Dataset Handlers ==========
+    dataset_section["import_dataset_btn"].click(
+        fn=dataset_handler.import_dataset,
+        inputs=[dataset_section["dataset_type"]],
+        outputs=[dataset_section["data_status"]]
+    )
+    # ========== Service Initialization ==========
+    generation_section["refresh_btn"].click(
+        fn=lambda: gen_h.refresh_checkpoints(dit_handler),
+        outputs=[generation_section["checkpoint_dropdown"]]
+    )
+    generation_section["config_path"].change(
+        fn=gen_h.update_model_type_settings,
+        inputs=[generation_section["config_path"]],
+        outputs=[
+            generation_section["inference_steps"],
+            generation_section["guidance_scale"],
+            generation_section["use_adg"],
+            generation_section["shift"],
+            generation_section["cfg_interval_start"],
+            generation_section["cfg_interval_end"],
+            generation_section["task_type"],
+        ]
+    )
+    generation_section["init_btn"].click(
+        fn=lambda *args: gen_h.init_service_wrapper(dit_handler, llm_handler, *args),
+        inputs=[
+            generation_section["checkpoint_dropdown"],
+            generation_section["config_path"],
+            generation_section["device"],
+            generation_section["init_llm_checkbox"],
+            generation_section["lm_model_path"],
+            generation_section["backend_dropdown"],
+            generation_section["use_flash_attention_checkbox"],
+            generation_section["offload_to_cpu_checkbox"],
+            generation_section["offload_dit_to_cpu_checkbox"],
+        ],
+        outputs=[
+            generation_section["init_status"],
+            generation_section["generate_btn"],
+            generation_section["service_config_accordion"],
+            # Model type settings (updated based on actual loaded model)
+            generation_section["inference_steps"],
+            generation_section["guidance_scale"],
+            generation_section["use_adg"],
+            generation_section["shift"],
+            generation_section["cfg_interval_start"],
+            generation_section["cfg_interval_end"],
+            generation_section["task_type"],
+        ]
+    )
+    # ========== LoRA Handlers ==========
+    generation_section["load_lora_btn"].click(
+        fn=dit_handler.load_lora,
+        inputs=[generation_section["lora_path"]],
+        outputs=[generation_section["lora_status"]]
+    ).then(
+        # Update checkbox to enabled state after loading
+        fn=lambda: gr.update(value=True),
+        outputs=[generation_section["use_lora_checkbox"]]
+    )
+    generation_section["unload_lora_btn"].click(
+        fn=dit_handler.unload_lora,
+        outputs=[generation_section["lora_status"]]
+    ).then(
+        # Update checkbox to disabled state after unloading
+        fn=lambda: gr.update(value=False),
+        outputs=[generation_section["use_lora_checkbox"]]
+    )
+    generation_section["use_lora_checkbox"].change(
+        fn=dit_handler.set_use_lora,
+        inputs=[generation_section["use_lora_checkbox"]],
+        outputs=[generation_section["lora_status"]]
+    )
+    # ========== UI Visibility Updates ==========
+    generation_section["init_llm_checkbox"].change(
+        fn=gen_h.update_negative_prompt_visibility,
+        inputs=[generation_section["init_llm_checkbox"]],
+        outputs=[generation_section["lm_negative_prompt"]]
+    )
+    generation_section["init_llm_checkbox"].change(
+        fn=gen_h.update_audio_cover_strength_visibility,
+        inputs=[generation_section["task_type"], generation_section["init_llm_checkbox"]],
+        outputs=[generation_section["audio_cover_strength"]]
+    )
+    generation_section["task_type"].change(
+        fn=gen_h.update_audio_cover_strength_visibility,
+        inputs=[generation_section["task_type"], generation_section["init_llm_checkbox"]],
+        outputs=[generation_section["audio_cover_strength"]]
+    )
+    generation_section["batch_size_input"].change(
+        fn=gen_h.update_audio_components_visibility,
+        inputs=[generation_section["batch_size_input"]],
+        outputs=[
+            results_section["audio_col_1"],
+            results_section["audio_col_2"],
+            results_section["audio_col_3"],
+            results_section["audio_col_4"],
+            results_section["audio_row_5_8"],
+            results_section["audio_col_5"],
+            results_section["audio_col_6"],
+            results_section["audio_col_7"],
+            results_section["audio_col_8"],
+        ]
+    )
+    # ========== Audio Conversion ==========
+    generation_section["convert_src_to_codes_btn"].click(
+        fn=lambda src: gen_h.convert_src_audio_to_codes_wrapper(dit_handler, src),
+        inputs=[generation_section["src_audio"]],
+        outputs=[generation_section["text2music_audio_code_string"]]
+    )
+    # ========== Instruction UI Updates ==========
+    for trigger in [generation_section["task_type"], generation_section["track_name"], generation_section["complete_track_classes"]]:
+        trigger.change(
+            fn=lambda *args: gen_h.update_instruction_ui(dit_handler, *args),
+            inputs=[
+                generation_section["task_type"],
+                generation_section["track_name"],
+                generation_section["complete_track_classes"],
+                generation_section["text2music_audio_code_string"],
+                generation_section["init_llm_checkbox"]
+            ],
+            outputs=[
+                generation_section["instruction_display_gen"],
+                generation_section["track_name"],
+                generation_section["complete_track_classes"],
+                generation_section["audio_cover_strength"],
+                generation_section["repainting_group"],
+                generation_section["text2music_audio_codes_group"],
+            ]
+        )
+    # ========== Sample/Transcribe Handlers ==========
+    # Load random example from ./examples/text2music directory
+    generation_section["sample_btn"].click(
+        fn=lambda task: gen_h.load_random_example(task) + (True,),
+        inputs=[
+            generation_section["task_type"],
+        ],
+        outputs=[
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["think_checkbox"],
+            generation_section["bpm"],
+            generation_section["audio_duration"],
+            generation_section["key_scale"],
+            generation_section["vocal_language"],
+            generation_section["time_signature"],
+            results_section["is_format_caption_state"]
+        ]
+    )
+    generation_section["text2music_audio_code_string"].change(
+        fn=gen_h.update_transcribe_button_text,
+        inputs=[generation_section["text2music_audio_code_string"]],
+        outputs=[generation_section["transcribe_btn"]]
+    )
+    generation_section["transcribe_btn"].click(
+        fn=lambda codes, debug: gen_h.transcribe_audio_codes(llm_handler, codes, debug),
+        inputs=[
+            generation_section["text2music_audio_code_string"],
+            generation_section["constrained_decoding_debug"]
+        ],
+        outputs=[
+            results_section["status_output"],
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["audio_duration"],
+            generation_section["key_scale"],
+            generation_section["vocal_language"],
+            generation_section["time_signature"],
+            results_section["is_format_caption_state"]
+        ]
+    )
+    # ========== Reset Format Caption Flag ==========
+    for trigger in [generation_section["captions"], generation_section["lyrics"], generation_section["bpm"],
+                    generation_section["key_scale"], generation_section["time_signature"],
+                    generation_section["vocal_language"], generation_section["audio_duration"]]:
+        trigger.change(
+            fn=gen_h.reset_format_caption_flag,
+            inputs=[],
+            outputs=[results_section["is_format_caption_state"]]
+        )
+    # ========== Audio Uploads Accordion ==========
+    for trigger in [generation_section["reference_audio"], generation_section["src_audio"]]:
+        trigger.change(
+            fn=gen_h.update_audio_uploads_accordion,
+            inputs=[generation_section["reference_audio"], generation_section["src_audio"]],
+            outputs=[generation_section["audio_uploads_accordion"]]
+        )
+    # ========== Instrumental Checkbox ==========
+    generation_section["instrumental_checkbox"].change(
+        fn=gen_h.handle_instrumental_checkbox,
+        inputs=[generation_section["instrumental_checkbox"], generation_section["lyrics"]],
+        outputs=[generation_section["lyrics"]]
+    )
+    # ========== Format Button ==========
+    # Note: cfg_scale and negative_prompt are not supported in format mode
+    generation_section["format_btn"].click(
+        fn=lambda caption, lyrics, bpm, duration, key_scale, time_sig, temp, top_k, top_p, debug: gen_h.handle_format_sample(
+            llm_handler, caption, lyrics, bpm, duration, key_scale, time_sig, temp, top_k, top_p, debug
+        ),
+        inputs=[
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["audio_duration"],
+            generation_section["key_scale"],
+            generation_section["time_signature"],
+            generation_section["lm_temperature"],
+            generation_section["lm_top_k"],
+            generation_section["lm_top_p"],
+            generation_section["constrained_decoding_debug"],
+        ],
+        outputs=[
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["audio_duration"],
+            generation_section["key_scale"],
+            generation_section["vocal_language"],
+            generation_section["time_signature"],
+            results_section["is_format_caption_state"],
+            results_section["status_output"],
+        ]
+    )
+    # ========== Simple/Custom Mode Toggle ==========
+    generation_section["generation_mode"].change(
+        fn=gen_h.handle_generation_mode_change,
+        inputs=[generation_section["generation_mode"]],
+        outputs=[
+            generation_section["simple_mode_group"],
+            generation_section["caption_accordion"],
+            generation_section["lyrics_accordion"],
+            generation_section["generate_btn"],
+            generation_section["simple_sample_created"],
+            generation_section["optional_params_accordion"],
+        ]
+    )
+    # ========== Simple Mode Instrumental Checkbox ==========
+    # When instrumental is checked, disable vocal language and set to ["unknown"]
+    generation_section["simple_instrumental_checkbox"].change(
+        fn=gen_h.handle_simple_instrumental_change,
+        inputs=[generation_section["simple_instrumental_checkbox"]],
+        outputs=[generation_section["simple_vocal_language"]]
+    )
+    # ========== Random Description Button ==========
+    generation_section["random_desc_btn"].click(
+        fn=gen_h.load_random_simple_description,
+        inputs=[],
+        outputs=[
+            generation_section["simple_query_input"],
+            generation_section["simple_instrumental_checkbox"],
+            generation_section["simple_vocal_language"],
+        ]
+    )
+    # ========== Create Sample Button (Simple Mode) ==========
+    # Note: cfg_scale and negative_prompt are not supported in create_sample mode
+    generation_section["create_sample_btn"].click(
+        fn=lambda query, instrumental, vocal_lang, temp, top_k, top_p, debug: gen_h.handle_create_sample(
+            llm_handler, query, instrumental, vocal_lang, temp, top_k, top_p, debug
+        ),
+        inputs=[
+            generation_section["simple_query_input"],
+            generation_section["simple_instrumental_checkbox"],
+            generation_section["simple_vocal_language"],
+            generation_section["lm_temperature"],
+            generation_section["lm_top_k"],
+            generation_section["lm_top_p"],
+            generation_section["constrained_decoding_debug"],
+        ],
+        outputs=[
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["audio_duration"],
+            generation_section["key_scale"],
+            generation_section["vocal_language"],
+            generation_section["simple_vocal_language"],
+            generation_section["time_signature"],
+            generation_section["instrumental_checkbox"],
+            generation_section["caption_accordion"],
+            generation_section["lyrics_accordion"],
+            generation_section["generate_btn"],
+            generation_section["simple_sample_created"],
+            generation_section["think_checkbox"],
+            results_section["is_format_caption_state"],
+            results_section["status_output"],
+        ]
+    )
+    # ========== Load/Save Metadata ==========
+    generation_section["load_file"].upload(
+        fn=gen_h.load_metadata,
+        inputs=[generation_section["load_file"]],
+        outputs=[
+            generation_section["task_type"],
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["vocal_language"],
+            generation_section["bpm"],
+            generation_section["key_scale"],
+            generation_section["time_signature"],
+            generation_section["audio_duration"],
+            generation_section["batch_size_input"],
+            generation_section["inference_steps"],
+            generation_section["guidance_scale"],
+            generation_section["seed"],
+            generation_section["random_seed_checkbox"],
+            generation_section["use_adg"],
+            generation_section["cfg_interval_start"],
+            generation_section["cfg_interval_end"],
+            generation_section["shift"],
+            generation_section["infer_method"],
+            generation_section["custom_timesteps"],
+            generation_section["audio_format"],
+            generation_section["lm_temperature"],
+            generation_section["lm_cfg_scale"],
+            generation_section["lm_top_k"],
+            generation_section["lm_top_p"],
+            generation_section["lm_negative_prompt"],
+            generation_section["use_cot_metas"],  # Added: use_cot_metas
+            generation_section["use_cot_caption"],
+            generation_section["use_cot_language"],
+            generation_section["audio_cover_strength"],
+            generation_section["think_checkbox"],
+            generation_section["text2music_audio_code_string"],
+            generation_section["repainting_start"],
+            generation_section["repainting_end"],
+            generation_section["track_name"],
+            generation_section["complete_track_classes"],
+            generation_section["instrumental_checkbox"],  # Added: instrumental_checkbox
+            results_section["is_format_caption_state"]
+        ]
+    )
+    # Save buttons for all 8 audio outputs
+    download_existing_js = """(current_audio, batch_files) => {
+    // Debug: print what the input actually is
+    console.log("👉 [Debug] Current Audio Input:", current_audio);
+    // 1. Safety check
+    if (!current_audio) {
+        console.warn("⚠️ No audio selected or audio is empty.");
+        return;
+    }
+    if (!batch_files || !Array.isArray(batch_files)) {
+        console.warn("⚠️ Batch file list is empty/not ready.");
+        return;
+    }
+    // 2. Smartly extract path string
+    let pathString = "";
+    if (typeof current_audio === "string") {
+        // Case A: direct path string received
+        pathString = current_audio;
+    } else if (typeof current_audio === "object") {
+        // Case B: an object is received, try common properties
+        // Gradio file objects usually have path, url, or name
+        pathString = current_audio.path || current_audio.name || current_audio.url || "";
+    }
+    if (!pathString) {
+        console.error("❌ Error: Could not extract a valid path string from input.", current_audio);
+        return;
+    }
+    // 3. Extract Key (UUID)
+    // Path could be /tmp/.../uuid.mp3 or url like /file=.../uuid.mp3
+    let filename = pathString.split(/[\\\\/]/).pop(); // get the filename
+    let key = filename.split('.')[0]; // get UUID without extension
+    console.log(`🔑 Key extracted: ${key}`);
+    // 4. Find matching file(s) in the list
+    let targets = batch_files.filter(f => {
+        // Also extract names from batch_files objects
+        // f usually contains name (backend path) and orig_name (download name)
+        const fPath = f.name || f.path || "";
+        return fPath.includes(key);
+    });
+    if (targets.length === 0) {
+        console.warn("❌ No matching files found in batch list for key:", key);
+        alert("Batch list does not contain this file yet. Please wait for generation to finish.");
+        return;
+    }
+    // 5. Trigger download(s)
+    console.log(`🎯 Found ${targets.length} files to download.`);
+    targets.forEach((f, index) => {
+        setTimeout(() => {
+            const a = document.createElement('a');
+            // Prefer url (frontend-accessible link), otherwise try data
+            a.href = f.url || f.data;
+            a.download = f.orig_name || "download";
+            a.style.display = 'none';
+            document.body.appendChild(a);
+            a.click();
+            document.body.removeChild(a);
+        }, index * 1000); // 300ms interval to avoid browser blocking
+    });
+}
+"""
+    for btn_idx in range(1, 9):
+        results_section[f"save_btn_{btn_idx}"].click(
+            fn=None,
+            inputs=[
+                results_section[f"generated_audio_{btn_idx}"],
+                results_section["generated_audio_batch"],
+            ],
+        js=download_existing_js  # Run the above JS
+    )
+    # ========== Send to SRC Handlers ==========
+    for btn_idx in range(1, 9):
+        results_section[f"send_to_src_btn_{btn_idx}"].click(
+            fn=res_h.send_audio_to_src_with_metadata,
+            inputs=[
+                results_section[f"generated_audio_{btn_idx}"],
+                results_section["lm_metadata_state"]
+            ],
+            outputs=[
+                generation_section["src_audio"],
+                generation_section["bpm"],
+                generation_section["captions"],
+                generation_section["lyrics"],
+                generation_section["audio_duration"],
+                generation_section["key_scale"],
+                generation_section["vocal_language"],
+                generation_section["time_signature"],
+                results_section["is_format_caption_state"]
+            ]
+        )
+    # ========== Score Calculation Handlers ==========
+    # Use a factory function so each handler binds its own btn_idx when it is created (avoids the late-binding closure pitfall)
+    def make_score_handler(idx):
+        """Return the click callback for the score button of result slot ``idx``.
+
+        ``idx`` is bound on each call of this factory, so every button wired in
+        the loop below gets a callback tied to its own slot number. The callback
+        takes the score scale, the current batch index and the batch queue, and
+        forwards them together with ``dit_handler`` and ``llm_handler`` to
+        ``res_h.calculate_score_handler_with_selection``.
+        """
+        return lambda scale, batch_idx, queue: res_h.calculate_score_handler_with_selection(
+            dit_handler, llm_handler, idx, scale, batch_idx, queue
+        )
+    for btn_idx in range(1, 9):
+        results_section[f"score_btn_{btn_idx}"].click(
+            fn=make_score_handler(btn_idx),
+            inputs=[
+                generation_section["score_scale"],
+                results_section["current_batch_index"],
+                results_section["batch_queue"],
+            ],
+            outputs=[
+                results_section[f"score_display_{btn_idx}"],
+                results_section[f"details_accordion_{btn_idx}"],
+                results_section["batch_queue"]
+            ]
+        )
+    # ========== LRC Timestamp Handlers ==========
+    # Use a factory function so each handler binds its own btn_idx when it is created (avoids the late-binding closure pitfall)
+    def make_lrc_handler(idx):
+        """Return the click callback for the LRC button of result slot ``idx``.
+
+        ``idx`` is bound on each call of this factory, so every button wired in
+        the loop below gets a callback tied to its own slot number. The callback
+        takes the current batch index, the batch queue, the vocal language and
+        the inference step count, and forwards them together with
+        ``dit_handler`` to ``res_h.generate_lrc_handler``.
+        """
+        return lambda batch_idx, queue, vocal_lang, infer_steps: res_h.generate_lrc_handler(
+            dit_handler, idx, batch_idx, queue, vocal_lang, infer_steps
+        )
+    for btn_idx in range(1, 9):
+        results_section[f"lrc_btn_{btn_idx}"].click(
+            fn=make_lrc_handler(btn_idx),
+            inputs=[
+                results_section["current_batch_index"],
+                results_section["batch_queue"],
+                generation_section["vocal_language"],
+                generation_section["inference_steps"],
+            ],
+            outputs=[
+                results_section[f"lrc_display_{btn_idx}"],
+                results_section[f"details_accordion_{btn_idx}"],
+                # NOTE: Removed generated_audio output!
+                # Audio subtitles are now updated via lrc_display.change() event.
+                results_section["batch_queue"]
+            ]
+        )
+    def generation_wrapper(*args):
+        """Generator callback for the Generate button.
+
+        Passes every UI input through, after ``dit_handler`` and ``llm_handler``,
+        to ``res_h.generate_with_batch_management`` and re-yields each value
+        that generator produces.
+        """
+        # A def (not a lambda) is needed here because the body uses ``yield from``.
+        yield from res_h.generate_with_batch_management(dit_handler, llm_handler, *args)
+    # ========== Generation Handler ==========
+    generation_section["generate_btn"].click(
+        fn=generation_wrapper,
+        inputs=[
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["key_scale"],
+            generation_section["time_signature"],
+            generation_section["vocal_language"],
+            generation_section["inference_steps"],
+            generation_section["guidance_scale"],
+            generation_section["random_seed_checkbox"],
+            generation_section["seed"],
+            generation_section["reference_audio"],
+            generation_section["audio_duration"],
+            generation_section["batch_size_input"],
+            generation_section["src_audio"],
+            generation_section["text2music_audio_code_string"],
+            generation_section["repainting_start"],
+            generation_section["repainting_end"],
+            generation_section["instruction_display_gen"],
+            generation_section["audio_cover_strength"],
+            generation_section["task_type"],
+            generation_section["use_adg"],
+            generation_section["cfg_interval_start"],
+            generation_section["cfg_interval_end"],
+            generation_section["shift"],
+            generation_section["infer_method"],
+            generation_section["custom_timesteps"],
+            generation_section["audio_format"],
+            generation_section["lm_temperature"],
+            generation_section["think_checkbox"],
+            generation_section["lm_cfg_scale"],
+            generation_section["lm_top_k"],
+            generation_section["lm_top_p"],
+            generation_section["lm_negative_prompt"],
+            generation_section["use_cot_metas"],
+            generation_section["use_cot_caption"],
+            generation_section["use_cot_language"],
+            results_section["is_format_caption_state"],
+            generation_section["constrained_decoding_debug"],
+            generation_section["allow_lm_batch"],
+            generation_section["auto_score"],
+            generation_section["auto_lrc"],
+            generation_section["score_scale"],
+            generation_section["lm_batch_chunk_size"],
+            generation_section["track_name"],
+            generation_section["complete_track_classes"],
+            generation_section["autogen_checkbox"],
+            results_section["current_batch_index"],
+            results_section["total_batches"],
+            results_section["batch_queue"],
+            results_section["generation_params_state"],
+        ],
+        outputs=[
+            results_section["generated_audio_1"],
+            results_section["generated_audio_2"],
+            results_section["generated_audio_3"],
+            results_section["generated_audio_4"],
+            results_section["generated_audio_5"],
+            results_section["generated_audio_6"],
+            results_section["generated_audio_7"],
+            results_section["generated_audio_8"],
+            results_section["generated_audio_batch"],
+            results_section["generation_info"],
+            results_section["status_output"],
+            generation_section["seed"],
+            results_section["score_display_1"],
+            results_section["score_display_2"],
+            results_section["score_display_3"],
+            results_section["score_display_4"],
+            results_section["score_display_5"],
+            results_section["score_display_6"],
+            results_section["score_display_7"],
+            results_section["score_display_8"],
+            results_section["codes_display_1"],
+            results_section["codes_display_2"],
+            results_section["codes_display_3"],
+            results_section["codes_display_4"],
+            results_section["codes_display_5"],
+            results_section["codes_display_6"],
+            results_section["codes_display_7"],
+            results_section["codes_display_8"],
+            results_section["details_accordion_1"],
+            results_section["details_accordion_2"],
+            results_section["details_accordion_3"],
+            results_section["details_accordion_4"],
+            results_section["details_accordion_5"],
+            results_section["details_accordion_6"],
+            results_section["details_accordion_7"],
+            results_section["details_accordion_8"],
+            results_section["lrc_display_1"],
+            results_section["lrc_display_2"],
+            results_section["lrc_display_3"],
+            results_section["lrc_display_4"],
+            results_section["lrc_display_5"],
+            results_section["lrc_display_6"],
+            results_section["lrc_display_7"],
+            results_section["lrc_display_8"],
+            results_section["lm_metadata_state"],
+            results_section["is_format_caption_state"],
+            results_section["current_batch_index"],
+            results_section["total_batches"],
+            results_section["batch_queue"],
+            results_section["generation_params_state"],
+            results_section["batch_indicator"],
+            results_section["prev_batch_btn"],
+            results_section["next_batch_btn"],
+            results_section["next_batch_status"],
+            results_section["restore_params_btn"],
+        ]
+    ).then(
+        fn=lambda *args: res_h.generate_next_batch_background(dit_handler, llm_handler, *args),
+        inputs=[
+            generation_section["autogen_checkbox"],
+            results_section["generation_params_state"],
+            results_section["current_batch_index"],
+            results_section["total_batches"],
+            results_section["batch_queue"],
+            results_section["is_format_caption_state"],
+        ],
+        outputs=[
+            results_section["batch_queue"],
+            results_section["total_batches"],
+            results_section["next_batch_status"],
+            results_section["next_batch_btn"],
+        ]
+    )
+    # ========== Batch Navigation Handlers ==========
+    results_section["prev_batch_btn"].click(
+        fn=res_h.navigate_to_previous_batch,
+        inputs=[
+            results_section["current_batch_index"],
+            results_section["batch_queue"],
+        ],
+        outputs=[
+            results_section["generated_audio_1"],
+            results_section["generated_audio_2"],
+            results_section["generated_audio_3"],
+            results_section["generated_audio_4"],
+            results_section["generated_audio_5"],
+            results_section["generated_audio_6"],
+            results_section["generated_audio_7"],
+            results_section["generated_audio_8"],
+            results_section["generated_audio_batch"],
+            results_section["generation_info"],
+            results_section["current_batch_index"],
+            results_section["batch_indicator"],
+            results_section["prev_batch_btn"],
+            results_section["next_batch_btn"],
+            results_section["status_output"],
+            results_section["score_display_1"],
+            results_section["score_display_2"],
+            results_section["score_display_3"],
+            results_section["score_display_4"],
+            results_section["score_display_5"],
+            results_section["score_display_6"],
+            results_section["score_display_7"],
+            results_section["score_display_8"],
+            results_section["codes_display_1"],
+            results_section["codes_display_2"],
+            results_section["codes_display_3"],
+            results_section["codes_display_4"],
+            results_section["codes_display_5"],
+            results_section["codes_display_6"],
+            results_section["codes_display_7"],
+            results_section["codes_display_8"],
+            results_section["lrc_display_1"],
+            results_section["lrc_display_2"],
+            results_section["lrc_display_3"],
+            results_section["lrc_display_4"],
+            results_section["lrc_display_5"],
+            results_section["lrc_display_6"],
+            results_section["lrc_display_7"],
+            results_section["lrc_display_8"],
+            results_section["details_accordion_1"],
+            results_section["details_accordion_2"],
+            results_section["details_accordion_3"],
+            results_section["details_accordion_4"],
+            results_section["details_accordion_5"],
+            results_section["details_accordion_6"],
+            results_section["details_accordion_7"],
+            results_section["details_accordion_8"],
+            results_section["restore_params_btn"],
+        ]
+    )
+    results_section["next_batch_btn"].click(
+        fn=res_h.capture_current_params,
+        inputs=[
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["key_scale"],
+            generation_section["time_signature"],
+            generation_section["vocal_language"],
+            generation_section["inference_steps"],
+            generation_section["guidance_scale"],
+            generation_section["random_seed_checkbox"],
+            generation_section["seed"],
+            generation_section["reference_audio"],
+            generation_section["audio_duration"],
+            generation_section["batch_size_input"],
+            generation_section["src_audio"],
+            generation_section["text2music_audio_code_string"],
+            generation_section["repainting_start"],
+            generation_section["repainting_end"],
+            generation_section["instruction_display_gen"],
+            generation_section["audio_cover_strength"],
+            generation_section["task_type"],
+            generation_section["use_adg"],
+            generation_section["cfg_interval_start"],
+            generation_section["cfg_interval_end"],
+            generation_section["shift"],
+            generation_section["infer_method"],
+            generation_section["custom_timesteps"],
+            generation_section["audio_format"],
+            generation_section["lm_temperature"],
+            generation_section["think_checkbox"],
+            generation_section["lm_cfg_scale"],
+            generation_section["lm_top_k"],
+            generation_section["lm_top_p"],
+            generation_section["lm_negative_prompt"],
+            generation_section["use_cot_metas"],
+            generation_section["use_cot_caption"],
+            generation_section["use_cot_language"],
+            generation_section["constrained_decoding_debug"],
+            generation_section["allow_lm_batch"],
+            generation_section["auto_score"],
+            generation_section["auto_lrc"],
+            generation_section["score_scale"],
+            generation_section["lm_batch_chunk_size"],
+            generation_section["track_name"],
+            generation_section["complete_track_classes"],
+        ],
+        outputs=[results_section["generation_params_state"]]
+    ).then(
+        fn=res_h.navigate_to_next_batch,
+        inputs=[
+            generation_section["autogen_checkbox"],
+            results_section["current_batch_index"],
+            results_section["total_batches"],
+            results_section["batch_queue"],
+        ],
+        outputs=[
+            results_section["generated_audio_1"],
+            results_section["generated_audio_2"],
+            results_section["generated_audio_3"],
+            results_section["generated_audio_4"],
+            results_section["generated_audio_5"],
+            results_section["generated_audio_6"],
+            results_section["generated_audio_7"],
+            results_section["generated_audio_8"],
+            results_section["generated_audio_batch"],
+            results_section["generation_info"],
+            results_section["current_batch_index"],
+            results_section["batch_indicator"],
+            results_section["prev_batch_btn"],
+            results_section["next_batch_btn"],
+            results_section["status_output"],
+            results_section["next_batch_status"],
+            results_section["score_display_1"],
+            results_section["score_display_2"],
+            results_section["score_display_3"],
+            results_section["score_display_4"],
+            results_section["score_display_5"],
+            results_section["score_display_6"],
+            results_section["score_display_7"],
+            results_section["score_display_8"],
+            results_section["codes_display_1"],
+            results_section["codes_display_2"],
+            results_section["codes_display_3"],
+            results_section["codes_display_4"],
+            results_section["codes_display_5"],
+            results_section["codes_display_6"],
+            results_section["codes_display_7"],
+            results_section["codes_display_8"],
+            results_section["lrc_display_1"],
+            results_section["lrc_display_2"],
+            results_section["lrc_display_3"],
+            results_section["lrc_display_4"],
+            results_section["lrc_display_5"],
+            results_section["lrc_display_6"],
+            results_section["lrc_display_7"],
+            results_section["lrc_display_8"],
+            results_section["details_accordion_1"],
+            results_section["details_accordion_2"],
+            results_section["details_accordion_3"],
+            results_section["details_accordion_4"],
+            results_section["details_accordion_5"],
+            results_section["details_accordion_6"],
+            results_section["details_accordion_7"],
+            results_section["details_accordion_8"],
+            results_section["restore_params_btn"],
+        ]
+    ).then(
+        fn=lambda *args: res_h.generate_next_batch_background(dit_handler, llm_handler, *args),
+        inputs=[
+            generation_section["autogen_checkbox"],
+            results_section["generation_params_state"],
+            results_section["current_batch_index"],
+            results_section["total_batches"],
+            results_section["batch_queue"],
+            results_section["is_format_caption_state"],
+        ],
+        outputs=[
+            results_section["batch_queue"],
+            results_section["total_batches"],
+            results_section["next_batch_status"],
+            results_section["next_batch_btn"],
+        ]
+    )
+    # ========== Restore Parameters Handler ==========
+    results_section["restore_params_btn"].click(
+        fn=res_h.restore_batch_parameters,
+        inputs=[
+            results_section["current_batch_index"],
+            results_section["batch_queue"]
+        ],
+        outputs=[
+            generation_section["text2music_audio_code_string"],
+            generation_section["captions"],
+            generation_section["lyrics"],
+            generation_section["bpm"],
+            generation_section["key_scale"],
+            generation_section["time_signature"],
+            generation_section["vocal_language"],
+            generation_section["audio_duration"],
+            generation_section["batch_size_input"],
+            generation_section["inference_steps"],
+            generation_section["lm_temperature"],
+            generation_section["lm_cfg_scale"],
+            generation_section["lm_top_k"],
+            generation_section["lm_top_p"],
+            generation_section["think_checkbox"],
+            generation_section["use_cot_caption"],
+            generation_section["use_cot_language"],
+            generation_section["allow_lm_batch"],
+            generation_section["track_name"],
+            generation_section["complete_track_classes"],
+        ]
+    )
+    # ========== LRC Display Change Handlers ==========
+    # NEW APPROACH: Use lrc_display.change() to update audio subtitles
+    # This decouples audio value updates from subtitle updates, avoiding flickering.
+    #
+    # When lrc_display text changes (from generate, LRC button, or manual edit):
+    # 1. lrc_display.change() is triggered
+    # 2. update_audio_subtitles_from_lrc() parses LRC and updates audio subtitles
+    # 3. Audio value is NEVER updated here - only subtitles
+    for lrc_idx in range(1, 9):
+        results_section[f"lrc_display_{lrc_idx}"].change(
+            fn=res_h.update_audio_subtitles_from_lrc,
+            inputs=[
+                results_section[f"lrc_display_{lrc_idx}"],
+                # audio_duration not needed - parse_lrc_to_subtitles calculates end time from timestamps
+            ],
+            outputs=[
+                results_section[f"generated_audio_{lrc_idx}"],  # Only updates subtitles, not value
+            ]
+        )
+def setup_training_event_handlers(demo, dit_handler, llm_handler, training_section):
+    """Setup event handlers for the training tab (dataset builder and LoRA training)"""
+    # Parameters, as used by the wiring below:
+    #   demo             - not referenced anywhere in this function body
+    #   dit_handler      - forwarded to auto-labeling, preprocessing and training
+    #   llm_handler      - forwarded to auto-labeling only
+    #   training_section - mapping from component name to the UI component whose
+    #                      .click()/.change() events are registered here
+    # The function only registers callbacks and returns nothing.
+    # NOTE(review): `train_h` and `logger` are defined outside this view -
+    # presumably the training handlers module and a module logger; confirm.
+    # The order of each `inputs` list is the positional argument order of the
+    # handler, and each `outputs` list must line up with the handler's return values.
+    # ========== Load Existing Dataset (Top Section) ==========
+    # Load existing dataset JSON at the top of Dataset Builder
+    # Same handler and output layout as "load_existing_dataset_btn" further down;
+    # only the path input and the status output component differ.
+    training_section["load_json_btn"].click(
+        fn=train_h.load_existing_dataset_for_preprocess,
+        inputs=[
+            training_section["load_json_path"],
+            training_section["dataset_builder_state"],
+        ],
+        outputs=[
+            training_section["load_json_status"],
+            training_section["audio_files_table"],
+            training_section["sample_selector"],
+            training_section["dataset_builder_state"],
+            # Also update preview fields with first sample
+            training_section["preview_audio"],
+            training_section["preview_filename"],
+            training_section["edit_caption"],
+            training_section["edit_lyrics"],
+            training_section["edit_bpm"],
+            training_section["edit_keyscale"],
+            training_section["edit_timesig"],
+            training_section["edit_duration"],
+            training_section["edit_language"],
+            training_section["edit_instrumental"],
+        ]
+    )
+    # ========== Dataset Builder Handlers ==========
+    # Scan directory for audio files
+    # The lambda forwards its six arguments unchanged to train_h.scan_directory.
+    training_section["scan_btn"].click(
+        fn=lambda dir, name, tag, pos, instr, state: train_h.scan_directory(
+            dir, name, tag, pos, instr, state
+        ),
+        inputs=[
+            training_section["audio_directory"],
+            training_section["dataset_name"],
+            training_section["custom_tag"],
+            training_section["tag_position"],
+            training_section["all_instrumental"],
+            training_section["dataset_builder_state"],
+        ],
+        outputs=[
+            training_section["audio_files_table"],
+            training_section["scan_status"],
+            training_section["sample_selector"],
+            training_section["dataset_builder_state"],
+        ]
+    )
+    # Auto-label all samples
+    # The lambda closes over both handlers so the UI only has to supply state and skip flag.
+    training_section["auto_label_btn"].click(
+        fn=lambda state, skip: train_h.auto_label_all(dit_handler, llm_handler, state, skip),
+        inputs=[
+            training_section["dataset_builder_state"],
+            training_section["skip_metas"],
+        ],
+        outputs=[
+            training_section["audio_files_table"],
+            training_section["label_progress"],
+            training_section["dataset_builder_state"],
+        ]
+    )
+    # Sample selector change - update preview
+    training_section["sample_selector"].change(
+        fn=train_h.get_sample_preview,
+        inputs=[
+            training_section["sample_selector"],
+            training_section["dataset_builder_state"],
+        ],
+        outputs=[
+            training_section["preview_audio"],
+            training_section["preview_filename"],
+            training_section["edit_caption"],
+            training_section["edit_lyrics"],
+            training_section["edit_bpm"],
+            training_section["edit_keyscale"],
+            training_section["edit_timesig"],
+            training_section["edit_duration"],
+            training_section["edit_language"],
+            training_section["edit_instrumental"],
+        ]
+    )
+    # Save sample edit
+    # Note: "edit_duration" is shown in the preview but is not among the saved inputs.
+    training_section["save_edit_btn"].click(
+        fn=train_h.save_sample_edit,
+        inputs=[
+            training_section["sample_selector"],
+            training_section["edit_caption"],
+            training_section["edit_lyrics"],
+            training_section["edit_bpm"],
+            training_section["edit_keyscale"],
+            training_section["edit_timesig"],
+            training_section["edit_language"],
+            training_section["edit_instrumental"],
+            training_section["dataset_builder_state"],
+        ],
+        outputs=[
+            training_section["audio_files_table"],
+            training_section["edit_status"],
+            training_section["dataset_builder_state"],
+        ]
+    )
+    # Update settings when changed
+    # Each of the three components gets the same handler with the same full input set,
+    # so a change to any one of them passes all three current values.
+    for trigger in [training_section["custom_tag"], training_section["tag_position"], training_section["all_instrumental"]]:
+        trigger.change(
+            fn=train_h.update_settings,
+            inputs=[
+                training_section["custom_tag"],
+                training_section["tag_position"],
+                training_section["all_instrumental"],
+                training_section["dataset_builder_state"],
+            ],
+            outputs=[training_section["dataset_builder_state"]]
+        )
+    # Save dataset
+    training_section["save_dataset_btn"].click(
+        fn=train_h.save_dataset,
+        inputs=[
+            training_section["save_path"],
+            training_section["dataset_name"],
+            training_section["dataset_builder_state"],
+        ],
+        outputs=[training_section["save_status"]]
+    )
+    # ========== Preprocess Handlers ==========
+    # Load existing dataset JSON for preprocessing
+    # This also updates the preview section so users can view/edit samples
+    training_section["load_existing_dataset_btn"].click(
+        fn=train_h.load_existing_dataset_for_preprocess,
+        inputs=[
+            training_section["load_existing_dataset_path"],
+            training_section["dataset_builder_state"],
+        ],
+        outputs=[
+            training_section["load_existing_status"],
+            training_section["audio_files_table"],
+            training_section["sample_selector"],
+            training_section["dataset_builder_state"],
+            # Also update preview fields with first sample
+            training_section["preview_audio"],
+            training_section["preview_filename"],
+            training_section["edit_caption"],
+            training_section["edit_lyrics"],
+            training_section["edit_bpm"],
+            training_section["edit_keyscale"],
+            training_section["edit_timesig"],
+            training_section["edit_duration"],
+            training_section["edit_language"],
+            training_section["edit_instrumental"],
+        ]
+    )
+    # Preprocess dataset to tensor files
+    # The lambda reorders arguments: the UI supplies (output_dir, state) and
+    # dit_handler is inserted between them for train_h.preprocess_dataset.
+    training_section["preprocess_btn"].click(
+        fn=lambda output_dir, state: train_h.preprocess_dataset(
+            output_dir, dit_handler, state
+        ),
+        inputs=[
+            training_section["preprocess_output_dir"],
+            training_section["dataset_builder_state"],
+        ],
+        outputs=[training_section["preprocess_progress"]]
+    )
+    # ========== Training Tab Handlers ==========
+    # Load preprocessed tensor dataset
+    training_section["load_dataset_btn"].click(
+        fn=train_h.load_training_dataset,
+        inputs=[training_section["training_tensor_dir"]],
+        outputs=[training_section["training_dataset_info"]]
+    )
+    # Start training from preprocessed tensors
+    # Generator wrapper: re-yields every (progress, log, plot, state) tuple produced by
+    # train_h.start_training, inserting dit_handler as its second argument.
+    # The short parameter names follow the order of the `inputs` list registered below:
+    # r=lora_rank, a=lora_alpha, d=lora_dropout, lr=learning_rate, ep=train_epochs,
+    # bs=train_batch_size, ga=gradient_accumulation, se=save_every_n_epochs,
+    # sh=training_shift, sd=training_seed, od=lora_output_dir, ts=training_state.
+    def training_wrapper(tensor_dir, r, a, d, lr, ep, bs, ga, se, sh, sd, od, ts):
+        try:
+            for progress, log, plot, state in train_h.start_training(
+                tensor_dir, dit_handler, r, a, d, lr, ep, bs, ga, se, sh, sd, od, ts
+            ):
+                yield progress, log, plot, state
+        except Exception as e:
+            # Log the traceback, then emit one final update that shows the error,
+            # clears the plot and hands back the training state that was passed in.
+            logger.exception("Training wrapper error")
+            yield f"❌ Error: {str(e)}", str(e), None, ts
+    training_section["start_training_btn"].click(
+        fn=training_wrapper,
+        inputs=[
+            training_section["training_tensor_dir"],
+            training_section["lora_rank"],
+            training_section["lora_alpha"],
+            training_section["lora_dropout"],
+            training_section["learning_rate"],
+            training_section["train_epochs"],
+            training_section["train_batch_size"],
+            training_section["gradient_accumulation"],
+            training_section["save_every_n_epochs"],
+            training_section["training_shift"],
+            training_section["training_seed"],
+            training_section["lora_output_dir"],
+            training_section["training_state"],
+        ],
+        outputs=[
+            training_section["training_progress"],
+            training_section["training_log"],
+            training_section["training_loss_plot"],
+            training_section["training_state"],
+        ]
+    )
+    # Stop training
+    # Reads and writes the same "training_state" component that the start handler uses.
+    training_section["stop_training_btn"].click(
+        fn=train_h.stop_training,
+        inputs=[training_section["training_state"]],
+        outputs=[
+            training_section["training_progress"],
+            training_section["training_state"],
+        ]
+    )
+    # Export LoRA
+    training_section["export_lora_btn"].click(
+        fn=train_h.export_lora,
+        inputs=[
+            training_section["export_path"],
+            training_section["lora_output_dir"],
+        ],
+        outputs=[training_section["export_status"]]
+    )

code/acestep/gradio_ui/events/generation_handlers.py ADDED Viewed

	@@ -0,0 +1,974 @@

+"""
+Generation Input Handlers Module
+Contains event handlers and helper functions related to generation inputs
+"""
+import os
+import json
+import random
+import glob
+import gradio as gr
+from typing import Optional, List, Tuple
+from acestep.constants import (
+    TASK_TYPES_TURBO,
+    TASK_TYPES_BASE,
+)
+from acestep.gradio_ui.i18n import t
+from acestep.inference import understand_music, create_sample, format_sample
+def parse_and_validate_timesteps(
+    timesteps_str: str,
+    inference_steps: int
+) -> Tuple[Optional[List[float]], bool, str]:
+    """
+    Parse timesteps string and validate.
+    Args:
+        timesteps_str: Comma-separated timesteps string (e.g., "0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0")
+        inference_steps: Expected number of inference steps
+    Returns:
+        Tuple of (parsed_timesteps, has_warning, warning_message)
+        - parsed_timesteps: List of float timesteps, or None if invalid/empty
+        - has_warning: Whether a warning was shown
+        - warning_message: Description of the warning
+    """
+    if not timesteps_str or not timesteps_str.strip():
+        return None, False, ""
+    # Parse comma-separated values
+    values = [v.strip() for v in timesteps_str.split(",") if v.strip()]
+    if not values:
+        return None, False, ""
+    # Handle optional trailing 0
+    if values[-1] != "0":
+        values.append("0")
+    try:
+        timesteps = [float(v) for v in values]
+    except ValueError:
+        gr.Warning(t("messages.invalid_timesteps_format"))
+        return None, True, "Invalid format"
+    # Validate range [0, 1]
+    if any(ts < 0 or ts > 1 for ts in timesteps):
+        gr.Warning(t("messages.timesteps_out_of_range"))
+        return None, True, "Out of range"
+    # Check if count matches inference_steps
+    actual_steps = len(timesteps) - 1
+    if actual_steps != inference_steps:
+        gr.Warning(t("messages.timesteps_count_mismatch", actual=actual_steps, expected=inference_steps))
+        return timesteps, True, f"Using {actual_steps} steps from timesteps"
+    return timesteps, False, ""
+def load_metadata(file_obj):
+    """Load generation parameters from a JSON file"""
+    if file_obj is None:
+        gr.Warning(t("messages.no_file_selected"))
+        return [None] * 36 + [False]  # Return None for all fields, False for is_format_caption
+    try:
+        # Read the uploaded file
+        if hasattr(file_obj, 'name'):
+            filepath = file_obj.name
+        else:
+            filepath = file_obj
+        with open(filepath, 'r', encoding='utf-8') as f:
+            metadata = json.load(f)
+        # Extract all fields
+        task_type = metadata.get('task_type', 'text2music')
+        captions = metadata.get('caption', '')
+        lyrics = metadata.get('lyrics', '')
+        vocal_language = metadata.get('vocal_language', 'unknown')
+        # Convert bpm
+        bpm_value = metadata.get('bpm')
+        if bpm_value is not None and bpm_value != "N/A":
+            try:
+                bpm = int(bpm_value) if bpm_value else None
+            except:
+                bpm = None
+        else:
+            bpm = None
+        key_scale = metadata.get('keyscale', '')
+        time_signature = metadata.get('timesignature', '')
+        # Convert duration
+        duration_value = metadata.get('duration', -1)
+        if duration_value is not None and duration_value != "N/A":
+            try:
+                audio_duration = float(duration_value)
+            except:
+                audio_duration = -1
+        else:
+            audio_duration = -1
+        batch_size = metadata.get('batch_size', 2)
+        inference_steps = metadata.get('inference_steps', 8)
+        guidance_scale = metadata.get('guidance_scale', 7.0)
+        seed = metadata.get('seed', '-1')
+        random_seed = False  # Always set to False when loading to enable reproducibility with saved seed
+        use_adg = metadata.get('use_adg', False)
+        cfg_interval_start = metadata.get('cfg_interval_start', 0.0)
+        cfg_interval_end = metadata.get('cfg_interval_end', 1.0)
+        audio_format = metadata.get('audio_format', 'mp3')
+        lm_temperature = metadata.get('lm_temperature', 0.85)
+        lm_cfg_scale = metadata.get('lm_cfg_scale', 2.0)
+        lm_top_k = metadata.get('lm_top_k', 0)
+        lm_top_p = metadata.get('lm_top_p', 0.9)
+        lm_negative_prompt = metadata.get('lm_negative_prompt', 'NO USER INPUT')
+        use_cot_metas = metadata.get('use_cot_metas', True)  # Added: read use_cot_metas
+        use_cot_caption = metadata.get('use_cot_caption', True)
+        use_cot_language = metadata.get('use_cot_language', True)
+        audio_cover_strength = metadata.get('audio_cover_strength', 1.0)
+        think = metadata.get('thinking', True)  # Fixed: read 'thinking' not 'think'
+        audio_codes = metadata.get('audio_codes', '')
+        repainting_start = metadata.get('repainting_start', 0.0)
+        repainting_end = metadata.get('repainting_end', -1)
+        track_name = metadata.get('track_name')
+        complete_track_classes = metadata.get('complete_track_classes', [])
+        shift = metadata.get('shift', 3.0)  # Default 3.0 for base models
+        infer_method = metadata.get('infer_method', 'ode')  # Default 'ode' for diffusion inference
+        custom_timesteps = metadata.get('timesteps', '')  # Custom timesteps (stored as 'timesteps' in JSON)
+        if custom_timesteps is None:
+            custom_timesteps = ''
+        instrumental = metadata.get('instrumental', False)  # Added: read instrumental
+        gr.Info(t("messages.params_loaded", filename=os.path.basename(filepath)))
+        return (
+            task_type, captions, lyrics, vocal_language, bpm, key_scale, time_signature,
+            audio_duration, batch_size, inference_steps, guidance_scale, seed, random_seed,
+            use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method,
+            custom_timesteps,  # Added: custom_timesteps (between infer_method and audio_format)
+            audio_format, lm_temperature, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
+            use_cot_metas, use_cot_caption, use_cot_language, audio_cover_strength,
+            think, audio_codes, repainting_start, repainting_end,
+            track_name, complete_track_classes, instrumental,
+            True  # Set is_format_caption to True when loading from file
+        )
+    except json.JSONDecodeError as e:
+        gr.Warning(t("messages.invalid_json", error=str(e)))
+        return [None] * 36 + [False]
+    except Exception as e:
+        gr.Warning(t("messages.load_error", error=str(e)))
+        return [None] * 36 + [False]
+def load_random_example(task_type: str):
+    """Load a random example from the task-specific examples directory
+    Args:
+        task_type: The task type (e.g., "text2music")
+    Returns:
+        Tuple of (caption, lyrics, think, bpm, duration, keyscale, language, timesignature) for updating UI components
+    """
+    try:
+        # Get the project root directory
+        current_file = os.path.abspath(__file__)
+        # This file is in acestep/gradio_ui/events/, need 4 levels up to reach project root
+        project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file))))
+        # Construct the examples directory path
+        examples_dir = os.path.join(project_root, "examples", task_type)
+        # Check if directory exists
+        if not os.path.exists(examples_dir):
+            gr.Warning(f"Examples directory not found: examples/{task_type}/")
+            return "", "", True, None, None, "", "", ""
+        # Find all JSON files in the directory
+        json_files = glob.glob(os.path.join(examples_dir, "*.json"))
+        if not json_files:
+            gr.Warning(f"No JSON files found in examples/{task_type}/")
+            return "", "", True, None, None, "", "", ""
+        # Randomly select one file
+        selected_file = random.choice(json_files)
+        # Read and parse JSON
+        try:
+            with open(selected_file, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            # Extract caption (prefer 'caption', fallback to 'prompt')
+            caption_value = data.get('caption', data.get('prompt', ''))
+            if not isinstance(caption_value, str):
+                caption_value = str(caption_value) if caption_value else ''
+            # Extract lyrics
+            lyrics_value = data.get('lyrics', '')
+            if not isinstance(lyrics_value, str):
+                lyrics_value = str(lyrics_value) if lyrics_value else ''
+            # Extract think (default to True if not present)
+            think_value = data.get('think', True)
+            if not isinstance(think_value, bool):
+                think_value = True
+            # Extract optional metadata fields
+            bpm_value = None
+            if 'bpm' in data and data['bpm'] not in [None, "N/A", ""]:
+                try:
+                    bpm_value = int(data['bpm'])
+                except (ValueError, TypeError):
+                    pass
+            duration_value = None
+            if 'duration' in data and data['duration'] not in [None, "N/A", ""]:
+                try:
+                    duration_value = float(data['duration'])
+                except (ValueError, TypeError):
+                    pass
+            keyscale_value = data.get('keyscale', '')
+            if keyscale_value in [None, "N/A"]:
+                keyscale_value = ''
+            language_value = data.get('language', '')
+            if language_value in [None, "N/A"]:
+                language_value = ''
+            timesignature_value = data.get('timesignature', '')
+            if timesignature_value in [None, "N/A"]:
+                timesignature_value = ''
+            gr.Info(t("messages.example_loaded", filename=os.path.basename(selected_file)))
+            return caption_value, lyrics_value, think_value, bpm_value, duration_value, keyscale_value, language_value, timesignature_value
+        except json.JSONDecodeError as e:
+            gr.Warning(t("messages.example_failed", filename=os.path.basename(selected_file), error=str(e)))
+            return "", "", True, None, None, "", "", ""
+        except Exception as e:
+            gr.Warning(t("messages.example_error", error=str(e)))
+            return "", "", True, None, None, "", "", ""
+    except Exception as e:
+        gr.Warning(t("messages.example_error", error=str(e)))
+        return "", "", True, None, None, "", "", ""
+def sample_example_smart(llm_handler, task_type: str, constrained_decoding_debug: bool = False):
+    """Smart sample function that uses LM if initialized, otherwise falls back to examples
+    This is a Gradio wrapper that uses the understand_music API from acestep.inference
+    to generate examples when LM is available.
+    Args:
+        llm_handler: LLM handler instance
+        task_type: The task type (e.g., "text2music")
+        constrained_decoding_debug: Whether to enable debug logging for constrained decoding
+    Returns:
+        Tuple of (caption, lyrics, think, bpm, duration, keyscale, language, timesignature) for updating UI components
+    """
+    # Check if LM is initialized
+    if llm_handler.llm_initialized:
+        # Use LM to generate example via understand_music API
+        try:
+            result = understand_music(
+                llm_handler=llm_handler,
+                audio_codes="NO USER INPUT",  # Empty input triggers example generation
+                temperature=0.85,
+                use_constrained_decoding=True,
+                constrained_decoding_debug=constrained_decoding_debug,
+            )
+            if result.success:
+                gr.Info(t("messages.lm_generated"))
+                return (
+                    result.caption,
+                    result.lyrics,
+                    True,  # Always enable think when using LM-generated examples
+                    result.bpm,
+                    result.duration,
+                    result.keyscale,
+                    result.language,
+                    result.timesignature,
+                )
+            else:
+                gr.Warning(t("messages.lm_fallback"))
+                return load_random_example(task_type)
+        except Exception as e:
+            gr.Warning(t("messages.lm_fallback"))
+            return load_random_example(task_type)
+    else:
+        # LM not initialized, use examples directory
+        return load_random_example(task_type)
+def load_random_simple_description():
+    """Load a random description from the simple_mode examples directory.
+    Returns:
+        Tuple of (description, instrumental, vocal_language) for updating UI components
+    """
+    try:
+        # Get the project root directory
+        current_file = os.path.abspath(__file__)
+        # This file is in acestep/gradio_ui/events/, need 4 levels up to reach project root
+        project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file))))
+        # Construct the examples directory path
+        examples_dir = os.path.join(project_root, "examples", "simple_mode")
+        # Check if directory exists
+        if not os.path.exists(examples_dir):
+            gr.Warning(t("messages.simple_examples_not_found"))
+            return gr.update(), gr.update(), gr.update()
+        # Find all JSON files in the directory
+        json_files = glob.glob(os.path.join(examples_dir, "*.json"))
+        if not json_files:
+            gr.Warning(t("messages.simple_examples_empty"))
+            return gr.update(), gr.update(), gr.update()
+        # Randomly select one file
+        selected_file = random.choice(json_files)
+        # Read and parse JSON
+        try:
+            with open(selected_file, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            # Extract fields
+            description = data.get('description', '')
+            instrumental = data.get('instrumental', False)
+            vocal_language = data.get('vocal_language', 'unknown')
+            # Ensure vocal_language is a string
+            if isinstance(vocal_language, list):
+                vocal_language = vocal_language[0] if vocal_language else 'unknown'
+            gr.Info(t("messages.simple_example_loaded", filename=os.path.basename(selected_file)))
+            return description, instrumental, vocal_language
+        except json.JSONDecodeError as e:
+            gr.Warning(t("messages.example_failed", filename=os.path.basename(selected_file), error=str(e)))
+            return gr.update(), gr.update(), gr.update()
+        except Exception as e:
+            gr.Warning(t("messages.example_error", error=str(e)))
+            return gr.update(), gr.update(), gr.update()
+    except Exception as e:
+        gr.Warning(t("messages.example_error", error=str(e)))
+        return gr.update(), gr.update(), gr.update()
+def refresh_checkpoints(dit_handler):
+    """Refresh available checkpoints"""
+    choices = dit_handler.get_available_checkpoints()
+    return gr.update(choices=choices)
+def update_model_type_settings(config_path):
+    """Update UI settings based on model type (fallback when handler not initialized yet)
+    Note: This is used as a fallback when the user changes config_path dropdown
+    before initializing the model. The actual settings are determined by the
+    handler's is_turbo_model() method after initialization.
+    """
+    if config_path is None:
+        config_path = ""
+    config_path_lower = config_path.lower()
+    # Determine is_turbo based on config_path string
+    # This is a heuristic fallback - actual model type is determined after loading
+    if "turbo" in config_path_lower:
+        is_turbo = True
+    elif "base" in config_path_lower:
+        is_turbo = False
+    else:
+        # Default to turbo settings for unknown model types
+        is_turbo = True
+    return get_model_type_ui_settings(is_turbo)
+def init_service_wrapper(dit_handler, llm_handler, checkpoint, config_path, device, init_llm, lm_model_path, backend, use_flash_attention, offload_to_cpu, offload_dit_to_cpu):
+    """Wrapper for service initialization, returns status, button state, accordion state, and model type settings"""
+    # Initialize DiT handler
+    status, enable = dit_handler.initialize_service(
+        checkpoint, config_path, device,
+        use_flash_attention=use_flash_attention, compile_model=False,
+        offload_to_cpu=offload_to_cpu, offload_dit_to_cpu=offload_dit_to_cpu
+    )
+    # Initialize LM handler if requested
+    if init_llm:
+        # Get checkpoint directory
+        current_file = os.path.abspath(__file__)
+        # This file is in acestep/gradio_ui/events/, need 4 levels up to reach project root
+        project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file))))
+        checkpoint_dir = os.path.join(project_root, "checkpoints")
+        lm_status, lm_success = llm_handler.initialize(
+            checkpoint_dir=checkpoint_dir,
+            lm_model_path=lm_model_path,
+            backend=backend,
+            device=device,
+            offload_to_cpu=offload_to_cpu,
+            dtype=dit_handler.dtype
+        )
+        if lm_success:
+            status += f"\n{lm_status}"
+        else:
+            status += f"\n{lm_status}"
+            # Don't fail the entire initialization if LM fails, but log it
+            # Keep enable as is (DiT initialization result) even if LM fails
+    # Check if model is initialized - if so, collapse the accordion
+    is_model_initialized = dit_handler.model is not None
+    accordion_state = gr.Accordion(open=not is_model_initialized)
+    # Get model type settings based on actual loaded model
+    is_turbo = dit_handler.is_turbo_model()
+    model_type_settings = get_model_type_ui_settings(is_turbo)
+    return (
+        status,
+        gr.update(interactive=enable),
+        accordion_state,
+        *model_type_settings
+    )
+def get_model_type_ui_settings(is_turbo: bool):
+    """Get UI settings based on whether the model is turbo or base"""
+    if is_turbo:
+        # Turbo model: max 20 steps, default 8, show shift with default 3.0, only show text2music/repaint/cover
+        return (
+            gr.update(value=8, maximum=20, minimum=1),  # inference_steps
+            gr.update(visible=False),  # guidance_scale
+            gr.update(visible=False),  # use_adg
+            gr.update(value=3.0, visible=True),  # shift (show with default 3.0)
+            gr.update(visible=False),  # cfg_interval_start
+            gr.update(visible=False),  # cfg_interval_end
+            gr.update(choices=TASK_TYPES_TURBO),  # task_type
+        )
+    else:
+        # Base model: max 200 steps, default 32, show CFG/ADG/shift, show all task types
+        return (
+            gr.update(value=32, maximum=200, minimum=1),  # inference_steps
+            gr.update(visible=True),  # guidance_scale
+            gr.update(visible=True),  # use_adg
+            gr.update(value=3.0, visible=True),  # shift (effective for base, default 3.0)
+            gr.update(visible=True),  # cfg_interval_start
+            gr.update(visible=True),  # cfg_interval_end
+            gr.update(choices=TASK_TYPES_BASE),  # task_type
+        )
+def update_negative_prompt_visibility(init_llm_checked):
+    """Update negative prompt visibility: show if Initialize 5Hz LM checkbox is checked"""
+    return gr.update(visible=init_llm_checked)
+def update_audio_cover_strength_visibility(task_type_value, init_llm_checked):
+    """Update audio_cover_strength visibility and label"""
+    # Show if task is cover OR if LM is initialized
+    is_visible = (task_type_value == "cover") or init_llm_checked
+    # Change label based on context
+    if init_llm_checked and task_type_value != "cover":
+        label = "LM codes strength"
+        info = "Control how many denoising steps use LM-generated codes"
+    else:
+        label = "Audio Cover Strength"
+        info = "Control how many denoising steps use cover mode"
+    return gr.update(visible=is_visible, label=label, info=info)
+def convert_src_audio_to_codes_wrapper(dit_handler, src_audio):
+    """Wrapper for converting src audio to codes"""
+    codes_string = dit_handler.convert_src_audio_to_codes(src_audio)
+    return codes_string
+def update_instruction_ui(
+    dit_handler,
+    task_type_value: str,
+    track_name_value: Optional[str],
+    complete_track_classes_value: list,
+    audio_codes_content: str = "",
+    init_llm_checked: bool = False
+) -> tuple:
+    """Update instruction and UI visibility based on task type."""
+    instruction = dit_handler.generate_instruction(
+        task_type=task_type_value,
+        track_name=track_name_value,
+        complete_track_classes=complete_track_classes_value
+    )
+    # Show track_name for lego and extract
+    track_name_visible = task_type_value in ["lego", "extract"]
+    # Show complete_track_classes for complete
+    complete_visible = task_type_value == "complete"
+    # Show audio_cover_strength for cover OR when LM is initialized
+    audio_cover_strength_visible = (task_type_value == "cover") or init_llm_checked
+    # Determine label and info based on context
+    if init_llm_checked and task_type_value != "cover":
+        audio_cover_strength_label = "LM codes strength"
+        audio_cover_strength_info = "Control how many denoising steps use LM-generated codes"
+    else:
+        audio_cover_strength_label = "Audio Cover Strength"
+        audio_cover_strength_info = "Control how many denoising steps use cover mode"
+    # Show repainting controls for repaint and lego
+    repainting_visible = task_type_value in ["repaint", "lego"]
+    # Show text2music_audio_codes if task is text2music OR if it has content
+    # This allows it to stay visible even if user switches task type but has codes
+    has_audio_codes = audio_codes_content and str(audio_codes_content).strip()
+    text2music_audio_codes_visible = task_type_value == "text2music" or has_audio_codes
+    return (
+        instruction,  # instruction_display_gen
+        gr.update(visible=track_name_visible),  # track_name
+        gr.update(visible=complete_visible),  # complete_track_classes
+        gr.update(visible=audio_cover_strength_visible, label=audio_cover_strength_label, info=audio_cover_strength_info),  # audio_cover_strength
+        gr.update(visible=repainting_visible),  # repainting_group
+        gr.update(visible=text2music_audio_codes_visible),  # text2music_audio_codes_group
+    )
+def transcribe_audio_codes(llm_handler, audio_code_string, constrained_decoding_debug):
+    """
+    Transcribe audio codes to metadata using LLM understanding.
+    If audio_code_string is empty, generate a sample example instead.
+    This is a Gradio wrapper around the understand_music API in acestep.inference.
+    Args:
+        llm_handler: LLM handler instance
+        audio_code_string: String containing audio codes (or empty for example generation)
+        constrained_decoding_debug: Whether to enable debug logging for constrained decoding
+    Returns:
+        Tuple of (status_message, caption, lyrics, bpm, duration, keyscale, language, timesignature, is_format_caption)
+    """
+    # Call the inference API
+    result = understand_music(
+        llm_handler=llm_handler,
+        audio_codes=audio_code_string,
+        use_constrained_decoding=True,
+        constrained_decoding_debug=constrained_decoding_debug,
+    )
+    # Handle error case with localized message
+    if not result.success:
+        # Use localized error message for LLM not initialized
+        if result.error == "LLM not initialized":
+            return t("messages.lm_not_initialized"), "", "", None, None, "", "", "", False
+        return result.status_message, "", "", None, None, "", "", "", False
+    return (
+        result.status_message,
+        result.caption,
+        result.lyrics,
+        result.bpm,
+        result.duration,
+        result.keyscale,
+        result.language,
+        result.timesignature,
+        True  # Set is_format_caption to True (from Transcribe/LM understanding)
+    )
+def update_transcribe_button_text(audio_code_string):
+    """
+    Update the transcribe button text based on input content.
+    If empty: "Generate Example"
+    If has content: "Transcribe"
+    """
+    if not audio_code_string or not audio_code_string.strip():
+        return gr.update(value="Generate Example")
+    else:
+        return gr.update(value="Transcribe")
+def reset_format_caption_flag():
+    """Reset is_format_caption to False when user manually edits caption/metadata"""
+    return False
+def update_audio_uploads_accordion(reference_audio, src_audio):
+    """Update Audio Uploads accordion open state based on whether audio files are present"""
+    has_audio = (reference_audio is not None) or (src_audio is not None)
+    return gr.Accordion(open=has_audio)
+def handle_instrumental_checkbox(instrumental_checked, current_lyrics):
+    """
+    Handle instrumental checkbox changes.
+    When checked: if no lyrics, fill with [Instrumental]
+    When unchecked: if lyrics is [Instrumental], clear it
+    """
+    if instrumental_checked:
+        # If checked and no lyrics, fill with [Instrumental]
+        if not current_lyrics or not current_lyrics.strip():
+            return "[Instrumental]"
+        else:
+            # Has lyrics, don't change
+            return current_lyrics
+    else:
+        # If unchecked and lyrics is exactly [Instrumental], clear it
+        if current_lyrics and current_lyrics.strip() == "[Instrumental]":
+            return ""
+        else:
+            # Has other lyrics, don't change
+            return current_lyrics
+def handle_simple_instrumental_change(is_instrumental: bool):
+    """
+    Handle simple mode instrumental checkbox changes.
+    When checked: set vocal_language to "unknown" and disable editing.
+    When unchecked: enable vocal_language editing.
+    Args:
+        is_instrumental: Whether instrumental checkbox is checked
+    Returns:
+        gr.update for simple_vocal_language dropdown
+    """
+    if is_instrumental:
+        return gr.update(value="unknown", interactive=False)
+    else:
+        return gr.update(interactive=True)
+def update_audio_components_visibility(batch_size):
+    """Show/hide individual audio components based on batch size (1-8)
+    Row 1: Components 1-4 (batch_size 1-4)
+    Row 2: Components 5-8 (batch_size 5-8)
+    """
+    # Clamp batch size to 1-8 range for UI
+    batch_size = min(max(int(batch_size), 1), 8)
+    # Row 1 columns (1-4)
+    updates_row1 = (
+        gr.update(visible=True),  # audio_col_1: always visible
+        gr.update(visible=batch_size >= 2),  # audio_col_2
+        gr.update(visible=batch_size >= 3),  # audio_col_3
+        gr.update(visible=batch_size >= 4),  # audio_col_4
+    )
+    # Row 2 container and columns (5-8)
+    show_row_5_8 = batch_size >= 5
+    updates_row2 = (
+        gr.update(visible=show_row_5_8),  # audio_row_5_8 (container)
+        gr.update(visible=batch_size >= 5),  # audio_col_5
+        gr.update(visible=batch_size >= 6),  # audio_col_6
+        gr.update(visible=batch_size >= 7),  # audio_col_7
+        gr.update(visible=batch_size >= 8),  # audio_col_8
+    )
+    return updates_row1 + updates_row2
+def handle_generation_mode_change(mode: str):
+    """
+    Handle generation mode change between Simple and Custom modes.
+    In Simple mode:
+    - Show simple mode group (query input, instrumental checkbox, create button)
+    - Collapse caption and lyrics accordions
+    - Hide optional parameters accordion
+    - Disable generate button until sample is created
+    In Custom mode:
+    - Hide simple mode group
+    - Expand caption and lyrics accordions
+    - Show optional parameters accordion
+    - Enable generate button
+    Args:
+        mode: "simple" or "custom"
+    Returns:
+        Tuple of updates for:
+        - simple_mode_group (visibility)
+        - caption_accordion (open state)
+        - lyrics_accordion (open state)
+        - generate_btn (interactive state)
+        - simple_sample_created (reset state)
+        - optional_params_accordion (visibility)
+    """
+    is_simple = mode == "simple"
+    return (
+        gr.update(visible=is_simple),  # simple_mode_group
+        gr.Accordion(open=not is_simple),  # caption_accordion - collapsed in simple, open in custom
+        gr.Accordion(open=not is_simple),  # lyrics_accordion - collapsed in simple, open in custom
+        gr.update(interactive=not is_simple),  # generate_btn - disabled in simple until sample created
+        False,  # simple_sample_created - reset to False on mode change
+        gr.Accordion(open=not is_simple),  # optional_params_accordion - hidden in simple mode
+    )
+def handle_create_sample(
+    llm_handler,
+    query: str,
+    instrumental: bool,
+    vocal_language: str,
+    lm_temperature: float,
+    lm_top_k: int,
+    lm_top_p: float,
+    constrained_decoding_debug: bool = False,
+):
+    """
+    Handle the Create Sample button click in Simple mode.
+    Creates a sample from the user's query using the LLM, then populates
+    the caption, lyrics, and metadata fields.
+    Note: cfg_scale and negative_prompt are not supported in create_sample mode.
+    Args:
+        llm_handler: LLM handler instance
+        query: User's natural language music description
+        instrumental: Whether to generate instrumental music
+        vocal_language: Preferred vocal language for constrained decoding
+        lm_temperature: LLM temperature for generation
+        lm_top_k: LLM top-k sampling
+        lm_top_p: LLM top-p sampling
+        constrained_decoding_debug: Whether to enable debug logging
+    Returns:
+        Tuple of updates for:
+        - captions
+        - lyrics
+        - bpm
+        - audio_duration
+        - key_scale
+        - vocal_language
+        - time_signature
+        - instrumental_checkbox
+        - caption_accordion (open)
+        - lyrics_accordion (open)
+        - generate_btn (interactive)
+        - simple_sample_created (True)
+        - think_checkbox (True)
+        - is_format_caption_state (True)
+        - status_output
+    """
+    # Check if LLM is initialized
+    if not llm_handler.llm_initialized:
+        gr.Warning(t("messages.lm_not_initialized"))
+        return (
+            gr.update(),  # captions - no change
+            gr.update(),  # lyrics - no change
+            gr.update(),  # bpm - no change
+            gr.update(),  # audio_duration - no change
+            gr.update(),  # key_scale - no change
+            gr.update(),  # vocal_language - no change
+            gr.update(),  # time_signature - no change
+            gr.update(),  # instrumental_checkbox - no change
+            gr.update(),  # caption_accordion - no change
+            gr.update(),  # lyrics_accordion - no change
+            gr.update(interactive=False),  # generate_btn - keep disabled
+            False,  # simple_sample_created - still False
+            gr.update(),  # think_checkbox - no change
+            gr.update(),  # is_format_caption_state - no change
+            t("messages.lm_not_initialized"),  # status_output
+        )
+    # Convert LM parameters
+    top_k_value = None if not lm_top_k or lm_top_k == 0 else int(lm_top_k)
+    top_p_value = None if not lm_top_p or lm_top_p >= 1.0 else lm_top_p
+    # Call create_sample API
+    # Note: cfg_scale and negative_prompt are not supported in create_sample mode
+    result = create_sample(
+        llm_handler=llm_handler,
+        query=query,
+        instrumental=instrumental,
+        vocal_language=vocal_language,
+        temperature=lm_temperature,
+        top_k=top_k_value,
+        top_p=top_p_value,
+        use_constrained_decoding=True,
+        constrained_decoding_debug=constrained_decoding_debug,
+    )
+    # Handle error
+    if not result.success:
+        gr.Warning(result.status_message or t("messages.sample_creation_failed"))
+        return (
+            gr.update(),  # captions - no change
+            gr.update(),  # lyrics - no change
+            gr.update(),  # bpm - no change
+            gr.update(),  # audio_duration - no change
+            gr.update(),  # key_scale - no change
+            gr.update(),  # vocal_language - no change
+            gr.update(),  # simple vocal_language - no change
+            gr.update(),  # time_signature - no change
+            gr.update(),  # instrumental_checkbox - no change
+            gr.update(),  # caption_accordion - no change
+            gr.update(),  # lyrics_accordion - no change
+            gr.update(interactive=False),  # generate_btn - keep disabled
+            False,  # simple_sample_created - still False
+            gr.update(),  # think_checkbox - no change
+            gr.update(),  # is_format_caption_state - no change
+            result.status_message or t("messages.sample_creation_failed"),  # status_output
+        )
+    # Success - populate fields
+    gr.Info(t("messages.sample_created"))
+    return (
+        result.caption,  # captions
+        result.lyrics,  # lyrics
+        result.bpm,  # bpm
+        result.duration if result.duration and result.duration > 0 else -1,  # audio_duration
+        result.keyscale,  # key_scale
+        result.language,  # vocal_language
+        result.language,  # simple vocal_language
+        result.timesignature,  # time_signature
+        result.instrumental,  # instrumental_checkbox
+        gr.Accordion(open=True),  # caption_accordion - expand
+        gr.Accordion(open=True),  # lyrics_accordion - expand
+        gr.update(interactive=True),  # generate_btn - enable
+        True,  # simple_sample_created - True
+        True,  # think_checkbox - enable thinking
+        True,  # is_format_caption_state - True (LM-generated)
+        result.status_message,  # status_output
+    )
+def handle_format_sample(
+    llm_handler,
+    caption: str,
+    lyrics: str,
+    bpm,
+    audio_duration,
+    key_scale: str,
+    time_signature: str,
+    lm_temperature: float,
+    lm_top_k: int,
+    lm_top_p: float,
+    constrained_decoding_debug: bool = False,
+):
+    """
+    Handle the Format button click to format caption and lyrics.
+    Takes user-provided caption and lyrics, and uses the LLM to generate
+    structured music metadata and an enhanced description.
+    Note: cfg_scale and negative_prompt are not supported in format mode.
+    Args:
+        llm_handler: LLM handler instance
+        caption: User's caption/description
+        lyrics: User's lyrics
+        bpm: User-provided BPM (optional, for constrained decoding)
+        audio_duration: User-provided duration (optional, for constrained decoding)
+        key_scale: User-provided key scale (optional, for constrained decoding)
+        time_signature: User-provided time signature (optional, for constrained decoding)
+        lm_temperature: LLM temperature for generation
+        lm_top_k: LLM top-k sampling
+        lm_top_p: LLM top-p sampling
+        constrained_decoding_debug: Whether to enable debug logging
+    Returns:
+        Tuple of updates for:
+        - captions
+        - lyrics
+        - bpm
+        - audio_duration
+        - key_scale
+        - vocal_language
+        - time_signature
+        - is_format_caption_state
+        - status_output
+    """
+    # Check if LLM is initialized
+    if not llm_handler.llm_initialized:
+        gr.Warning(t("messages.lm_not_initialized"))
+        return (
+            gr.update(),  # captions - no change
+            gr.update(),  # lyrics - no change
+            gr.update(),  # bpm - no change
+            gr.update(),  # audio_duration - no change
+            gr.update(),  # key_scale - no change
+            gr.update(),  # vocal_language - no change
+            gr.update(),  # time_signature - no change
+            gr.update(),  # is_format_caption_state - no change
+            t("messages.lm_not_initialized"),  # status_output
+        )
+    # Build user_metadata from provided values for constrained decoding
+    user_metadata = {}
+    if bpm is not None and bpm > 0:
+        user_metadata['bpm'] = int(bpm)
+    if audio_duration is not None and float(audio_duration) > 0:
+        user_metadata['duration'] = int(audio_duration)
+    if key_scale and key_scale.strip():
+        user_metadata['keyscale'] = key_scale.strip()
+    if time_signature and time_signature.strip():
+        user_metadata['timesignature'] = time_signature.strip()
+    # Only pass user_metadata if we have at least one field
+    user_metadata_to_pass = user_metadata if user_metadata else None
+    # Convert LM parameters
+    top_k_value = None if not lm_top_k or lm_top_k == 0 else int(lm_top_k)
+    top_p_value = None if not lm_top_p or lm_top_p >= 1.0 else lm_top_p
+    # Call format_sample API
+    result = format_sample(
+        llm_handler=llm_handler,
+        caption=caption,
+        lyrics=lyrics,
+        user_metadata=user_metadata_to_pass,
+        temperature=lm_temperature,
+        top_k=top_k_value,
+        top_p=top_p_value,
+        use_constrained_decoding=True,
+        constrained_decoding_debug=constrained_decoding_debug,
+    )
+    # Handle error
+    if not result.success:
+        gr.Warning(result.status_message or t("messages.format_failed"))
+        return (
+            gr.update(),  # captions - no change
+            gr.update(),  # lyrics - no change
+            gr.update(),  # bpm - no change
+            gr.update(),  # audio_duration - no change
+            gr.update(),  # key_scale - no change
+            gr.update(),  # vocal_language - no change
+            gr.update(),  # time_signature - no change
+            gr.update(),  # is_format_caption_state - no change
+            result.status_message or t("messages.format_failed"),  # status_output
+        )
+    # Success - populate fields
+    gr.Info(t("messages.format_success"))
+    return (
+        result.caption,  # captions
+        result.lyrics,  # lyrics
+        result.bpm,  # bpm
+        result.duration if result.duration and result.duration > 0 else -1,  # audio_duration
+        result.keyscale,  # key_scale
+        result.language,  # vocal_language
+        result.timesignature,  # time_signature
+        True,  # is_format_caption_state - True (LM-formatted)
+        result.status_message,  # status_output
+    )

code/acestep/gradio_ui/events/results_handlers.py ADDED Viewed

The diff for this file is too large to render. See raw diff

code/acestep/gradio_ui/events/training_handlers.py ADDED Viewed

	@@ -0,0 +1,644 @@

+"""
+Event Handlers for Training Tab
+Contains all event handler functions for the dataset builder and training UI.
+"""
+import os
+import json
+from typing import Any, Dict, List, Tuple, Optional
+from loguru import logger
+import gradio as gr
+from acestep.training.dataset_builder import DatasetBuilder, AudioSample
+def create_dataset_builder() -> DatasetBuilder:
+    """Create a new DatasetBuilder instance."""
+    return DatasetBuilder()
+def scan_directory(
+    audio_dir: str,
+    dataset_name: str,
+    custom_tag: str,
+    tag_position: str,
+    all_instrumental: bool,
+    builder_state: Optional[DatasetBuilder],
+) -> Tuple[Any, str, Any, DatasetBuilder]:
+    """Scan a directory for audio files.
+    Returns:
+        Tuple of (table_data, status, slider_update, builder_state)
+    """
+    if not audio_dir or not audio_dir.strip():
+        return [], "❌ Please enter a directory path", gr.Slider(maximum=0, value=0), builder_state
+    # Create or use existing builder
+    builder = builder_state if builder_state else DatasetBuilder()
+    # Set metadata before scanning
+    builder.metadata.name = dataset_name
+    builder.metadata.custom_tag = custom_tag
+    builder.metadata.tag_position = tag_position
+    builder.metadata.all_instrumental = all_instrumental
+    # Scan directory
+    samples, status = builder.scan_directory(audio_dir.strip())
+    if not samples:
+        return [], status, gr.Slider(maximum=0, value=0), builder
+    # Set instrumental and tag for all samples
+    builder.set_all_instrumental(all_instrumental)
+    if custom_tag:
+        builder.set_custom_tag(custom_tag, tag_position)
+    # Get table data
+    table_data = builder.get_samples_dataframe_data()
+    # Calculate slider max and return as Slider update
+    slider_max = max(0, len(samples) - 1)
+    return table_data, status, gr.Slider(maximum=slider_max, value=0), builder
+def auto_label_all(
+    dit_handler,
+    llm_handler,
+    builder_state: Optional[DatasetBuilder],
+    skip_metas: bool = False,
+    progress=None,
+) -> Tuple[List[List[Any]], str, DatasetBuilder]:
+    """Auto-label all samples in the dataset.
+    Args:
+        dit_handler: DiT handler for audio processing
+        llm_handler: LLM handler for caption generation
+        builder_state: Dataset builder state
+        skip_metas: If True, skip LLM labeling. BPM/Key/TimeSig = N/A, Language = unknown for instrumental
+        progress: Progress callback
+    Returns:
+        Tuple of (table_data, status, builder_state)
+    """
+    if builder_state is None:
+        return [], "❌ Please scan a directory first", builder_state
+    if not builder_state.samples:
+        return [], "❌ No samples to label. Please scan a directory first.", builder_state
+    # If skip_metas is True, just set default values without LLM
+    if skip_metas:
+        for sample in builder_state.samples:
+            sample.bpm = None  # Will display as N/A
+            sample.keyscale = "N/A"
+            sample.timesignature = "N/A"
+            # For instrumental, language should be "unknown"
+            if sample.is_instrumental:
+                sample.language = "unknown"
+            else:
+                sample.language = "unknown"
+            # Use custom tag as caption if set, otherwise use filename
+            if builder_state.metadata.custom_tag:
+                sample.caption = builder_state.metadata.custom_tag
+            else:
+                sample.caption = sample.filename
+        table_data = builder_state.get_samples_dataframe_data()
+        return table_data, f"✅ Skipped AI labeling. {len(builder_state.samples)} samples set with default values.", builder_state
+    # Check if handlers are initialized
+    if dit_handler is None or dit_handler.model is None:
+        return builder_state.get_samples_dataframe_data(), "❌ Model not initialized. Please initialize the service first.", builder_state
+    if llm_handler is None or not llm_handler.llm_initialized:
+        return builder_state.get_samples_dataframe_data(), "❌ LLM not initialized. Please initialize the service with LLM enabled.", builder_state
+    def progress_callback(msg):
+        if progress:
+            try:
+                progress(msg)
+            except:
+                pass
+    # Label all samples
+    samples, status = builder_state.label_all_samples(
+        dit_handler=dit_handler,
+        llm_handler=llm_handler,
+        progress_callback=progress_callback,
+    )
+    # Get updated table data
+    table_data = builder_state.get_samples_dataframe_data()
+    return table_data, status, builder_state
+def get_sample_preview(
+    sample_idx: int,
+    builder_state: Optional[DatasetBuilder],
+) -> Tuple[str, str, str, str, Optional[int], str, str, float, str, bool]:
+    """Get preview data for a specific sample.
+    Returns:
+        Tuple of (audio_path, filename, caption, lyrics, bpm, keyscale, timesig, duration, language, instrumental)
+    """
+    if builder_state is None or not builder_state.samples:
+        return None, "", "", "", None, "", "", 0.0, "instrumental", True
+    idx = int(sample_idx)
+    if idx < 0 or idx >= len(builder_state.samples):
+        return None, "", "", "", None, "", "", 0.0, "instrumental", True
+    sample = builder_state.samples[idx]
+    return (
+        sample.audio_path,
+        sample.filename,
+        sample.caption,
+        sample.lyrics,
+        sample.bpm,
+        sample.keyscale,
+        sample.timesignature,
+        sample.duration,
+        sample.language,
+        sample.is_instrumental,
+    )
+def save_sample_edit(
+    sample_idx: int,
+    caption: str,
+    lyrics: str,
+    bpm: Optional[int],
+    keyscale: str,
+    timesig: str,
+    language: str,
+    is_instrumental: bool,
+    builder_state: Optional[DatasetBuilder],
+) -> Tuple[List[List[Any]], str, DatasetBuilder]:
+    """Save edits to a sample.
+    Returns:
+        Tuple of (table_data, status, builder_state)
+    """
+    if builder_state is None:
+        return [], "❌ No dataset loaded", builder_state
+    idx = int(sample_idx)
+    # Update sample
+    sample, status = builder_state.update_sample(
+        idx,
+        caption=caption,
+        lyrics=lyrics if not is_instrumental else "[Instrumental]",
+        bpm=int(bpm) if bpm else None,
+        keyscale=keyscale,
+        timesignature=timesig,
+        language="instrumental" if is_instrumental else language,
+        is_instrumental=is_instrumental,
+        labeled=True,
+    )
+    # Get updated table data
+    table_data = builder_state.get_samples_dataframe_data()
+    return table_data, status, builder_state
+def update_settings(
+    custom_tag: str,
+    tag_position: str,
+    all_instrumental: bool,
+    builder_state: Optional[DatasetBuilder],
+) -> DatasetBuilder:
+    """Update dataset settings.
+    Returns:
+        Updated builder_state
+    """
+    if builder_state is None:
+        return builder_state
+    if custom_tag:
+        builder_state.set_custom_tag(custom_tag, tag_position)
+    builder_state.set_all_instrumental(all_instrumental)
+    return builder_state
+def save_dataset(
+    save_path: str,
+    dataset_name: str,
+    builder_state: Optional[DatasetBuilder],
+) -> str:
+    """Save the dataset to a JSON file.
+    Returns:
+        Status message
+    """
+    if builder_state is None:
+        return "❌ No dataset to save. Please scan a directory first."
+    if not builder_state.samples:
+        return "❌ No samples in dataset."
+    if not save_path or not save_path.strip():
+        return "❌ Please enter a save path."
+    # Check if any samples are labeled
+    labeled_count = builder_state.get_labeled_count()
+    if labeled_count == 0:
+        return "⚠️ Warning: No samples have been labeled. Consider auto-labeling first.\nSaving anyway..."
+    return builder_state.save_dataset(save_path.strip(), dataset_name)
+def load_existing_dataset_for_preprocess(
+    dataset_path: str,
+    builder_state: Optional[DatasetBuilder],
+) -> Tuple[str, Any, Any, DatasetBuilder, str, str, str, str, Optional[int], str, str, float, str, bool]:
+    """Load an existing dataset JSON file for preprocessing.
+    This allows users to load a previously saved dataset and proceed to preprocessing
+    without having to re-scan and re-label.
+    Returns:
+        Tuple of (status, table_data, slider_update, builder_state,
+                  audio_path, filename, caption, lyrics, bpm, keyscale, timesig, duration, language, instrumental)
+    """
+    empty_preview = (None, "", "", "", None, "", "", 0.0, "instrumental", True)
+    if not dataset_path or not dataset_path.strip():
+        return ("❌ Please enter a dataset path", [], gr.Slider(maximum=0, value=0), builder_state) + empty_preview
+    dataset_path = dataset_path.strip()
+    if not os.path.exists(dataset_path):
+        return (f"❌ Dataset not found: {dataset_path}", [], gr.Slider(maximum=0, value=0), builder_state) + empty_preview
+    # Create new builder (don't reuse old state when loading a file)
+    builder = DatasetBuilder()
+    # Load the dataset
+    samples, status = builder.load_dataset(dataset_path)
+    if not samples:
+        return (status, [], gr.Slider(maximum=0, value=0), builder) + empty_preview
+    # Get table data
+    table_data = builder.get_samples_dataframe_data()
+    # Calculate slider max
+    slider_max = max(0, len(samples) - 1)
+    # Create info text
+    labeled_count = builder.get_labeled_count()
+    info = f"✅ Loaded dataset: {builder.metadata.name}\n"
+    info += f"📊 Samples: {len(samples)} ({labeled_count} labeled)\n"
+    info += f"🏷️ Custom Tag: {builder.metadata.custom_tag or '(none)'}\n"
+    info += "📝 Ready for preprocessing! You can also edit samples below."
+    # Get first sample preview
+    first_sample = builder.samples[0]
+    preview = (
+        first_sample.audio_path,
+        first_sample.filename,
+        first_sample.caption,
+        first_sample.lyrics,
+        first_sample.bpm,
+        first_sample.keyscale,
+        first_sample.timesignature,
+        first_sample.duration,
+        first_sample.language,
+        first_sample.is_instrumental,
+    )
+    return (info, table_data, gr.Slider(maximum=slider_max, value=0), builder) + preview
+def preprocess_dataset(
+    output_dir: str,
+    dit_handler,
+    builder_state: Optional[DatasetBuilder],
+    progress=None,
+) -> str:
+    """Preprocess dataset to tensor files for fast training.
+    This converts audio files to VAE latents and text to embeddings.
+    Returns:
+        Status message
+    """
+    if builder_state is None:
+        return "❌ No dataset loaded. Please scan a directory first."
+    if not builder_state.samples:
+        return "❌ No samples in dataset."
+    labeled_count = builder_state.get_labeled_count()
+    if labeled_count == 0:
+        return "❌ No labeled samples. Please auto-label or manually label samples first."
+    if not output_dir or not output_dir.strip():
+        return "❌ Please enter an output directory."
+    if dit_handler is None or dit_handler.model is None:
+        return "❌ Model not initialized. Please initialize the service first."
+    def progress_callback(msg):
+        if progress:
+            try:
+                progress(msg)
+            except:
+                pass
+    # Run preprocessing
+    output_paths, status = builder_state.preprocess_to_tensors(
+        dit_handler=dit_handler,
+        output_dir=output_dir.strip(),
+        progress_callback=progress_callback,
+    )
+    return status
+def load_training_dataset(
+    tensor_dir: str,
+) -> str:
+    """Load a preprocessed tensor dataset for training.
+    Returns:
+        Info text about the dataset
+    """
+    if not tensor_dir or not tensor_dir.strip():
+        return "❌ Please enter a tensor directory path"
+    tensor_dir = tensor_dir.strip()
+    if not os.path.exists(tensor_dir):
+        return f"❌ Directory not found: {tensor_dir}"
+    if not os.path.isdir(tensor_dir):
+        return f"❌ Not a directory: {tensor_dir}"
+    # Check for manifest
+    manifest_path = os.path.join(tensor_dir, "manifest.json")
+    if os.path.exists(manifest_path):
+        try:
+            with open(manifest_path, 'r') as f:
+                manifest = json.load(f)
+            num_samples = manifest.get("num_samples", 0)
+            metadata = manifest.get("metadata", {})
+            name = metadata.get("name", "Unknown")
+            custom_tag = metadata.get("custom_tag", "")
+            info = f"✅ Loaded preprocessed dataset: {name}\n"
+            info += f"📊 Samples: {num_samples} preprocessed tensors\n"
+            info += f"🏷️ Custom Tag: {custom_tag or '(none)'}"
+            return info
+        except Exception as e:
+            logger.warning(f"Failed to read manifest: {e}")
+    # Fallback: count .pt files
+    pt_files = [f for f in os.listdir(tensor_dir) if f.endswith('.pt')]
+    if not pt_files:
+        return f"❌ No .pt tensor files found in {tensor_dir}"
+    info = f"✅ Found {len(pt_files)} tensor files in {tensor_dir}\n"
+    info += "⚠️ No manifest.json found - using all .pt files"
+    return info
+# Training handlers
+import time
+import re
+def _format_duration(seconds):
+    """Format seconds to human readable string."""
+    seconds = int(seconds)
+    if seconds < 60:
+        return f"{seconds}s"
+    elif seconds < 3600:
+        return f"{seconds // 60}m {seconds % 60}s"
+    else:
+        return f"{seconds // 3600}h {(seconds % 3600) // 60}m"
+def start_training(
+    tensor_dir: str,
+    dit_handler,
+    lora_rank: int,
+    lora_alpha: int,
+    lora_dropout: float,
+    learning_rate: float,
+    train_epochs: int,
+    train_batch_size: int,
+    gradient_accumulation: int,
+    save_every_n_epochs: int,
+    training_shift: float,
+    training_seed: int,
+    lora_output_dir: str,
+    training_state: Dict,
+    progress=None,
+):
+    """Start LoRA training from preprocessed tensors.
+    This is a generator function that yields progress updates.
+    """
+    if not tensor_dir or not tensor_dir.strip():
+        yield "❌ Please enter a tensor directory path", "", None, training_state
+        return
+    tensor_dir = tensor_dir.strip()
+    if not os.path.exists(tensor_dir):
+        yield f"❌ Tensor directory not found: {tensor_dir}", "", None, training_state
+        return
+    if dit_handler is None or dit_handler.model is None:
+        yield "❌ Model not initialized. Please initialize the service first.", "", None, training_state
+        return
+    # Check for required training dependencies
+    try:
+        from lightning.fabric import Fabric
+        from peft import get_peft_model, LoraConfig
+    except ImportError as e:
+        yield f"❌ Missing required packages: {e}\nPlease install: pip install peft lightning", "", None, training_state
+        return
+    training_state["is_training"] = True
+    training_state["should_stop"] = False
+    try:
+        from acestep.training.trainer import LoRATrainer
+        from acestep.training.configs import LoRAConfig as LoRAConfigClass, TrainingConfig
+        # Create configs
+        lora_config = LoRAConfigClass(
+            r=lora_rank,
+            alpha=lora_alpha,
+            dropout=lora_dropout,
+        )
+        training_config = TrainingConfig(
+            shift=training_shift,
+            learning_rate=learning_rate,
+            batch_size=train_batch_size,
+            gradient_accumulation_steps=gradient_accumulation,
+            max_epochs=train_epochs,
+            save_every_n_epochs=save_every_n_epochs,
+            seed=training_seed,
+            output_dir=lora_output_dir,
+        )
+        import pandas as pd
+        # Initialize training log and loss history
+        log_lines = []
+        loss_data = pd.DataFrame({"step": [0], "loss": [0.0]})
+        # Start timer
+        start_time = time.time()
+        yield f"🚀 Starting training from {tensor_dir}...", "", loss_data, training_state
+        # Create trainer
+        trainer = LoRATrainer(
+            dit_handler=dit_handler,
+            lora_config=lora_config,
+            training_config=training_config,
+        )
+        # Collect loss history
+        step_list = []
+        loss_list = []
+        # Train with progress updates using preprocessed tensors
+        for step, loss, status in trainer.train_from_preprocessed(tensor_dir, training_state):
+            # Calculate elapsed time and ETA
+            elapsed_seconds = time.time() - start_time
+            time_info = f"⏱️ Elapsed: {_format_duration(elapsed_seconds)}"
+            # Parse "Epoch x/y" from status to calculate ETA
+            match = re.search(r"Epoch\s+(\d+)/(\d+)", str(status))
+            if match:
+                current_ep = int(match.group(1))
+                total_ep = int(match.group(2))
+                if current_ep > 0:
+                    eta_seconds = (elapsed_seconds / current_ep) * (total_ep - current_ep)
+                    time_info += f" | ETA: ~{_format_duration(eta_seconds)}"
+            # Display status with time info
+            display_status = f"{status}\n{time_info}"
+            # Terminal log
+            log_msg = f"[{_format_duration(elapsed_seconds)}] Step {step}: {status}"
+            logger.info(log_msg)
+            # Add to UI log
+            log_lines.append(status)
+            if len(log_lines) > 15:
+                log_lines = log_lines[-15:]
+            log_text = "\n".join(log_lines)
+            # Track loss for plot (only valid values)
+            if step > 0 and loss is not None and loss == loss:  # Check for NaN
+                step_list.append(step)
+                loss_list.append(float(loss))
+                loss_data = pd.DataFrame({"step": step_list, "loss": loss_list})
+            yield display_status, log_text, loss_data, training_state
+            if training_state.get("should_stop", False):
+                logger.info("⏹️ Training stopped by user")
+                log_lines.append("⏹️ Training stopped by user")
+                yield f"⏹️ Stopped ({time_info})", "\n".join(log_lines[-15:]), loss_data, training_state
+                break
+        total_time = time.time() - start_time
+        training_state["is_training"] = False
+        completion_msg = f"✅ Training completed! Total time: {_format_duration(total_time)}"
+        logger.info(completion_msg)
+        log_lines.append(completion_msg)
+        yield completion_msg, "\n".join(log_lines[-15:]), loss_data, training_state
+    except Exception as e:
+        logger.exception("Training error")
+        training_state["is_training"] = False
+        import pandas as pd
+        empty_df = pd.DataFrame({"step": [], "loss": []})
+        yield f"❌ Error: {str(e)}", str(e), empty_df, training_state
+def stop_training(training_state: Dict) -> Tuple[str, Dict]:
+    """Stop the current training process.
+    Returns:
+        Tuple of (status, training_state)
+    """
+    if not training_state.get("is_training", False):
+        return "⚠️ No training in progress", training_state
+    training_state["should_stop"] = True
+    return "⏹️ Stopping training...", training_state
+def export_lora(
+    export_path: str,
+    lora_output_dir: str,
+) -> str:
+    """Export the trained LoRA weights.
+    Returns:
+        Status message
+    """
+    if not export_path or not export_path.strip():
+        return "❌ Please enter an export path"
+    # Check if there's a trained model to export
+    final_dir = os.path.join(lora_output_dir, "final")
+    checkpoint_dir = os.path.join(lora_output_dir, "checkpoints")
+    # Prefer final, fallback to checkpoints
+    if os.path.exists(final_dir):
+        source_path = final_dir
+    elif os.path.exists(checkpoint_dir):
+        # Find the latest checkpoint
+        checkpoints = [d for d in os.listdir(checkpoint_dir) if d.startswith("epoch_")]
+        if not checkpoints:
+            return "❌ No checkpoints found"
+        checkpoints.sort(key=lambda x: int(x.split("_")[1]))
+        latest = checkpoints[-1]
+        source_path = os.path.join(checkpoint_dir, latest)
+    else:
+        return f"❌ No trained model found in {lora_output_dir}"
+    try:
+        import shutil
+        export_path = export_path.strip()
+        os.makedirs(os.path.dirname(export_path) if os.path.dirname(export_path) else ".", exist_ok=True)
+        if os.path.exists(export_path):
+            shutil.rmtree(export_path)
+        shutil.copytree(source_path, export_path)
+        return f"✅ LoRA exported to {export_path}"
+    except Exception as e:
+        logger.exception("Export error")
+        return f"❌ Export failed: {str(e)}"

code/acestep/gradio_ui/i18n.py ADDED Viewed

	@@ -0,0 +1,152 @@

+"""
+Internationalization (i18n) module for Gradio UI
+Supports multiple languages with easy translation management
+"""
+import os
+import json
+from typing import Dict, Optional
+class I18n:
+    """Internationalization handler"""
+    def __init__(self, default_language: str = "en"):
+        """
+        Initialize i18n handler
+        Args:
+            default_language: Default language code (en, zh, ja, etc.)
+        """
+        self.current_language = default_language
+        self.translations: Dict[str, Dict[str, str]] = {}
+        self._load_all_translations()
+    def _load_all_translations(self):
+        """Load all translation files from i18n directory"""
+        current_file = os.path.abspath(__file__)
+        module_dir = os.path.dirname(current_file)
+        i18n_dir = os.path.join(module_dir, "i18n")
+        if not os.path.exists(i18n_dir):
+            # Create i18n directory if it doesn't exist
+            os.makedirs(i18n_dir)
+            return
+        # Load all JSON files in i18n directory
+        for filename in os.listdir(i18n_dir):
+            if filename.endswith(".json"):
+                lang_code = filename[:-5]  # Remove .json extension
+                filepath = os.path.join(i18n_dir, filename)
+                try:
+                    with open(filepath, 'r', encoding='utf-8') as f:
+                        self.translations[lang_code] = json.load(f)
+                except Exception as e:
+                    print(f"Error loading translation file {filename}: {e}")
+    def set_language(self, language: str):
+        """Set current language"""
+        if language in self.translations:
+            self.current_language = language
+        else:
+            print(f"Warning: Language '{language}' not found, using default")
+    def t(self, key: str, **kwargs) -> str:
+        """
+        Translate a key to current language
+        Args:
+            key: Translation key (dot-separated for nested keys)
+            **kwargs: Optional format parameters
+        Returns:
+            Translated string
+        """
+        # Get translation from current language
+        translation = self._get_nested_value(
+            self.translations.get(self.current_language, {}),
+            key
+        )
+        # Fallback to English if not found
+        if translation is None:
+            translation = self._get_nested_value(
+                self.translations.get('en', {}),
+                key
+            )
+        # Final fallback to key itself
+        if translation is None:
+            translation = key
+        # Apply formatting if kwargs provided
+        if kwargs:
+            try:
+                translation = translation.format(**kwargs)
+            except KeyError:
+                pass
+        return translation
+    def _get_nested_value(self, data: dict, key: str) -> Optional[str]:
+        """
+        Get nested dictionary value using dot notation
+        Args:
+            data: Dictionary to search
+            key: Dot-separated key (e.g., "section.subsection.key")
+        Returns:
+            Value if found, None otherwise
+        """
+        keys = key.split('.')
+        current = data
+        for k in keys:
+            if isinstance(current, dict) and k in current:
+                current = current[k]
+            else:
+                return None
+        return current if isinstance(current, str) else None
+    def get_available_languages(self) -> list:
+        """Get list of available language codes"""
+        return list(self.translations.keys())
+# Global i18n instance
+_i18n_instance: Optional[I18n] = None
+def get_i18n(language: Optional[str] = None) -> I18n:
+    """
+    Get global i18n instance
+    Args:
+        language: Optional language to set
+    Returns:
+        I18n instance
+    """
+    global _i18n_instance
+    if _i18n_instance is None:
+        _i18n_instance = I18n(default_language=language or "en")
+    elif language is not None:
+        _i18n_instance.set_language(language)
+    return _i18n_instance
+def t(key: str, **kwargs) -> str:
+    """
+    Convenience function for translation
+    Args:
+        key: Translation key
+        **kwargs: Optional format parameters
+    Returns:
+        Translated string
+    """
+    return get_i18n().t(key, **kwargs)

code/acestep/gradio_ui/i18n/en.json ADDED Viewed

	@@ -0,0 +1,243 @@

+{
+  "app": {
+    "title": "🎛️ ACE-Step V1.5 Playground💡",
+    "subtitle": "Pushing the Boundaries of Open-Source Music Generation"
+  },
+  "dataset": {
+    "title": "📊 Dataset Explorer",
+    "dataset_label": "Dataset",
+    "dataset_info": "Choose dataset to explore",
+    "import_btn": "📥 Import Dataset",
+    "search_type_label": "Search Type",
+    "search_type_info": "How to find items",
+    "search_value_label": "Search Value",
+    "search_value_placeholder": "Enter keys or index (leave empty for random)",
+    "search_value_info": "Keys: exact match, Index: 0 to dataset size-1",
+    "instruction_label": "📝 Instruction",
+    "instruction_placeholder": "No instruction available",
+    "metadata_title": "📋 Item Metadata (JSON)",
+    "metadata_label": "Complete Item Information",
+    "source_audio": "Source Audio",
+    "target_audio": "Target Audio",
+    "reference_audio": "Reference Audio",
+    "get_item_btn": "🔍 Get Item",
+    "use_src_checkbox": "Use Source Audio from Dataset",
+    "use_src_info": "Check to use the source audio from dataset",
+    "data_status_label": "📊 Data Status",
+    "data_status_default": "❌ No dataset imported",
+    "autofill_btn": "📋 Auto-fill Generation Form"
+  },
+  "service": {
+    "title": "🔧 Service Configuration",
+    "checkpoint_label": "Checkpoint File",
+    "checkpoint_info": "Select a trained model checkpoint file (full path or filename)",
+    "refresh_btn": "🔄 Refresh",
+    "model_path_label": "Main Model Path",
+    "model_path_info": "Select the model configuration directory (auto-scanned from checkpoints)",
+    "device_label": "Device",
+    "device_info": "Processing device (auto-detect recommended)",
+    "lm_model_path_label": "5Hz LM Model Path",
+    "lm_model_path_info": "Select the 5Hz LM model checkpoint (auto-scanned from checkpoints)",
+    "backend_label": "5Hz LM Backend",
+    "backend_info": "Select backend for 5Hz LM: vllm (faster) or pt (PyTorch, more compatible)",
+    "init_llm_label": "Initialize 5Hz LM",
+    "init_llm_info": "Check to initialize 5Hz LM during service initialization",
+    "flash_attention_label": "Use Flash Attention",
+    "flash_attention_info_enabled": "Enable flash attention for faster inference (requires flash_attn package)",
+    "flash_attention_info_disabled": "Flash attention not available (flash_attn package not installed)",
+    "offload_cpu_label": "Offload to CPU",
+    "offload_cpu_info": "Offload models to CPU when not in use to save GPU memory",
+    "offload_dit_cpu_label": "Offload DiT to CPU",
+    "offload_dit_cpu_info": "Offload DiT to CPU (needs Offload to CPU)",
+    "init_btn": "Initialize Service",
+    "status_label": "Status",
+    "language_label": "UI Language",
+    "language_info": "Select interface language"
+  },
+  "generation": {
+    "required_inputs": "📝 Required Inputs",
+    "task_type_label": "Task Type",
+    "task_type_info": "Select the task type for generation",
+    "instruction_label": "Instruction",
+    "instruction_info": "Instruction is automatically generated based on task type",
+    "load_btn": "Load",
+    "track_name_label": "Track Name",
+    "track_name_info": "Select track name for lego/extract tasks",
+    "track_classes_label": "Track Names",
+    "track_classes_info": "Select multiple track classes for complete task",
+    "audio_uploads": "🎵 Audio Uploads",
+    "reference_audio": "Reference Audio (optional)",
+    "source_audio": "Source Audio (optional)",
+    "convert_codes_btn": "Convert to Codes",
+    "lm_codes_hints": "🎼 LM Codes Hints",
+    "lm_codes_label": "LM Codes Hints",
+    "lm_codes_placeholder": "<|audio_code_10695|><|audio_code_54246|>...",
+    "lm_codes_info": "Paste LM codes hints for text2music generation",
+    "lm_codes_sample": "LM Codes Hints (Sample {n})",
+    "lm_codes_sample_info": "Codes for sample {n}",
+    "transcribe_btn": "Transcribe",
+    "repainting_controls": "🎨 Repainting Controls (seconds)",
+    "repainting_start": "Repainting Start",
+    "repainting_end": "Repainting End",
+    "mode_label": "Generation Mode",
+    "mode_info": "Simple: describe music in natural language. Custom: full control over caption and lyrics.",
+    "mode_simple": "Simple",
+    "mode_custom": "Custom",
+    "simple_query_label": "Song Description",
+    "simple_query_placeholder": "Describe the music you want to create, e.g., 'a soft Bengali love song for a quiet evening'. Leave empty for a random sample.",
+    "simple_query_info": "Enter a natural language description of the music you want to generate",
+    "simple_vocal_language_label": "Vocal Language (optional)",
+    "simple_vocal_language_info": "Select preferred language(s) for lyrics. Use 'unknown' for any language.",
+    "create_sample_btn": "Create Sample",
+    "caption_title": "📝 Music Caption",
+    "caption_label": "Music Caption (optional)",
+    "caption_placeholder": "A peaceful acoustic guitar melody with soft vocals...",
+    "caption_info": "Describe the style, genre, instruments, and mood",
+    "lyrics_title": "📝 Lyrics",
+    "lyrics_label": "Lyrics (optional)",
+    "lyrics_placeholder": "[Verse 1]\\nUnder the starry night\\nI feel so alive...",
+    "lyrics_info": "Song lyrics with structure",
+    "instrumental_label": "Instrumental",
+    "format_btn": "Format",
+    "optional_params": "⚙️ Optional Parameters",
+    "vocal_language_label": "Vocal Language (optional)",
+    "vocal_language_info": "Use `unknown` for instrumental tracks",
+    "bpm_label": "BPM (optional)",
+    "bpm_info": "leave empty for N/A",
+    "keyscale_label": "KeyScale (optional)",
+    "keyscale_placeholder": "Leave empty for N/A",
+    "keyscale_info": "A-G, #/♭, major/minor",
+    "timesig_label": "Time Signature (optional)",
+    "timesig_info": "2/4, 3/4, 4/4...",
+    "duration_label": "Audio Duration (seconds)",
+    "duration_info": "Use -1 for random",
+    "batch_size_label": "Batch Size",
+    "batch_size_info": "Number of audio to generate (max 8)",
+    "advanced_settings": "🔧 Advanced Settings",
+    "inference_steps_label": "DiT Inference Steps",
+    "inference_steps_info": "Turbo: max 8, Base: max 200",
+    "guidance_scale_label": "DiT Guidance Scale (Only supported for base models)",
+    "guidance_scale_info": "Higher values follow text more closely",
+    "seed_label": "Seed",
+    "seed_info": "Use comma-separated values for batches",
+    "random_seed_label": "Random Seed",
+    "random_seed_info": "Enable to auto-generate seeds",
+    "audio_format_label": "Audio Format",
+    "audio_format_info": "Audio format for saved files",
+    "use_adg_label": "Use ADG",
+    "use_adg_info": "Enable Angle Domain Guidance",
+    "shift_label": "Shift",
+    "shift_info": "Timestep shift factor for base models (range 1.0~5.0, default 3.0). Not effective for turbo models.",
+    "infer_method_label": "Inference Method",
+    "infer_method_info": "Diffusion inference method. ODE (Euler) is faster, SDE (stochastic) may produce different results.",
+    "custom_timesteps_label": "Custom Timesteps",
+    "custom_timesteps_info": "Optional: comma-separated values from 1.0 to 0.0 (e.g., '0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0'). Overrides inference steps and shift.",
+    "cfg_interval_start": "CFG Interval Start",
+    "cfg_interval_end": "CFG Interval End",
+    "lm_params_title": "🤖 LM Generation Parameters",
+    "lm_temperature_label": "LM Temperature",
+    "lm_temperature_info": "5Hz LM temperature (higher = more random)",
+    "lm_cfg_scale_label": "LM CFG Scale",
+    "lm_cfg_scale_info": "5Hz LM CFG (1.0 = no CFG)",
+    "lm_top_k_label": "LM Top-K",
+    "lm_top_k_info": "Top-K (0 = disabled)",
+    "lm_top_p_label": "LM Top-P",
+    "lm_top_p_info": "Top-P (1.0 = disabled)",
+    "lm_negative_prompt_label": "LM Negative Prompt",
+    "lm_negative_prompt_placeholder": "Enter negative prompt for CFG (default: NO USER INPUT)",
+    "lm_negative_prompt_info": "Negative prompt (use when LM CFG Scale > 1.0)",
+    "cot_metas_label": "CoT Metas",
+    "cot_metas_info": "Use LM to generate CoT metadata (uncheck to skip LM CoT generation)",
+    "cot_language_label": "CoT Language",
+    "cot_language_info": "Generate language in CoT (chain-of-thought)",
+    "constrained_debug_label": "Constrained Decoding Debug",
+    "constrained_debug_info": "Enable debug logging for constrained decoding (check to see detailed logs)",
+    "auto_score_label": "Auto Score",
+    "auto_score_info": "Automatically calculate quality scores for all generated audios",
+    "auto_lrc_label": "Auto LRC",
+    "auto_lrc_info": "Automatically generate LRC lyrics timestamps for all generated audios",
+    "lm_batch_chunk_label": "LM Batch Chunk Size",
+    "lm_batch_chunk_info": "Max items per LM batch chunk (default: 8, limited by GPU memory)",
+    "codes_strength_label": "LM Codes Strength",
+    "codes_strength_info": "Control how many denoising steps use LM-generated codes",
+    "cover_strength_label": "Audio Cover Strength",
+    "cover_strength_info": "Control how many denoising steps use cover mode",
+    "score_sensitivity_label": "Quality Score Sensitivity",
+    "score_sensitivity_info": "Lower = more sensitive (default: 1.0). Adjusts how PMI maps to [0,1]",
+    "think_label": "Think",
+    "parallel_thinking_label": "ParallelThinking",
+    "generate_btn": "🎵 Generate Music",
+    "autogen_label": "AutoGen",
+    "caption_rewrite_label": "CaptionRewrite"
+  },
+  "results": {
+    "title": "🎵 Results",
+    "generated_music": "🎵 Generated Music (Sample {n})",
+    "send_to_src_btn": "🔗 Send To Src Audio",
+    "save_btn": "💾 Save",
+    "score_btn": "📊 Score",
+    "lrc_btn": "🎵 LRC",
+    "quality_score_label": "Quality Score (Sample {n})",
+    "quality_score_placeholder": "Click 'Score' to calculate perplexity-based quality score",
+    "codes_label": "LM Codes (Sample {n})",
+    "lrc_label": "Lyrics Timestamps (Sample {n})",
+    "lrc_placeholder": "Click 'LRC' to generate timestamps",
+    "details_accordion": "📊 Score & LRC & LM Codes",
+    "generation_status": "Generation Status",
+    "current_batch": "Current Batch",
+    "batch_indicator": "Batch {current} / {total}",
+    "next_batch_status": "Next Batch Status",
+    "prev_btn": "◀ Previous",
+    "next_btn": "Next ▶",
+    "restore_params_btn": "↙️ Apply These Settings to UI (Restore Batch Parameters)",
+    "batch_results_title": "📁 Batch Results & Generation Details",
+    "all_files_label": "📁 All Generated Files (Download)",
+    "generation_details": "Generation Details"
+  },
+  "messages": {
+    "no_audio_to_save": "❌ No audio to save",
+    "save_success": "✅ Saved audio and metadata to {filename}",
+    "save_failed": "❌ Failed to save: {error}",
+    "no_file_selected": "⚠️ No file selected",
+    "params_loaded": "✅ Parameters loaded from {filename}",
+    "invalid_json": "❌ Invalid JSON file: {error}",
+    "load_error": "❌ Error loading file: {error}",
+    "example_loaded": "📁 Loaded example from {filename}",
+    "example_failed": "Failed to parse JSON file {filename}: {error}",
+    "example_error": "Error loading example: {error}",
+    "lm_generated": "🤖 Generated example using LM",
+    "lm_fallback": "Failed to generate example using LM, falling back to examples directory",
+    "lm_not_initialized": "❌ 5Hz LM not initialized. Please initialize it first.",
+    "autogen_enabled": "🔄 AutoGen enabled - next batch will generate after this",
+    "batch_ready": "✅ Batch {n} ready! Click 'Next' to view.",
+    "batch_generating": "🔄 Starting background generation for Batch {n}...",
+    "batch_failed": "❌ Background generation failed: {error}",
+    "viewing_batch": "✅ Viewing Batch {n}",
+    "at_first_batch": "Already at first batch",
+    "at_last_batch": "No next batch available",
+    "batch_not_found": "Batch {n} not found in queue",
+    "no_batch_data": "No batch data found to restore.",
+    "params_restored": "✅ UI Parameters restored from Batch {n}",
+    "scoring_failed": "❌ Error: Batch data not found",
+    "no_codes": "❌ No audio codes available. Please generate music first.",
+    "score_failed": "❌ Scoring failed: {error}",
+    "score_error": "❌ Error calculating score: {error}",
+    "lrc_no_batch_data": "❌ No batch data found. Please generate music first.",
+    "lrc_no_extra_outputs": "❌ No extra outputs found. Condition tensors not available.",
+    "lrc_missing_tensors": "❌ Missing required tensors for LRC generation.",
+    "lrc_sample_not_exist": "❌ Sample does not exist in current batch.",
+    "lrc_empty_result": "⚠️ LRC generation produced empty result.",
+    "empty_query": "⚠️ Please enter a music description.",
+    "sample_creation_failed": "❌ Failed to create sample. Please try again.",
+    "sample_created": "✅ Sample created! Review the caption and lyrics, then click Generate Music.",
+    "simple_examples_not_found": "⚠️ Simple mode examples directory not found.",
+    "simple_examples_empty": "⚠️ No example files found in simple mode examples.",
+    "simple_example_loaded": "🎲 Loaded random example from {filename}",
+    "format_success": "✅ Caption and lyrics formatted successfully",
+    "format_failed": "❌ Format failed: {error}",
+    "skipping_metas_cot": "⚡ Skipping Phase 1 metas COT (sample already formatted)",
+    "invalid_timesteps_format": "⚠️ Invalid timesteps format. Using default schedule.",
+    "timesteps_out_of_range": "⚠️ Timesteps must be in range [0, 1]. Using default schedule.",
+    "timesteps_count_mismatch": "⚠️ Timesteps count ({actual}) differs from inference_steps ({expected}). Using timesteps count."
+  }
+}

code/acestep/gradio_ui/i18n/ja.json ADDED Viewed

	@@ -0,0 +1,243 @@

+{
+  "app": {
+    "title": "🎛️ ACE-Step V1.5 プレイグラウンド💡",
+    "subtitle": "オープンソース音楽生成の限界を押し広げる"
+  },
+  "dataset": {
+    "title": "📊 データセットエクスプローラー",
+    "dataset_label": "データセット",
+    "dataset_info": "探索するデータセットを選択",
+    "import_btn": "📥 データセットをインポート",
+    "search_type_label": "検索タイプ",
+    "search_type_info": "アイテムの検索方法",
+    "search_value_label": "検索値",
+    "search_value_placeholder": "キーまたはインデックスを入力(空白の場合はランダム)",
+    "search_value_info": "キー: 完全一致、インデックス: 0からデータセットサイズ-1",
+    "instruction_label": "📝 指示",
+    "instruction_placeholder": "利用可能な指示がありません",
+    "metadata_title": "📋 アイテムメタデータ (JSON)",
+    "metadata_label": "完全なアイテム情報",
+    "source_audio": "ソースオーディオ",
+    "target_audio": "ターゲットオーディオ",
+    "reference_audio": "リファレンスオーディオ",
+    "get_item_btn": "🔍 アイテムを取得",
+    "use_src_checkbox": "データセットのソースオーディオを使用",
+    "use_src_info": "データセットのソースオーディオを使用する場合はチェック",
+    "data_status_label": "📊 データステータス",
+    "data_status_default": "❌ データセットがインポートされていません",
+    "autofill_btn": "📋 生成フォームを自動入力"
+  },
+  "service": {
+    "title": "🔧 サービス設定",
+    "checkpoint_label": "チェックポイントファイル",
+    "checkpoint_info": "訓練済みモデルのチェックポイントファイルを選択(フルパスまたはファイル名)",
+    "refresh_btn": "🔄 更新",
+    "model_path_label": "メインモデルパス",
+    "model_path_info": "モデル設定ディレクトリを選択(チェックポイントから自動スキャン)",
+    "device_label": "デバイス",
+    "device_info": "処理デバイス(自動検出を推奨)",
+    "lm_model_path_label": "5Hz LM モデルパス",
+    "lm_model_path_info": "5Hz LMモデルチェックポイントを選択(チェックポイントから自動スキャン)",
+    "backend_label": "5Hz LM バックエンド",
+    "backend_info": "5Hz LMのバックエンドを選択: vllm(高速)またはpt(PyTorch、より互換性あり)",
+    "init_llm_label": "5Hz LM を初期化",
+    "init_llm_info": "サービス初期化中に5Hz LMを初期化する場合はチェック",
+    "flash_attention_label": "Flash Attention を使用",
+    "flash_attention_info_enabled": "推論を高速化するためにflash attentionを有効にする(flash_attnパッケージが必要)",
+    "flash_attention_info_disabled": "Flash attentionは利用できません(flash_attnパッケージがインストールされていません)",
+    "offload_cpu_label": "CPUにオフロード",
+    "offload_cpu_info": "使用していない時にモデルをCPUにオフロードしてGPUメモリを節約",
+    "offload_dit_cpu_label": "DiTをCPUにオフロード",
+    "offload_dit_cpu_info": "DiTをCPUにオフロード(CPUへのオフロードが必要)",
+    "init_btn": "サービスを初期化",
+    "status_label": "ステータス",
+    "language_label": "UI言語",
+    "language_info": "インターフェース言語を選択"
+  },
+  "generation": {
+    "required_inputs": "📝 必須入力",
+    "task_type_label": "タスクタイプ",
+    "task_type_info": "生成のタスクタイプを選択",
+    "instruction_label": "指示",
+    "instruction_info": "指示はタスクタイプに基づいて自動生成されます",
+    "load_btn": "読み込む",
+    "track_name_label": "トラック名",
+    "track_name_info": "lego/extractタスクのトラック名を選択",
+    "track_classes_label": "トラック名",
+    "track_classes_info": "completeタスクの複数のトラッククラスを選択",
+    "audio_uploads": "🎵 オーディオアップロード",
+    "reference_audio": "リファレンスオーディオ(オプション)",
+    "source_audio": "ソースオーディオ(オプション)",
+    "convert_codes_btn": "コードに変換",
+    "lm_codes_hints": "🎼 LM コードヒント",
+    "lm_codes_label": "LM コードヒント",
+    "lm_codes_placeholder": "<|audio_code_10695|><|audio_code_54246|>...",
+    "lm_codes_info": "text2music生成用のLMコードヒントを貼り付け",
+    "lm_codes_sample": "LM コードヒント(サンプル {n})",
+    "lm_codes_sample_info": "サンプル{n}のコード",
+    "transcribe_btn": "転写",
+    "repainting_controls": "🎨 再描画コントロール(秒)",
+    "repainting_start": "再描画開始",
+    "repainting_end": "再描画終了",
+    "mode_label": "生成モード",
+    "mode_info": "シンプル：自然言語で音楽を説明。カスタム：キャプションと歌詞を完全にコントロール。",
+    "mode_simple": "シンプル",
+    "mode_custom": "カスタム",
+    "simple_query_label": "曲の説明",
+    "simple_query_placeholder": "作成したい音楽を説明してください。例：'静かな夜のための優しいベンガルのラブソング'。空欄の場合はランダムなサンプルが生成されます。",
+    "simple_query_info": "生成したい音楽の自然言語の説明を入力",
+    "simple_vocal_language_label": "ボーカル言語(オプション)",
+    "simple_vocal_language_info": "歌詞の希望言語を選択。任意の言語の場合は'unknown'を使用。",
+    "create_sample_btn": "サンプル作成",
+    "caption_title": "📝 音楽キャプション",
+    "caption_label": "音楽キャプション(オプション)",
+    "caption_placeholder": "柔らかいボーカルを伴う穏やかなアコースティックギターのメロディー...",
+    "caption_info": "スタイル、ジャンル、楽器、ムードを説明",
+    "lyrics_title": "📝 歌詞",
+    "lyrics_label": "歌詞(オプション)",
+    "lyrics_placeholder": "[バース1]\\n星空の下で\\nとても生きていると感じる...",
+    "lyrics_info": "構造を持つ曲の歌詞",
+    "instrumental_label": "インストゥルメンタル",
+    "format_btn": "フォーマット",
+    "optional_params": "⚙️ オプションパラメータ",
+    "vocal_language_label": "ボーカル言語(オプション)",
+    "vocal_language_info": "インストには`unknown`を使用",
+    "bpm_label": "BPM(オプション)",
+    "bpm_info": "空白の場合はN/A",
+    "keyscale_label": "キースケール(オプション)",
+    "keyscale_placeholder": "空白の場合はN/A",
+    "keyscale_info": "A-G, #/♭, メジャー/マイナー",
+    "timesig_label": "拍子記号(オプション)",
+    "timesig_info": "2/4, 3/4, 4/4...",
+    "duration_label": "オーディオ長(秒)",
+    "duration_info": "ランダムの場合は-1を使用",
+    "batch_size_label": "バッチサイズ",
+    "batch_size_info": "生成するオーディオの数(最大8)",
+    "advanced_settings": "🔧 詳細設定",
+    "inference_steps_label": "DiT 推論ステップ",
+    "inference_steps_info": "Turbo: 最大8、Base: 最大200",
+    "guidance_scale_label": "DiT ガイダンススケール(baseモデルのみサポート)",
+    "guidance_scale_info": "値が高いほどテキストに忠実に従う",
+    "seed_label": "シード",
+    "seed_info": "バッチにはカンマ区切りの値を使用",
+    "random_seed_label": "ランダムシード",
+    "random_seed_info": "有効にすると自動的にシードを生成",
+    "audio_format_label": "オーディオフォーマット",
+    "audio_format_info": "保存ファイルのオーディオフォーマット",
+    "use_adg_label": "ADG を使用",
+    "use_adg_info": "角度ドメインガイダンスを有効化",
+    "shift_label": "シフト",
+    "shift_info": "baseモデル用タイムステップシフト係数 (範囲 1.0~5.0、デフォルト 3.0)。turboモデルには無効。",
+    "infer_method_label": "推論方法",
+    "infer_method_info": "拡散推論方法。ODE (オイラー) は高速、SDE (確率的) は異なる結果を生成する可能性があります。",
+    "custom_timesteps_label": "カスタムタイムステップ",
+    "custom_timesteps_info": "オプション：1.0から0.0へのカンマ区切り値（例：'0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0'）。推論ステップとシフトを上書きします。",
+    "cfg_interval_start": "CFG 間隔開始",
+    "cfg_interval_end": "CFG 間隔終了",
+    "lm_params_title": "🤖 LM 生成パラメータ",
+    "lm_temperature_label": "LM 温度",
+    "lm_temperature_info": "5Hz LM温度(高いほどランダム)",
+    "lm_cfg_scale_label": "LM CFG スケール",
+    "lm_cfg_scale_info": "5Hz LM CFG (1.0 = CFGなし)",
+    "lm_top_k_label": "LM Top-K",
+    "lm_top_k_info": "Top-K (0 = 無効)",
+    "lm_top_p_label": "LM Top-P",
+    "lm_top_p_info": "Top-P (1.0 = 無効)",
+    "lm_negative_prompt_label": "LM ネガティブプロンプト",
+    "lm_negative_prompt_placeholder": "CFGのネガティブプロンプトを入力(デフォルト: NO USER INPUT)",
+    "lm_negative_prompt_info": "ネガティブプロンプト(LM CFGスケール > 1.0の場合に使用)",
+    "cot_metas_label": "CoT メタデータ",
+    "cot_metas_info": "LMを使用してCoTメタデータを生成(チェックを外すとLM CoT生成をスキップ)",
+    "cot_language_label": "CoT 言語",
+    "cot_language_info": "CoTで言語を生成(思考の連鎖)",
+    "constrained_debug_label": "制約付きデコーディングデバッグ",
+    "constrained_debug_info": "制約付きデコーディングのデバッグログを有効化(チェックすると詳細ログを表示)",
+    "auto_score_label": "自動スコアリング",
+    "auto_score_info": "生成されたすべてのオーディオの品質スコアを自動計算",
+    "auto_lrc_label": "自動 LRC",
+    "auto_lrc_info": "生成されたすべてのオーディオのLRC歌詞タイムスタンプを自動生成",
+    "lm_batch_chunk_label": "LM バッチチャンクサイズ",
+    "lm_batch_chunk_info": "LMバッチチャンクあたりの最大アイテム数(デフォルト: 8、GPUメモリによる制限)",
+    "codes_strength_label": "LM コード強度",
+    "codes_strength_info": "LM生成コードを使用するデノイジングステップ数を制御",
+    "cover_strength_label": "オーディオカバー強度",
+    "cover_strength_info": "カバーモードを使用するデノイジングステップ数を制御",
+    "score_sensitivity_label": "品質スコア感度",
+    "score_sensitivity_info": "低い = より敏感(デフォルト: 1.0)。PMIが[0,1]にマッピングする方法を調整",
+    "think_label": "思考",
+    "parallel_thinking_label": "並列思考",
+    "generate_btn": "🎵 音楽を生成",
+    "autogen_label": "自動生成",
+    "caption_rewrite_label": "キャプション書き換え"
+  },
+  "results": {
+    "title": "🎵 結果",
+    "generated_music": "🎵 生成された音楽(サンプル {n})",
+    "send_to_src_btn": "🔗 ソースオーディオに送信",
+    "save_btn": "💾 保存",
+    "score_btn": "📊 スコア",
+    "lrc_btn": "🎵 LRC",
+    "quality_score_label": "品質スコア(サンプル {n})",
+    "quality_score_placeholder": "'スコア'をクリックしてパープレキシティベースの品質スコアを計算",
+    "codes_label": "LM コード(サンプル {n})",
+    "lrc_label": "歌詞タイムスタンプ(サンプル {n})",
+    "lrc_placeholder": "'LRC'をクリックしてタイムスタンプを生成",
+    "details_accordion": "📊 スコア & LRC & LM コード",
+    "generation_status": "生成ステータス",
+    "current_batch": "現在のバッチ",
+    "batch_indicator": "バッチ {current} / {total}",
+    "next_batch_status": "次のバッチステータス",
+    "prev_btn": "◀ 前へ",
+    "next_btn": "次へ ▶",
+    "restore_params_btn": "↙️ これらの設定をUIに適用(バッチパラメータを復元)",
+    "batch_results_title": "📁 バッチ結果と生成詳細",
+    "all_files_label": "📁 すべての生成ファイル(ダウンロード)",
+    "generation_details": "生成詳細"
+  },
+  "messages": {
+    "no_audio_to_save": "❌ 保存するオーディオがありません",
+    "save_success": "✅ オーディオとメタデータを {filename} に保存しました",
+    "save_failed": "❌ 保存に失敗しました: {error}",
+    "no_file_selected": "⚠️ ファイルが選択されていません",
+    "params_loaded": "✅ {filename} からパラメータを読み込みました",
+    "invalid_json": "❌ 無効なJSONファイル: {error}",
+    "load_error": "❌ ファイルの読み込みエラー: {error}",
+    "example_loaded": "📁 {filename} からサンプルを読み込みました",
+    "example_failed": "JSONファイル {filename} の解析に失敗しました: {error}",
+    "example_error": "サンプル読み込みエラー: {error}",
+    "lm_generated": "🤖 LMを使用してサンプルを生成しました",
+    "lm_fallback": "LMを使用したサンプル生成に失敗、サンプルディレクトリにフォールバック",
+    "lm_not_initialized": "❌ 5Hz LMが初期化されていません。最初に初期化してください。",
+    "autogen_enabled": "🔄 自動生成が有効 - このあと次のバッチを生成します",
+    "batch_ready": "✅ バッチ {n} の準備完了！'次へ'をクリックして表示。",
+    "batch_generating": "🔄 バッチ {n} のバックグラウンド生成を開始...",
+    "batch_failed": "❌ バックグラウンド生成に失敗しました: {error}",
+    "viewing_batch": "✅ バッチ {n} を表示中",
+    "at_first_batch": "すでに最初のバッチです",
+    "at_last_batch": "次のバッチはありません",
+    "batch_not_found": "キューにバッチ {n} が見つかりません",
+    "no_batch_data": "復元するバッチデータがありません。",
+    "params_restored": "✅ バッチ {n} からUIパラメータを復元しました",
+    "scoring_failed": "❌ エラー: バッチデータが見つかりません",
+    "no_codes": "❌ 利用可能なオーディオコードがありません。最初に音楽を生成してください。",
+    "score_failed": "❌ スコアリングに失敗しました: {error}",
+    "score_error": "❌ スコア計算エラー: {error}",
+    "lrc_no_batch_data": "❌ バッチデータが見つかりません。最初に音楽を生成してください。",
+    "lrc_no_extra_outputs": "❌ 追加出力が見つかりません。条件テンソルが利用できません。",
+    "lrc_missing_tensors": "❌ LRC生成に必要なテンソルがありません。",
+    "lrc_sample_not_exist": "❌ 現在のバッチにサンプルが存在しません。",
+    "lrc_empty_result": "⚠️ LRC生成の結果が空です。",
+    "empty_query": "⚠️ 音楽の説明を入力してください。",
+    "sample_creation_failed": "❌ サンプルの作成に失敗しました。もう一度お試しください。",
+    "sample_created": "✅ サンプルが作成されました！キャプションと歌詞を確認して、音楽を生成をクリックしてください。",
+    "simple_examples_not_found": "⚠️ シンプルモードサンプルディレクトリが見つかりません。",
+    "simple_examples_empty": "⚠️ シンプルモードサンプルにファイルがありません。",
+    "simple_example_loaded": "🎲 {filename} からランダムサンプルを読み込みました",
+    "format_success": "✅ キャプションと歌詞のフォーマットに成功しました",
+    "format_failed": "❌ フォーマットに失敗しました: {error}",
+    "skipping_metas_cot": "⚡ Phase 1 メタデータ COT をスキップ（サンプルは既にフォーマット済み）",
+    "invalid_timesteps_format": "⚠️ タイムステップ形式が無効です。デフォルトスケジュールを使用します。",
+    "timesteps_out_of_range": "⚠️ タイムステップは [0, 1] の範囲内である必要があります。デフォルトスケジュールを使用します。",
+    "timesteps_count_mismatch": "⚠️ タイムステップ数 ({actual}) が推論ステップ数 ({expected}) と異なります。タイムステップ数を使用します。"
+  }
+}

code/acestep/gradio_ui/i18n/zh.json ADDED Viewed

	@@ -0,0 +1,243 @@

+{
+  "app": {
+    "title": "🎛️ ACE-Step V1.5 演练场💡",
+    "subtitle": "推动开源音乐生成的边界"
+  },
+  "dataset": {
+    "title": "📊 数据集浏览器",
+    "dataset_label": "数据集",
+    "dataset_info": "选择要浏览的数据集",
+    "import_btn": "📥 导入数据集",
+    "search_type_label": "搜索类型",
+    "search_type_info": "如何查找项目",
+    "search_value_label": "搜索值",
+    "search_value_placeholder": "输入键或索引(留空表示随机)",
+    "search_value_info": "键: 精确匹配, 索引: 0到数据集大小-1",
+    "instruction_label": "📝 指令",
+    "instruction_placeholder": "无可用指令",
+    "metadata_title": "📋 项目元数据 (JSON)",
+    "metadata_label": "完整项目信息",
+    "source_audio": "源音频",
+    "target_audio": "目标音频",
+    "reference_audio": "参考音频",
+    "get_item_btn": "🔍 获取项目",
+    "use_src_checkbox": "使用数据集中的源音频",
+    "use_src_info": "勾选以使用数据集中的源音频",
+    "data_status_label": "📊 数据状态",
+    "data_status_default": "❌ 未导入数据集",
+    "autofill_btn": "📋 自动填充生成表单"
+  },
+  "service": {
+    "title": "🔧 服务配置",
+    "checkpoint_label": "检查点文件",
+    "checkpoint_info": "选择训练好的模型检查点文件(完整路径或文件名)",
+    "refresh_btn": "🔄 刷新",
+    "model_path_label": "主模型路径",
+    "model_path_info": "选择模型配置目录(从检查点自动扫描)",
+    "device_label": "设备",
+    "device_info": "处理设备(建议自动检测)",
+    "lm_model_path_label": "5Hz LM 模型路径",
+    "lm_model_path_info": "选择5Hz LM模型检查点(从检查点自动扫描)",
+    "backend_label": "5Hz LM 后端",
+    "backend_info": "选择5Hz LM的后端: vllm(更快)或pt(PyTorch, 更兼容)",
+    "init_llm_label": "初始化 5Hz LM",
+    "init_llm_info": "勾选以在服务初始化期间初始化5Hz LM",
+    "flash_attention_label": "使用Flash Attention",
+    "flash_attention_info_enabled": "启用flash attention以加快推理速度(需要flash_attn包)",
+    "flash_attention_info_disabled": "Flash attention不可用(未安装flash_attn包)",
+    "offload_cpu_label": "卸载到CPU",
+    "offload_cpu_info": "不使用时将模型卸载到CPU以节省GPU内存",
+    "offload_dit_cpu_label": "将DiT卸载到CPU",
+    "offload_dit_cpu_info": "将DiT卸载到CPU(需要启用卸载到CPU)",
+    "init_btn": "初始化服务",
+    "status_label": "状态",
+    "language_label": "界面语言",
+    "language_info": "选择界面语言"
+  },
+  "generation": {
+    "required_inputs": "📝 必需输入",
+    "task_type_label": "任务类型",
+    "task_type_info": "选择生成的任务类型",
+    "instruction_label": "指令",
+    "instruction_info": "指令根据任务类型自动生成",
+    "load_btn": "加载",
+    "track_name_label": "音轨名称",
+    "track_name_info": "为lego/extract任务选择音轨名称",
+    "track_classes_label": "音轨名称",
+    "track_classes_info": "为complete任务选择多个音轨类别",
+    "audio_uploads": "🎵 音频上传",
+    "reference_audio": "参考音频(可选)",
+    "source_audio": "源音频(可选)",
+    "convert_codes_btn": "转换为代码",
+    "lm_codes_hints": "🎼 LM 代码提示",
+    "lm_codes_label": "LM 代码提示",
+    "lm_codes_placeholder": "<|audio_code_10695|><|audio_code_54246|>...",
+    "lm_codes_info": "粘贴用于text2music生成的LM代码提示",
+    "lm_codes_sample": "LM 代码提示(样本 {n})",
+    "lm_codes_sample_info": "样本{n}的代码",
+    "transcribe_btn": "转录",
+    "repainting_controls": "🎨 重绘控制(秒)",
+    "repainting_start": "重绘开始",
+    "repainting_end": "重绘结束",
+    "mode_label": "生成模式",
+    "mode_info": "简单模式：用自然语言描述音乐。自定义模式：完全控制描述和歌词。",
+    "mode_simple": "简单",
+    "mode_custom": "自定义",
+    "simple_query_label": "歌曲描述",
+    "simple_query_placeholder": "描述你想创作的音乐，例如：'给我生成一首暗黑的戏剧古风，歌词要华丽'。留空则随机生成样本。",
+    "simple_query_info": "输入你想生成的音乐的自然语言描述",
+    "simple_vocal_language_label": "人声语言(可选)",
+    "simple_vocal_language_info": "选择歌词的首选语言。使用 'unknown' 表示任意语言。",
+    "create_sample_btn": "创建样本",
+    "caption_title": "📝 音乐描述",
+    "caption_label": "音乐描述(可选)",
+    "caption_placeholder": "一段平和的原声吉他旋律,配有柔和的人声...",
+    "caption_info": "描述风格、流派、乐器和情绪",
+    "lyrics_title": "📝 歌词",
+    "lyrics_label": "歌词(可选)",
+    "lyrics_placeholder": "[第一段]\\n在星空下\\n我感到如此活跃...",
+    "lyrics_info": "带有结构的歌曲歌词",
+    "instrumental_label": "纯音乐",
+    "format_btn": "格式化",
+    "optional_params": "⚙️ 可选参数",
+    "vocal_language_label": "人声语言(可选)",
+    "vocal_language_info": "纯音乐使用 `unknown`",
+    "bpm_label": "BPM(可选)",
+    "bpm_info": "留空表示N/A",
+    "keyscale_label": "调性(可选)",
+    "keyscale_placeholder": "留空表示N/A",
+    "keyscale_info": "A-G, #/♭, 大调/小调",
+    "timesig_label": "拍号(可选)",
+    "timesig_info": "2/4, 3/4, 4/4...",
+    "duration_label": "音频时长(秒)",
+    "duration_info": "使用-1表示随机",
+    "batch_size_label": "批量大小",
+    "batch_size_info": "要生成的音频数量(最多8个)",
+    "advanced_settings": "🔧 高级设置",
+    "inference_steps_label": "DiT 推理步数",
+    "inference_steps_info": "Turbo: 最多8, Base: 最多200",
+    "guidance_scale_label": "DiT 引导比例(仅支持base模型)",
+    "guidance_scale_info": "更高的值更紧密地遵循文本",
+    "seed_label": "种子",
+    "seed_info": "批量使用逗号分隔的值",
+    "random_seed_label": "随机种子",
+    "random_seed_info": "启用以自动生成种子",
+    "audio_format_label": "音频格式",
+    "audio_format_info": "保存文件的音频格式",
+    "use_adg_label": "使用 ADG",
+    "use_adg_info": "启用角域引导",
+    "shift_label": "Shift",
+    "shift_info": "时间步偏移因子，仅对 base 模型生效 (范围 1.0~5.0，默认 3.0)。对 turbo 模型无效。",
+    "infer_method_label": "推理方法",
+    "infer_method_info": "扩散推理方法。ODE (欧拉) 更快，SDE (随机) 可能产生不同结果。",
+    "custom_timesteps_label": "自定义时间步",
+    "custom_timesteps_info": "可选：从 1.0 到 0.0 的逗号分隔值（例如 '0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0'）。会覆盖推理步数和 shift 设置。",
+    "cfg_interval_start": "CFG 间隔开始",
+    "cfg_interval_end": "CFG 间隔结束",
+    "lm_params_title": "🤖 LM 生成参数",
+    "lm_temperature_label": "LM 温度",
+    "lm_temperature_info": "5Hz LM温度(越高越随机)",
+    "lm_cfg_scale_label": "LM CFG 比例",
+    "lm_cfg_scale_info": "5Hz LM CFG (1.0 = 无CFG)",
+    "lm_top_k_label": "LM Top-K",
+    "lm_top_k_info": "Top-K (0 = 禁用)",
+    "lm_top_p_label": "LM Top-P",
+    "lm_top_p_info": "Top-P (1.0 = 禁用)",
+    "lm_negative_prompt_label": "LM 负面提示",
+    "lm_negative_prompt_placeholder": "输入CFG的负面提示(默认: NO USER INPUT)",
+    "lm_negative_prompt_info": "负面提示(当LM CFG比例 > 1.0时使用)",
+    "cot_metas_label": "CoT 元数据",
+    "cot_metas_info": "使用LM生成CoT元数据(取消勾选以跳过LM CoT生成)",
+    "cot_language_label": "CoT 语言",
+    "cot_language_info": "在CoT中生成语言(思维链)",
+    "constrained_debug_label": "约束解码调试",
+    "constrained_debug_info": "启用约束解码的调试日志(勾选以查看详细日志)",
+    "auto_score_label": "自动评分",
+    "auto_score_info": "自动计算所有生成音频的质量分数",
+    "auto_lrc_label": "自动 LRC",
+    "auto_lrc_info": "自动为所有生成的音频生成LRC歌词时间戳",
+    "lm_batch_chunk_label": "LM 批量块大小",
+    "lm_batch_chunk_info": "每个LM批量块的最大项目数(默认: 8, 受GPU内存限制)",
+    "codes_strength_label": "LM 代码强度",
+    "codes_strength_info": "控制使用LM生成代码的去噪步骤数量",
+    "cover_strength_label": "音频覆盖强度",
+    "cover_strength_info": "控制使用覆盖模式的去噪步骤数量",
+    "score_sensitivity_label": "质量评分敏感度",
+    "score_sensitivity_info": "更低 = 更敏感(默认: 1.0). 调整PMI如何映射到[0,1]",
+    "think_label": "思考",
+    "parallel_thinking_label": "并行思考",
+    "generate_btn": "🎵 生成音乐",
+    "autogen_label": "自动生成",
+    "caption_rewrite_label": "描述重写"
+  },
+  "results": {
+    "title": "🎵 结果",
+    "generated_music": "🎵 生成的音乐(样本 {n})",
+    "send_to_src_btn": "🔗 发送到源音频",
+    "save_btn": "💾 保存",
+    "score_btn": "📊 评分",
+    "lrc_btn": "🎵 LRC",
+    "quality_score_label": "质量分数(样本 {n})",
+    "quality_score_placeholder": "点击'评分'以计算基于困惑度的质量分数",
+    "codes_label": "LM 代码(样本 {n})",
+    "lrc_label": "歌词时间戳(样本 {n})",
+    "lrc_placeholder": "点击'LRC'生成时间戳",
+    "details_accordion": "📊 评分与LRC与LM代码",
+    "generation_status": "生成状态",
+    "current_batch": "当前批次",
+    "batch_indicator": "批次 {current} / {total}",
+    "next_batch_status": "下一批次状态",
+    "prev_btn": "◀ 上一个",
+    "next_btn": "下一个 ▶",
+    "restore_params_btn": "↙️ 将这些设置应用到UI(恢复批次参数)",
+    "batch_results_title": "📁 批量结果和生成详情",
+    "all_files_label": "📁 所有生成的文件(下载)",
+    "generation_details": "生成详情"
+  },
+  "messages": {
+    "no_audio_to_save": "❌ 没有要保存的音频",
+    "save_success": "✅ 已将音频和元数据保存到 {filename}",
+    "save_failed": "❌ 保存失败: {error}",
+    "no_file_selected": "⚠️ 未选择文件",
+    "params_loaded": "✅ 已从 {filename} 加载参数",
+    "invalid_json": "❌ 无效的JSON文件: {error}",
+    "load_error": "❌ 加载文件时出错: {error}",
+    "example_loaded": "📁 已从 {filename} 加载示例",
+    "example_failed": "解析JSON文件 {filename} 失败: {error}",
+    "example_error": "加载示例时出错: {error}",
+    "lm_generated": "🤖 使用LM生成的示例",
+    "lm_fallback": "使用LM生成示例失败,回退到示例目录",
+    "lm_not_initialized": "❌ 5Hz LM未初始化。请先初始化它。",
+    "autogen_enabled": "🔄 已启用自动生成 - 下一批次将在此之后生成",
+    "batch_ready": "✅ 批次 {n} 就绪!点击'下一个'查看。",
+    "batch_generating": "🔄 开始为批次 {n} 进行后台生成...",
+    "batch_failed": "❌ 后台生成失败: {error}",
+    "viewing_batch": "✅ 查看批次 {n}",
+    "at_first_batch": "已在第一批次",
+    "at_last_batch": "没有下一批次可用",
+    "batch_not_found": "在队列中未找到批次 {n}",
+    "no_batch_data": "没有要恢复的批次数据。",
+    "params_restored": "✅ 已从批次 {n} 恢复UI参数",
+    "scoring_failed": "❌ 错误: 未找到批次数据",
+    "no_codes": "❌ 没有可用的音频代码。请先生成音乐。",
+    "score_failed": "❌ 评分失败: {error}",
+    "score_error": "❌ 计算分数时出错: {error}",
+    "lrc_no_batch_data": "❌ 未找到批次数据。请先生成音乐。",
+    "lrc_no_extra_outputs": "❌ 未找到额外输出。条件张量不可用。",
+    "lrc_missing_tensors": "❌ 缺少LRC生成所需的张量。",
+    "lrc_sample_not_exist": "❌ 当前批次中不存在该样本。",
+    "lrc_empty_result": "⚠️ LRC生成结果为空。",
+    "empty_query": "⚠️ 请输入音乐描述。",
+    "sample_creation_failed": "❌ 创建样本失败。请重试。",
+    "sample_created": "✅ 样本已创建！检查描述和歌词，然后点击生成音乐。",
+    "simple_examples_not_found": "⚠️ 未找到简单模式示例目录。",
+    "simple_examples_empty": "⚠️ 简单模式示例中没有示例文件。",
+    "simple_example_loaded": "🎲 已从 {filename} 加载随机示例",
+    "format_success": "✅ 描述和歌词格式化成功",
+    "format_failed": "❌ 格式化失败: {error}",
+    "skipping_metas_cot": "⚡ 跳过 Phase 1 元数据 COT（样本已格式化）",
+    "invalid_timesteps_format": "⚠️ 时间步格式无效，使用默认调度。",
+    "timesteps_out_of_range": "⚠️ 时间步必须在 [0, 1] 范围内，使用默认调度。",
+    "timesteps_count_mismatch": "⚠️ 时间步数量 ({actual}) 与推理步数 ({expected}) 不匹配，将使用时间步数量。"
+  }
+}

code/acestep/gradio_ui/interfaces/__init__.py ADDED Viewed

	@@ -0,0 +1,90 @@

+"""
+Gradio UI Components Module
+Contains all Gradio interface component definitions and layouts
+"""
+import gradio as gr
+from acestep.gradio_ui.i18n import get_i18n, t
+from acestep.gradio_ui.interfaces.dataset import create_dataset_section
+from acestep.gradio_ui.interfaces.generation import create_generation_section
+from acestep.gradio_ui.interfaces.result import create_results_section
+from acestep.gradio_ui.interfaces.training import create_training_section
+from acestep.gradio_ui.events import setup_event_handlers, setup_training_event_handlers
+def create_gradio_interface(dit_handler, llm_handler, dataset_handler, init_params=None, language='en') -> gr.Blocks:
+    """
+    Create Gradio interface
+    Args:
+        dit_handler: DiT handler instance
+        llm_handler: LM handler instance
+        dataset_handler: Dataset handler instance
+        init_params: Dictionary containing initialization parameters and state.
+                    If None, service will not be pre-initialized.
+        language: UI language code ('en', 'zh', 'ja', default: 'en')
+    Returns:
+        Gradio Blocks instance
+    """
+    # Initialize i18n with selected language
+    i18n = get_i18n(language)
+    with gr.Blocks(
+        title=t("app.title"),
+        theme=gr.themes.Soft(),
+        css="""
+        .main-header {
+            text-align: center;
+            margin-bottom: 2rem;
+        }
+        .section-header {
+            background: linear-gradient(90deg, #4CAF50, #45a049);
+            color: white;
+            padding: 10px;
+            border-radius: 5px;
+            margin: 10px 0;
+        }
+        .lm-hints-row {
+            align-items: stretch;
+        }
+        .lm-hints-col {
+            display: flex;
+        }
+        .lm-hints-col > div {
+            flex: 1;
+            display: flex;
+        }
+        .lm-hints-btn button {
+            height: 100%;
+            width: 100%;
+        }
+        """
+    ) as demo:
+        gr.HTML(f"""
+        <div class="main-header">
+            <h1>{t("app.title")}</h1>
+            <p>{t("app.subtitle")}</p>
+        </div>
+        """)
+        # Dataset Explorer Section
+        dataset_section = create_dataset_section(dataset_handler)
+        # Generation Section (pass init_params and language to support pre-initialization)
+        generation_section = create_generation_section(dit_handler, llm_handler, init_params=init_params, language=language)
+        # Results Section
+        results_section = create_results_section(dit_handler)
+        # Training Section (LoRA training and dataset builder)
+        # Pass init_params to support hiding in service mode
+        training_section = create_training_section(dit_handler, llm_handler, init_params=init_params)
+        # Connect event handlers
+        setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, dataset_section, generation_section, results_section)
+        # Connect training event handlers
+        setup_training_event_handlers(demo, dit_handler, llm_handler, training_section)
+    return demo

code/acestep/gradio_ui/interfaces/dataset.py ADDED Viewed

	@@ -0,0 +1,101 @@

+"""
+Gradio UI Dataset Section Module
+Contains dataset explorer section component definitions
+"""
+import gradio as gr
+def create_dataset_section(dataset_handler) -> dict:
+    """Create dataset explorer section"""
+    with gr.Accordion("📊 Dataset Explorer", open=False, visible=False):
+        with gr.Row(equal_height=True):
+            dataset_type = gr.Dropdown(
+                choices=["train", "test"],
+                value="train",
+                label="Dataset",
+                info="Choose dataset to explore",
+                scale=2
+            )
+            import_dataset_btn = gr.Button("📥 Import Dataset", variant="primary", scale=1)
+            search_type = gr.Dropdown(
+                choices=["keys", "idx", "random"],
+                value="random",
+                label="Search Type",
+                info="How to find items",
+                scale=1
+            )
+            search_value = gr.Textbox(
+                label="Search Value",
+                placeholder="Enter keys or index (leave empty for random)",
+                info="Keys: exact match, Index: 0 to dataset size-1",
+                scale=2
+            )
+        instruction_display = gr.Textbox(
+            label="📝 Instruction",
+            interactive=False,
+            placeholder="No instruction available",
+            lines=1
+        )
+        repaint_viz_plot = gr.Plot()
+        with gr.Accordion("📋 Item Metadata (JSON)", open=False):
+            item_info_json = gr.Code(
+                label="Complete Item Information",
+                language="json",
+                interactive=False,
+                lines=15
+            )
+        with gr.Row(equal_height=True):
+            item_src_audio = gr.Audio(
+                label="Source Audio",
+                type="filepath",
+                interactive=False,
+                scale=8
+            )
+            get_item_btn = gr.Button("🔍 Get Item", variant="secondary", interactive=False, scale=2)
+        with gr.Row(equal_height=True):
+            item_target_audio = gr.Audio(
+                label="Target Audio",
+                type="filepath",
+                interactive=False,
+                scale=8
+            )
+            item_refer_audio = gr.Audio(
+                label="Reference Audio",
+                type="filepath",
+                interactive=False,
+                scale=2
+            )
+        with gr.Row():
+            use_src_checkbox = gr.Checkbox(
+                label="Use Source Audio from Dataset",
+                value=True,
+                info="Check to use the source audio from dataset"
+            )
+        data_status = gr.Textbox(label="📊 Data Status", interactive=False, value="❌ No dataset imported")
+        auto_fill_btn = gr.Button("📋 Auto-fill Generation Form", variant="primary")
+    return {
+        "dataset_type": dataset_type,
+        "import_dataset_btn": import_dataset_btn,
+        "search_type": search_type,
+        "search_value": search_value,
+        "instruction_display": instruction_display,
+        "repaint_viz_plot": repaint_viz_plot,
+        "item_info_json": item_info_json,
+        "item_src_audio": item_src_audio,
+        "get_item_btn": get_item_btn,
+        "item_target_audio": item_target_audio,
+        "item_refer_audio": item_refer_audio,
+        "use_src_checkbox": use_src_checkbox,
+        "data_status": data_status,
+        "auto_fill_btn": auto_fill_btn,
+    }

code/acestep/gradio_ui/interfaces/generation.py ADDED Viewed

	@@ -0,0 +1,766 @@

+"""
+Gradio UI Generation Section Module
+Contains generation section component definitions
+"""
+import gradio as gr
+from acestep.constants import (
+    VALID_LANGUAGES,
+    TRACK_NAMES,
+    TASK_TYPES_TURBO,
+    TASK_TYPES_BASE,
+    DEFAULT_DIT_INSTRUCTION,
+)
+from acestep.gradio_ui.i18n import t
+def create_generation_section(dit_handler, llm_handler, init_params=None, language='en') -> dict:
+    """Create generation section
+    Args:
+        dit_handler: DiT handler instance
+        llm_handler: LM handler instance
+        init_params: Dictionary containing initialization parameters and state.
+                    If None, service will not be pre-initialized.
+        language: UI language code ('en', 'zh', 'ja')
+    """
+    # Check if service is pre-initialized
+    service_pre_initialized = init_params is not None and init_params.get('pre_initialized', False)
+    # Check if running in service mode (restricted UI)
+    service_mode = init_params is not None and init_params.get('service_mode', False)
+    # Get current language from init_params if available
+    current_language = init_params.get('language', language) if init_params else language
+    with gr.Group():
+        # Service Configuration - collapse if pre-initialized, hide if in service mode
+        accordion_open = not service_pre_initialized
+        accordion_visible = not service_pre_initialized  # Hide when running in service mode
+        with gr.Accordion(t("service.title"), open=accordion_open, visible=accordion_visible) as service_config_accordion:
+            # Language selector at the top
+            with gr.Row():
+                language_dropdown = gr.Dropdown(
+                    choices=[
+                        ("English", "en"),
+                        ("中文", "zh"),
+                        ("日本語", "ja"),
+                    ],
+                    value=current_language,
+                    label=t("service.language_label"),
+                    info=t("service.language_info"),
+                    scale=1,
+                )
+            # Dropdown options section - all dropdowns grouped together
+            with gr.Row(equal_height=True):
+                with gr.Column(scale=4):
+                    # Set checkpoint value from init_params if pre-initialized
+                    checkpoint_value = init_params.get('checkpoint') if service_pre_initialized else None
+                    checkpoint_dropdown = gr.Dropdown(
+                        label=t("service.checkpoint_label"),
+                        choices=dit_handler.get_available_checkpoints(),
+                        value=checkpoint_value,
+                        info=t("service.checkpoint_info")
+                    )
+                with gr.Column(scale=1, min_width=90):
+                    refresh_btn = gr.Button(t("service.refresh_btn"), size="sm")
+            with gr.Row():
+                # Get available acestep-v15- model list
+                available_models = dit_handler.get_available_acestep_v15_models()
+                default_model = "acestep-v15-turbo" if "acestep-v15-turbo" in available_models else (available_models[0] if available_models else None)
+                # Set config_path value from init_params if pre-initialized
+                config_path_value = init_params.get('config_path', default_model) if service_pre_initialized else default_model
+                config_path = gr.Dropdown(
+                    label=t("service.model_path_label"),
+                    choices=available_models,
+                    value=config_path_value,
+                    info=t("service.model_path_info")
+                )
+                # Set device value from init_params if pre-initialized
+                device_value = init_params.get('device', 'auto') if service_pre_initialized else 'auto'
+                device = gr.Dropdown(
+                    choices=["auto", "cuda", "cpu"],
+                    value=device_value,
+                    label=t("service.device_label"),
+                    info=t("service.device_info")
+                )
+            with gr.Row():
+                # Get available 5Hz LM model list
+                available_lm_models = llm_handler.get_available_5hz_lm_models()
+                default_lm_model = "acestep-5Hz-lm-0.6B" if "acestep-5Hz-lm-0.6B" in available_lm_models else (available_lm_models[0] if available_lm_models else None)
+                # Set lm_model_path value from init_params if pre-initialized
+                lm_model_path_value = init_params.get('lm_model_path', default_lm_model) if service_pre_initialized else default_lm_model
+                lm_model_path = gr.Dropdown(
+                    label=t("service.lm_model_path_label"),
+                    choices=available_lm_models,
+                    value=lm_model_path_value,
+                    info=t("service.lm_model_path_info")
+                )
+                # Set backend value from init_params if pre-initialized
+                backend_value = init_params.get('backend', 'vllm') if service_pre_initialized else 'vllm'
+                backend_dropdown = gr.Dropdown(
+                    choices=["vllm", "pt"],
+                    value=backend_value,
+                    label=t("service.backend_label"),
+                    info=t("service.backend_info")
+                )
+            # Checkbox options section - all checkboxes grouped together
+            with gr.Row():
+                # Set init_llm value from init_params if pre-initialized
+                init_llm_value = init_params.get('init_llm', True) if service_pre_initialized else True
+                init_llm_checkbox = gr.Checkbox(
+                    label=t("service.init_llm_label"),
+                    value=init_llm_value,
+                    info=t("service.init_llm_info"),
+                )
+                # Auto-detect flash attention availability
+                flash_attn_available = dit_handler.is_flash_attention_available()
+                # Set use_flash_attention value from init_params if pre-initialized
+                use_flash_attention_value = init_params.get('use_flash_attention', flash_attn_available) if service_pre_initialized else flash_attn_available
+                use_flash_attention_checkbox = gr.Checkbox(
+                    label=t("service.flash_attention_label"),
+                    value=use_flash_attention_value,
+                    interactive=flash_attn_available,
+                    info=t("service.flash_attention_info_enabled") if flash_attn_available else t("service.flash_attention_info_disabled")
+                )
+                # Set offload_to_cpu value from init_params if pre-initialized
+                offload_to_cpu_value = init_params.get('offload_to_cpu', False) if service_pre_initialized else False
+                offload_to_cpu_checkbox = gr.Checkbox(
+                    label=t("service.offload_cpu_label"),
+                    value=offload_to_cpu_value,
+                    info=t("service.offload_cpu_info")
+                )
+                # Set offload_dit_to_cpu value from init_params if pre-initialized
+                offload_dit_to_cpu_value = init_params.get('offload_dit_to_cpu', False) if service_pre_initialized else False
+                offload_dit_to_cpu_checkbox = gr.Checkbox(
+                    label=t("service.offload_dit_cpu_label"),
+                    value=offload_dit_to_cpu_value,
+                    info=t("service.offload_dit_cpu_info")
+                )
+            init_btn = gr.Button(t("service.init_btn"), variant="primary", size="lg")
+            # Set init_status value from init_params if pre-initialized
+            init_status_value = init_params.get('init_status', '') if service_pre_initialized else ''
+            init_status = gr.Textbox(label=t("service.status_label"), interactive=False, lines=3, value=init_status_value)
+            # LoRA Configuration Section
+            gr.HTML("<hr><h4>🔧 LoRA Adapter</h4>")
+            with gr.Row():
+                lora_path = gr.Textbox(
+                    label="LoRA Path",
+                    placeholder="./lora_output/final/adapter",
+                    info="Path to trained LoRA adapter directory",
+                    scale=3,
+                )
+                load_lora_btn = gr.Button("📥 Load LoRA", variant="secondary", scale=1)
+                unload_lora_btn = gr.Button("🗑️ Unload", variant="secondary", scale=1)
+            with gr.Row():
+                use_lora_checkbox = gr.Checkbox(
+                    label="Use LoRA",
+                    value=False,
+                    info="Enable LoRA adapter for inference",
+                    scale=1,
+                )
+                lora_status = gr.Textbox(
+                    label="LoRA Status",
+                    value="No LoRA loaded",
+                    interactive=False,
+                    scale=2,
+                )
+        # Inputs
+        with gr.Row():
+            with gr.Column(scale=2):
+                with gr.Accordion(t("generation.required_inputs"), open=True):
+                    # Task type
+                    # Determine initial task_type choices based on actual model in use
+                    # When service is pre-initialized, use config_path from init_params
+                    actual_model = init_params.get('config_path', default_model) if service_pre_initialized else default_model
+                    actual_model_lower = (actual_model or "").lower()
+                    if "turbo" in actual_model_lower:
+                        initial_task_choices = TASK_TYPES_TURBO
+                    else:
+                        initial_task_choices = TASK_TYPES_BASE
+                    with gr.Row(equal_height=True):
+                        with gr.Column(scale=2):
+                            task_type = gr.Dropdown(
+                                choices=initial_task_choices,
+                                value="text2music",
+                                label=t("generation.task_type_label"),
+                                info=t("generation.task_type_info"),
+                            )
+                        with gr.Column(scale=7):
+                            instruction_display_gen = gr.Textbox(
+                                label=t("generation.instruction_label"),
+                                value=DEFAULT_DIT_INSTRUCTION,
+                                interactive=False,
+                                lines=1,
+                                info=t("generation.instruction_info"),
+                            )
+                        with gr.Column(scale=1, min_width=100):
+                            load_file = gr.UploadButton(
+                                t("generation.load_btn"),
+                                file_types=[".json"],
+                                file_count="single",
+                                variant="secondary",
+                                size="sm",
+                            )
+                    track_name = gr.Dropdown(
+                        choices=TRACK_NAMES,
+                        value=None,
+                        label=t("generation.track_name_label"),
+                        info=t("generation.track_name_info"),
+                        visible=False
+                    )
+                    complete_track_classes = gr.CheckboxGroup(
+                        choices=TRACK_NAMES,
+                        label=t("generation.track_classes_label"),
+                        info=t("generation.track_classes_info"),
+                        visible=False
+                    )
+                    # Audio uploads
+                    audio_uploads_accordion = gr.Accordion(t("generation.audio_uploads"), open=False)
+                    with audio_uploads_accordion:
+                        with gr.Row(equal_height=True):
+                            with gr.Column(scale=2):
+                                reference_audio = gr.Audio(
+                                    label=t("generation.reference_audio"),
+                                    type="filepath",
+                                )
+                            with gr.Column(scale=7):
+                                src_audio = gr.Audio(
+                                    label=t("generation.source_audio"),
+                                    type="filepath",
+                                )
+                            with gr.Column(scale=1, min_width=80):
+                                convert_src_to_codes_btn = gr.Button(
+                                    t("generation.convert_codes_btn"),
+                                    variant="secondary",
+                                    size="sm"
+                                )
+                    # Audio Codes for text2music - single input for transcription or cover task
+                    with gr.Accordion(t("generation.lm_codes_hints"), open=False, visible=True) as text2music_audio_codes_group:
+                        with gr.Row(equal_height=True):
+                            text2music_audio_code_string = gr.Textbox(
+                                label=t("generation.lm_codes_label"),
+                                placeholder=t("generation.lm_codes_placeholder"),
+                                lines=6,
+                                info=t("generation.lm_codes_info"),
+                                scale=9,
+                            )
+                            transcribe_btn = gr.Button(
+                                t("generation.transcribe_btn"),
+                                variant="secondary",
+                                size="sm",
+                                scale=1,
+                            )
+                    # Repainting controls
+                    with gr.Group(visible=False) as repainting_group:
+                        gr.HTML(f"<h5>{t('generation.repainting_controls')}</h5>")
+                        with gr.Row():
+                            repainting_start = gr.Number(
+                                label=t("generation.repainting_start"),
+                                value=0.0,
+                                step=0.1,
+                            )
+                            repainting_end = gr.Number(
+                                label=t("generation.repainting_end"),
+                                value=-1,
+                                minimum=-1,
+                                step=0.1,
+                            )
+                    # Simple/Custom Mode Toggle
+                    # In service mode: only Custom mode, hide the toggle
+                    with gr.Row(visible=not service_mode):
+                        generation_mode = gr.Radio(
+                            choices=[
+                                (t("generation.mode_simple"), "simple"),
+                                (t("generation.mode_custom"), "custom"),
+                            ],
+                            value="custom" if service_mode else "simple",
+                            label=t("generation.mode_label"),
+                            info=t("generation.mode_info"),
+                        )
+                    # Simple Mode Components - hidden in service mode
+                    with gr.Group(visible=not service_mode) as simple_mode_group:
+                        with gr.Row(equal_height=True):
+                            simple_query_input = gr.Textbox(
+                                label=t("generation.simple_query_label"),
+                                placeholder=t("generation.simple_query_placeholder"),
+                                lines=2,
+                                info=t("generation.simple_query_info"),
+                                scale=12,
+                            )
+                            with gr.Column(scale=1, min_width=100):
+                                random_desc_btn = gr.Button(
+                                    "🎲",
+                                    variant="secondary",
+                                    size="sm",
+                                    scale=2
+                                )
+                        with gr.Row(equal_height=True):
+                            with gr.Column(scale=1, variant="compact"):
+                                simple_instrumental_checkbox = gr.Checkbox(
+                                    label=t("generation.instrumental_label"),
+                                    value=False,
+                                )
+                            with gr.Column(scale=18):
+                                create_sample_btn = gr.Button(
+                                    t("generation.create_sample_btn"),
+                                    variant="primary",
+                                    size="lg",
+                                )
+                            with gr.Column(scale=1, variant="compact"):
+                                simple_vocal_language = gr.Dropdown(
+                                    choices=VALID_LANGUAGES,
+                                    value="unknown",
+                                    allow_custom_value=True,
+                                    label=t("generation.simple_vocal_language_label"),
+                                    interactive=True,
+                                )
+                    # State to track if sample has been created in Simple mode
+                    simple_sample_created = gr.State(value=False)
+                # Music Caption - wrapped in accordion that can be collapsed in Simple mode
+                # In service mode: auto-expand
+                with gr.Accordion(t("generation.caption_title"), open=service_mode) as caption_accordion:
+                    with gr.Row(equal_height=True):
+                        captions = gr.Textbox(
+                            label=t("generation.caption_label"),
+                            placeholder=t("generation.caption_placeholder"),
+                            lines=3,
+                            info=t("generation.caption_info"),
+                            scale=12,
+                        )
+                        with gr.Column(scale=1, min_width=100):
+                            sample_btn = gr.Button(
+                                "🎲",
+                                variant="secondary",
+                                size="sm",
+                                scale=2,
+                            )
+                # Lyrics - wrapped in accordion that can be collapsed in Simple mode
+                # In service mode: auto-expand
+                with gr.Accordion(t("generation.lyrics_title"), open=service_mode) as lyrics_accordion:
+                    lyrics = gr.Textbox(
+                        label=t("generation.lyrics_label"),
+                        placeholder=t("generation.lyrics_placeholder"),
+                        lines=8,
+                        info=t("generation.lyrics_info")
+                    )
+                    with gr.Row(variant="compact", equal_height=True):
+                        instrumental_checkbox = gr.Checkbox(
+                            label=t("generation.instrumental_label"),
+                            value=False,
+                            scale=1,
+                            min_width=120,
+                            container=True,
+                        )
+                        # Middle: language selection (Dropdown)
+                        # The gr.HTML hack was removed; pass the label parameter directly and let Gradio handle the alignment
+                        vocal_language = gr.Dropdown(
+                            choices=VALID_LANGUAGES,
+                            value="unknown",
+                            label=t("generation.vocal_language_label"),
+                            show_label=False,
+                            container=True,
+                            allow_custom_value=True,
+                            scale=3,
+                        )
+                        # Right: format button (Button)
+                        # Placed at the far right of the same row so it is easier to reach
+                        format_btn = gr.Button(
+                            t("generation.format_btn"),
+                            variant="secondary",
+                            scale=1,
+                            min_width=80,
+                        )
+                # Optional Parameters
+                # In service mode: auto-expand
+                with gr.Accordion(t("generation.optional_params"), open=service_mode) as optional_params_accordion:
+                    with gr.Row():
+                        bpm = gr.Number(
+                            label=t("generation.bpm_label"),
+                            value=None,
+                            step=1,
+                            info=t("generation.bpm_info")
+                        )
+                        key_scale = gr.Textbox(
+                            label=t("generation.keyscale_label"),
+                            placeholder=t("generation.keyscale_placeholder"),
+                            value="",
+                            info=t("generation.keyscale_info")
+                        )
+                        time_signature = gr.Dropdown(
+                            choices=["2", "3", "4", "N/A", ""],
+                            value="",
+                            label=t("generation.timesig_label"),
+                            allow_custom_value=True,
+                            info=t("generation.timesig_info")
+                        )
+                        audio_duration = gr.Number(
+                            label=t("generation.duration_label"),
+                            value=-1,
+                            minimum=-1,
+                            maximum=600.0,
+                            step=0.1,
+                            info=t("generation.duration_info")
+                        )
+                        batch_size_input = gr.Number(
+                            label=t("generation.batch_size_label"),
+                            value=2,
+                            minimum=1,
+                            maximum=8,
+                            step=1,
+                            info=t("generation.batch_size_info"),
+                            interactive=not service_mode  # Fixed in service mode
+                        )
+        # Advanced Settings
+        # Default UI settings use turbo mode (max 20 steps, default 8, show shift with default 3)
+        # These will be updated after model initialization based on handler.is_turbo_model()
+        with gr.Accordion(t("generation.advanced_settings"), open=False):
+            with gr.Row():
+                inference_steps = gr.Slider(
+                    minimum=1,
+                    maximum=20,
+                    value=8,
+                    step=1,
+                    label=t("generation.inference_steps_label"),
+                    info=t("generation.inference_steps_info")
+                )
+                guidance_scale = gr.Slider(
+                    minimum=1.0,
+                    maximum=15.0,
+                    value=7.0,
+                    step=0.1,
+                    label=t("generation.guidance_scale_label"),
+                    info=t("generation.guidance_scale_info"),
+                    visible=False
+                )
+                with gr.Column():
+                    seed = gr.Textbox(
+                        label=t("generation.seed_label"),
+                        value="-1",
+                        info=t("generation.seed_info")
+                    )
+                    random_seed_checkbox = gr.Checkbox(
+                        label=t("generation.random_seed_label"),
+                        value=True,
+                        info=t("generation.random_seed_info")
+                    )
+                audio_format = gr.Dropdown(
+                    choices=["mp3", "flac"],
+                    value="mp3",
+                    label=t("generation.audio_format_label"),
+                    info=t("generation.audio_format_info"),
+                    interactive=not service_mode  # Fixed in service mode
+                )
+            with gr.Row():
+                use_adg = gr.Checkbox(
+                    label=t("generation.use_adg_label"),
+                    value=False,
+                    info=t("generation.use_adg_info"),
+                    visible=False
+                )
+                shift = gr.Slider(
+                    minimum=1.0,
+                    maximum=5.0,
+                    value=3.0,
+                    step=0.1,
+                    label=t("generation.shift_label"),
+                    info=t("generation.shift_info"),
+                    visible=True
+                )
+                infer_method = gr.Dropdown(
+                    choices=["ode", "sde"],
+                    value="ode",
+                    label=t("generation.infer_method_label"),
+                    info=t("generation.infer_method_info"),
+                )
+            with gr.Row():
+                custom_timesteps = gr.Textbox(
+                    label=t("generation.custom_timesteps_label"),
+                    placeholder="0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0",
+                    value="",
+                    info=t("generation.custom_timesteps_info"),
+                )
+            with gr.Row():
+                cfg_interval_start = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=0.0,
+                    step=0.01,
+                    label=t("generation.cfg_interval_start"),
+                    visible=False
+                )
+                cfg_interval_end = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=1.0,
+                    step=0.01,
+                    label=t("generation.cfg_interval_end"),
+                    visible=False
+                )
+            # LM (Language Model) Parameters
+            gr.HTML(f"<h4>{t('generation.lm_params_title')}</h4>")
+            with gr.Row():
+                lm_temperature = gr.Slider(
+                    label=t("generation.lm_temperature_label"),
+                    minimum=0.0,
+                    maximum=2.0,
+                    value=0.85,
+                    step=0.1,
+                    scale=1,
+                    info=t("generation.lm_temperature_info")
+                )
+                lm_cfg_scale = gr.Slider(
+                    label=t("generation.lm_cfg_scale_label"),
+                    minimum=1.0,
+                    maximum=3.0,
+                    value=2.0,
+                    step=0.1,
+                    scale=1,
+                    info=t("generation.lm_cfg_scale_info")
+                )
+                lm_top_k = gr.Slider(
+                    label=t("generation.lm_top_k_label"),
+                    minimum=0,
+                    maximum=100,
+                    value=0,
+                    step=1,
+                    scale=1,
+                    info=t("generation.lm_top_k_info")
+                )
+                lm_top_p = gr.Slider(
+                    label=t("generation.lm_top_p_label"),
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=0.9,
+                    step=0.01,
+                    scale=1,
+                    info=t("generation.lm_top_p_info")
+                )
+            with gr.Row():
+                lm_negative_prompt = gr.Textbox(
+                    label=t("generation.lm_negative_prompt_label"),
+                    value="NO USER INPUT",
+                    placeholder=t("generation.lm_negative_prompt_placeholder"),
+                    info=t("generation.lm_negative_prompt_info"),
+                    lines=2,
+                    scale=2,
+                )
+            with gr.Row():
+                use_cot_metas = gr.Checkbox(
+                    label=t("generation.cot_metas_label"),
+                    value=True,
+                    info=t("generation.cot_metas_info"),
+                    scale=1,
+                )
+                use_cot_language = gr.Checkbox(
+                    label=t("generation.cot_language_label"),
+                    value=True,
+                    info=t("generation.cot_language_info"),
+                    scale=1,
+                )
+                constrained_decoding_debug = gr.Checkbox(
+                    label=t("generation.constrained_debug_label"),
+                    value=False,
+                    info=t("generation.constrained_debug_info"),
+                    scale=1,
+                    interactive=not service_mode  # Fixed in service mode
+                )
+            with gr.Row():
+                auto_score = gr.Checkbox(
+                    label=t("generation.auto_score_label"),
+                    value=False,
+                    info=t("generation.auto_score_info"),
+                    scale=1,
+                    interactive=not service_mode  # Fixed in service mode
+                )
+                auto_lrc = gr.Checkbox(
+                    label=t("generation.auto_lrc_label"),
+                    value=False,
+                    info=t("generation.auto_lrc_info"),
+                    scale=1,
+                    interactive=not service_mode  # Fixed in service mode
+                )
+                lm_batch_chunk_size = gr.Number(
+                    label=t("generation.lm_batch_chunk_label"),
+                    value=8,
+                    minimum=1,
+                    maximum=32,
+                    step=1,
+                    info=t("generation.lm_batch_chunk_info"),
+                    scale=1,
+                    interactive=not service_mode  # Fixed in service mode
+                )
+            with gr.Row():
+                audio_cover_strength = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=1.0,
+                    step=0.01,
+                    label=t("generation.codes_strength_label"),
+                    info=t("generation.codes_strength_info"),
+                    scale=1,
+                )
+                score_scale = gr.Slider(
+                    minimum=0.01,
+                    maximum=1.0,
+                    value=0.5,
+                    step=0.01,
+                    label=t("generation.score_sensitivity_label"),
+                    info=t("generation.score_sensitivity_info"),
+                    scale=1,
+                    visible=not service_mode  # Hidden in service mode
+                )
+        # Set generate_btn to interactive if service is pre-initialized
+        generate_btn_interactive = init_params.get('enable_generate', False) if service_pre_initialized else False
+        with gr.Row(equal_height=True):
+            with gr.Column(scale=1, variant="compact"):
+                think_checkbox = gr.Checkbox(
+                    label=t("generation.think_label"),
+                    value=True,
+                    scale=1,
+                )
+                allow_lm_batch = gr.Checkbox(
+                    label=t("generation.parallel_thinking_label"),
+                    value=True,
+                    scale=1,
+                )
+            with gr.Column(scale=18):
+                generate_btn = gr.Button(t("generation.generate_btn"), variant="primary", size="lg", interactive=generate_btn_interactive)
+            with gr.Column(scale=1, variant="compact"):
+                autogen_checkbox = gr.Checkbox(
+                    label=t("generation.autogen_label"),
+                    value=False,  # Default to False for both service and local modes
+                    scale=1,
+                    interactive=not service_mode  # Not selectable in service mode
+                )
+                use_cot_caption = gr.Checkbox(
+                    label=t("generation.caption_rewrite_label"),
+                    value=True,
+                    scale=1,
+                )
+    return {
+        "service_config_accordion": service_config_accordion,
+        "language_dropdown": language_dropdown,
+        "checkpoint_dropdown": checkpoint_dropdown,
+        "refresh_btn": refresh_btn,
+        "config_path": config_path,
+        "device": device,
+        "init_btn": init_btn,
+        "init_status": init_status,
+        "lm_model_path": lm_model_path,
+        "init_llm_checkbox": init_llm_checkbox,
+        "backend_dropdown": backend_dropdown,
+        "use_flash_attention_checkbox": use_flash_attention_checkbox,
+        "offload_to_cpu_checkbox": offload_to_cpu_checkbox,
+        "offload_dit_to_cpu_checkbox": offload_dit_to_cpu_checkbox,
+        # LoRA components
+        "lora_path": lora_path,
+        "load_lora_btn": load_lora_btn,
+        "unload_lora_btn": unload_lora_btn,
+        "use_lora_checkbox": use_lora_checkbox,
+        "lora_status": lora_status,
+        "task_type": task_type,
+        "instruction_display_gen": instruction_display_gen,
+        "track_name": track_name,
+        "complete_track_classes": complete_track_classes,
+        "audio_uploads_accordion": audio_uploads_accordion,
+        "reference_audio": reference_audio,
+        "src_audio": src_audio,
+        "convert_src_to_codes_btn": convert_src_to_codes_btn,
+        "text2music_audio_code_string": text2music_audio_code_string,
+        "transcribe_btn": transcribe_btn,
+        "text2music_audio_codes_group": text2music_audio_codes_group,
+        "lm_temperature": lm_temperature,
+        "lm_cfg_scale": lm_cfg_scale,
+        "lm_top_k": lm_top_k,
+        "lm_top_p": lm_top_p,
+        "lm_negative_prompt": lm_negative_prompt,
+        "use_cot_metas": use_cot_metas,
+        "use_cot_caption": use_cot_caption,
+        "use_cot_language": use_cot_language,
+        "repainting_group": repainting_group,
+        "repainting_start": repainting_start,
+        "repainting_end": repainting_end,
+        "audio_cover_strength": audio_cover_strength,
+        # Simple/Custom Mode Components
+        "generation_mode": generation_mode,
+        "simple_mode_group": simple_mode_group,
+        "simple_query_input": simple_query_input,
+        "random_desc_btn": random_desc_btn,
+        "simple_instrumental_checkbox": simple_instrumental_checkbox,
+        "simple_vocal_language": simple_vocal_language,
+        "create_sample_btn": create_sample_btn,
+        "simple_sample_created": simple_sample_created,
+        "caption_accordion": caption_accordion,
+        "lyrics_accordion": lyrics_accordion,
+        "optional_params_accordion": optional_params_accordion,
+        # Existing components
+        "captions": captions,
+        "sample_btn": sample_btn,
+        "load_file": load_file,
+        "lyrics": lyrics,
+        "vocal_language": vocal_language,
+        "bpm": bpm,
+        "key_scale": key_scale,
+        "time_signature": time_signature,
+        "audio_duration": audio_duration,
+        "batch_size_input": batch_size_input,
+        "inference_steps": inference_steps,
+        "guidance_scale": guidance_scale,
+        "seed": seed,
+        "random_seed_checkbox": random_seed_checkbox,
+        "use_adg": use_adg,
+        "cfg_interval_start": cfg_interval_start,
+        "cfg_interval_end": cfg_interval_end,
+        "shift": shift,
+        "infer_method": infer_method,
+        "custom_timesteps": custom_timesteps,
+        "audio_format": audio_format,
+        "think_checkbox": think_checkbox,
+        "autogen_checkbox": autogen_checkbox,
+        "generate_btn": generate_btn,
+        "instrumental_checkbox": instrumental_checkbox,
+        "format_btn": format_btn,
+        "constrained_decoding_debug": constrained_decoding_debug,
+        "score_scale": score_scale,
+        "allow_lm_batch": allow_lm_batch,
+        "auto_score": auto_score,
+        "auto_lrc": auto_lrc,
+        "lm_batch_chunk_size": lm_batch_chunk_size,
+    }

code/acestep/gradio_ui/interfaces/result.py ADDED Viewed

	@@ -0,0 +1,552 @@

+"""
+Gradio UI Results Section Module
+Contains results display section component definitions
+"""
+import gradio as gr
+from acestep.gradio_ui.i18n import t
+def create_results_section(dit_handler) -> dict:
+    """Create results display section"""
+    with gr.Accordion(t("results.title"), open=True):
+        # Hidden state to store LM-generated metadata
+        lm_metadata_state = gr.State(value=None)
+        # Hidden state to track if caption/metadata is from formatted source (LM/transcription)
+        is_format_caption_state = gr.State(value=False)
+        # Batch management states
+        current_batch_index = gr.State(value=0)  # Currently displayed batch index
+        total_batches = gr.State(value=1)  # Total number of batches generated
+        batch_queue = gr.State(value={})  # Dictionary storing all batch data
+        generation_params_state = gr.State(value={})  # Store generation parameters for next batches
+        is_generating_background = gr.State(value=False)  # Background generation flag
+        # All audio components in one row with dynamic visibility
+        with gr.Row():
+            with gr.Column(visible=True) as audio_col_1:
+                generated_audio_1 = gr.Audio(
+                    label=t("results.generated_music", n=1),
+                    type="filepath",
+                    interactive=False,
+                    buttons=[]
+                )
+                with gr.Row(equal_height=True):
+                    send_to_src_btn_1 = gr.Button(
+                        t("results.send_to_src_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1
+                    )
+                    save_btn_1 = gr.Button(
+                        t("results.save_btn"),
+                        variant="primary",
+                        size="sm",
+                        scale=1
+                    )
+                    score_btn_1 = gr.Button(
+                        t("results.score_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1
+                    )
+                    lrc_btn_1 = gr.Button(
+                        t("results.lrc_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1
+                    )
+                with gr.Accordion(t("results.details_accordion"), open=False, visible=True) as details_accordion_1:
+                    codes_display_1 = gr.Textbox(
+                        label=t("results.codes_label", n=1),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=4,
+                        max_lines=4,
+                        visible=True
+                    )
+                    score_display_1 = gr.Textbox(
+                        label=t("results.quality_score_label", n=1),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=6,
+                        max_lines=6,
+                        visible=True
+                    )
+                    lrc_display_1 = gr.Textbox(
+                        label=t("results.lrc_label", n=1),
+                        interactive=True,
+                        buttons=["copy"],
+                        lines=8,
+                        max_lines=8,
+                        visible=True
+                    )
+            with gr.Column(visible=True) as audio_col_2:
+                generated_audio_2 = gr.Audio(
+                    label=t("results.generated_music", n=2),
+                    type="filepath",
+                    interactive=False,
+                    buttons=[]
+                )
+                with gr.Row(equal_height=True):
+                    send_to_src_btn_2 = gr.Button(
+                        t("results.send_to_src_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1
+                    )
+                    save_btn_2 = gr.Button(
+                        t("results.save_btn"),
+                        variant="primary",
+                        size="sm",
+                        scale=1
+                    )
+                    score_btn_2 = gr.Button(
+                        t("results.score_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1
+                    )
+                    lrc_btn_2 = gr.Button(
+                        t("results.lrc_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1
+                    )
+                with gr.Accordion(t("results.details_accordion"), open=False, visible=True) as details_accordion_2:
+                    codes_display_2 = gr.Textbox(
+                        label=t("results.codes_label", n=2),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=4,
+                        max_lines=4,
+                        visible=True
+                    )
+                    score_display_2 = gr.Textbox(
+                        label=t("results.quality_score_label", n=2),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=6,
+                        max_lines=6,
+                        visible=True
+                    )
+                    lrc_display_2 = gr.Textbox(
+                        label=t("results.lrc_label", n=2),
+                        interactive=True,
+                        buttons=["copy"],
+                        lines=8,
+                        max_lines=8,
+                        visible=True
+                    )
+            with gr.Column(visible=False) as audio_col_3:
+                generated_audio_3 = gr.Audio(
+                    label=t("results.generated_music", n=3),
+                    type="filepath",
+                    interactive=False,
+                    buttons=[]
+                )
+                with gr.Row(equal_height=True):
+                    send_to_src_btn_3 = gr.Button(
+                        t("results.send_to_src_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1
+                    )
+                    save_btn_3 = gr.Button(
+                        t("results.save_btn"),
+                        variant="primary",
+                        size="sm",
+                        scale=1
+                    )
+                    score_btn_3 = gr.Button(
+                        t("results.score_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1
+                    )
+                    lrc_btn_3 = gr.Button(
+                        t("results.lrc_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1
+                    )
+                with gr.Accordion(t("results.details_accordion"), open=False, visible=True) as details_accordion_3:
+                    codes_display_3 = gr.Textbox(
+                        label=t("results.codes_label", n=3),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=4,
+                        max_lines=4,
+                        visible=True
+                    )
+                    score_display_3 = gr.Textbox(
+                        label=t("results.quality_score_label", n=3),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=6,
+                        max_lines=6,
+                        visible=True
+                    )
+                    lrc_display_3 = gr.Textbox(
+                        label=t("results.lrc_label", n=3),
+                        interactive=True,
+                        buttons=["copy"],
+                        lines=8,
+                        max_lines=8,
+                        visible=True
+                    )
+            with gr.Column(visible=False) as audio_col_4:
+                generated_audio_4 = gr.Audio(
+                    label=t("results.generated_music", n=4),
+                    type="filepath",
+                    interactive=False,
+                    buttons=[]
+                )
+                with gr.Row(equal_height=True):
+                    send_to_src_btn_4 = gr.Button(
+                        t("results.send_to_src_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1
+                    )
+                    save_btn_4 = gr.Button(
+                        t("results.save_btn"),
+                        variant="primary",
+                        size="sm",
+                        scale=1
+                    )
+                    score_btn_4 = gr.Button(
+                        t("results.score_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1
+                    )
+                    lrc_btn_4 = gr.Button(
+                        t("results.lrc_btn"),
+                        variant="secondary",
+                        size="sm",
+                        scale=1
+                    )
+                with gr.Accordion(t("results.details_accordion"), open=False, visible=True) as details_accordion_4:
+                    codes_display_4 = gr.Textbox(
+                        label=t("results.codes_label", n=4),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=4,
+                        max_lines=4,
+                        visible=True
+                    )
+                    score_display_4 = gr.Textbox(
+                        label=t("results.quality_score_label", n=4),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=6,
+                        max_lines=6,
+                        visible=True
+                    )
+                    lrc_display_4 = gr.Textbox(
+                        label=t("results.lrc_label", n=4),
+                        interactive=True,
+                        buttons=["copy"],
+                        lines=8,
+                        max_lines=8,
+                        visible=True
+                    )
+        # Second row for batch size 5-8 (initially hidden)
+        with gr.Row(visible=False) as audio_row_5_8:
+            with gr.Column() as audio_col_5:
+                generated_audio_5 = gr.Audio(
+                    label=t("results.generated_music", n=5),
+                    type="filepath",
+                    interactive=False,
+                    buttons=[]
+                )
+                with gr.Row(equal_height=True):
+                    send_to_src_btn_5 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
+                    save_btn_5 = gr.Button(t("results.save_btn"), variant="primary", size="sm", scale=1)
+                    score_btn_5 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1)
+                    lrc_btn_5 = gr.Button(t("results.lrc_btn"), variant="secondary", size="sm", scale=1)
+                with gr.Accordion(t("results.details_accordion"), open=False, visible=True) as details_accordion_5:
+                    codes_display_5 = gr.Textbox(
+                        label=t("results.codes_label", n=5),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=4,
+                        max_lines=4,
+                        visible=True
+                    )
+                    score_display_5 = gr.Textbox(
+                        label=t("results.quality_score_label", n=5),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=6,
+                        max_lines=6,
+                        visible=True
+                    )
+                    lrc_display_5 = gr.Textbox(
+                        label=t("results.lrc_label", n=5),
+                        interactive=True,
+                        buttons=["copy"],
+                        lines=8,
+                        max_lines=8,
+                        visible=True
+                    )
+            with gr.Column() as audio_col_6:
+                generated_audio_6 = gr.Audio(
+                    label=t("results.generated_music", n=6),
+                    type="filepath",
+                    interactive=False,
+                    buttons=[]
+                )
+                with gr.Row(equal_height=True):
+                    send_to_src_btn_6 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
+                    save_btn_6 = gr.Button(t("results.save_btn"), variant="primary", size="sm", scale=1)
+                    score_btn_6 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1)
+                    lrc_btn_6 = gr.Button(t("results.lrc_btn"), variant="secondary", size="sm", scale=1)
+                with gr.Accordion(t("results.details_accordion"), open=False, visible=True) as details_accordion_6:
+                    codes_display_6 = gr.Textbox(
+                        label=t("results.codes_label", n=6),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=4,
+                        max_lines=4,
+                        visible=True
+                    )
+                    score_display_6 = gr.Textbox(
+                        label=t("results.quality_score_label", n=6),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=6,
+                        max_lines=6,
+                        visible=True
+                    )
+                    lrc_display_6 = gr.Textbox(
+                        label=t("results.lrc_label", n=6),
+                        interactive=True,
+                        buttons=["copy"],
+                        lines=8,
+                        max_lines=8,
+                        visible=True
+                    )
+            with gr.Column() as audio_col_7:
+                generated_audio_7 = gr.Audio(
+                    label=t("results.generated_music", n=7),
+                    type="filepath",
+                    interactive=False,
+                    buttons=[]
+                )
+                with gr.Row(equal_height=True):
+                    send_to_src_btn_7 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
+                    save_btn_7 = gr.Button(t("results.save_btn"), variant="primary", size="sm", scale=1)
+                    score_btn_7 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1)
+                    lrc_btn_7 = gr.Button(t("results.lrc_btn"), variant="secondary", size="sm", scale=1)
+                with gr.Accordion(t("results.details_accordion"), open=False, visible=True) as details_accordion_7:
+                    codes_display_7 = gr.Textbox(
+                        label=t("results.codes_label", n=7),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=4,
+                        max_lines=4,
+                        visible=True
+                    )
+                    score_display_7 = gr.Textbox(
+                        label=t("results.quality_score_label", n=7),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=6,
+                        max_lines=6,
+                        visible=True
+                    )
+                    lrc_display_7 = gr.Textbox(
+                        label=t("results.lrc_label", n=7),
+                        interactive=True,
+                        buttons=["copy"],
+                        lines=8,
+                        max_lines=8,
+                        visible=True
+                    )
+            with gr.Column() as audio_col_8:
+                generated_audio_8 = gr.Audio(
+                    label=t("results.generated_music", n=8),
+                    type="filepath",
+                    interactive=False,
+                    buttons=[]
+                )
+                with gr.Row(equal_height=True):
+                    send_to_src_btn_8 = gr.Button(t("results.send_to_src_btn"), variant="secondary", size="sm", scale=1)
+                    save_btn_8 = gr.Button(t("results.save_btn"), variant="primary", size="sm", scale=1)
+                    score_btn_8 = gr.Button(t("results.score_btn"), variant="secondary", size="sm", scale=1)
+                    lrc_btn_8 = gr.Button(t("results.lrc_btn"), variant="secondary", size="sm", scale=1)
+                with gr.Accordion(t("results.details_accordion"), open=False, visible=True) as details_accordion_8:
+                    codes_display_8 = gr.Textbox(
+                        label=t("results.codes_label", n=8),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=4,
+                        max_lines=4,
+                        visible=True
+                    )
+                    score_display_8 = gr.Textbox(
+                        label=t("results.quality_score_label", n=8),
+                        interactive=False,
+                        buttons=["copy"],
+                        lines=6,
+                        max_lines=6,
+                        visible=True
+                    )
+                    lrc_display_8 = gr.Textbox(
+                        label=t("results.lrc_label", n=8),
+                        interactive=True,
+                        buttons=["copy"],
+                        lines=8,
+                        max_lines=8,
+                        visible=True
+                    )
+        status_output = gr.Textbox(label=t("results.generation_status"), interactive=False)
+        # Batch navigation controls
+        with gr.Row(equal_height=True):
+            prev_batch_btn = gr.Button(
+                t("results.prev_btn"),
+                variant="secondary",
+                interactive=False,
+                scale=1,
+                size="sm"
+            )
+            batch_indicator = gr.Textbox(
+                label=t("results.current_batch"),
+                value=t("results.batch_indicator", current=1, total=1),
+                interactive=False,
+                scale=3
+            )
+            next_batch_status = gr.Textbox(
+                label=t("results.next_batch_status"),
+                value="",
+                interactive=False,
+                scale=3
+            )
+            next_batch_btn = gr.Button(
+                t("results.next_btn"),
+                variant="primary",
+                interactive=False,
+                scale=1,
+                size="sm"
+            )
+        # One-click restore parameters button
+        restore_params_btn = gr.Button(
+            t("results.restore_params_btn"),
+            variant="secondary",
+            interactive=False,  # Initially disabled, enabled after generation
+            size="sm"
+        )
+        with gr.Accordion(t("results.batch_results_title"), open=False):
+            generated_audio_batch = gr.File(
+                label=t("results.all_files_label"),
+                file_count="multiple",
+                interactive=False
+            )
+            generation_info = gr.Markdown(label=t("results.generation_details"))
+    return {
+        "lm_metadata_state": lm_metadata_state,
+        "is_format_caption_state": is_format_caption_state,
+        "current_batch_index": current_batch_index,
+        "total_batches": total_batches,
+        "batch_queue": batch_queue,
+        "generation_params_state": generation_params_state,
+        "is_generating_background": is_generating_background,
+        "status_output": status_output,
+        "prev_batch_btn": prev_batch_btn,
+        "batch_indicator": batch_indicator,
+        "next_batch_btn": next_batch_btn,
+        "next_batch_status": next_batch_status,
+        "restore_params_btn": restore_params_btn,
+        "generated_audio_1": generated_audio_1,
+        "generated_audio_2": generated_audio_2,
+        "generated_audio_3": generated_audio_3,
+        "generated_audio_4": generated_audio_4,
+        "generated_audio_5": generated_audio_5,
+        "generated_audio_6": generated_audio_6,
+        "generated_audio_7": generated_audio_7,
+        "generated_audio_8": generated_audio_8,
+        "audio_row_5_8": audio_row_5_8,
+        "audio_col_1": audio_col_1,
+        "audio_col_2": audio_col_2,
+        "audio_col_3": audio_col_3,
+        "audio_col_4": audio_col_4,
+        "audio_col_5": audio_col_5,
+        "audio_col_6": audio_col_6,
+        "audio_col_7": audio_col_7,
+        "audio_col_8": audio_col_8,
+        "send_to_src_btn_1": send_to_src_btn_1,
+        "send_to_src_btn_2": send_to_src_btn_2,
+        "send_to_src_btn_3": send_to_src_btn_3,
+        "send_to_src_btn_4": send_to_src_btn_4,
+        "send_to_src_btn_5": send_to_src_btn_5,
+        "send_to_src_btn_6": send_to_src_btn_6,
+        "send_to_src_btn_7": send_to_src_btn_7,
+        "send_to_src_btn_8": send_to_src_btn_8,
+        "save_btn_1": save_btn_1,
+        "save_btn_2": save_btn_2,
+        "save_btn_3": save_btn_3,
+        "save_btn_4": save_btn_4,
+        "save_btn_5": save_btn_5,
+        "save_btn_6": save_btn_6,
+        "save_btn_7": save_btn_7,
+        "save_btn_8": save_btn_8,
+        "score_btn_1": score_btn_1,
+        "score_btn_2": score_btn_2,
+        "score_btn_3": score_btn_3,
+        "score_btn_4": score_btn_4,
+        "score_btn_5": score_btn_5,
+        "score_btn_6": score_btn_6,
+        "score_btn_7": score_btn_7,
+        "score_btn_8": score_btn_8,
+        "score_display_1": score_display_1,
+        "score_display_2": score_display_2,
+        "score_display_3": score_display_3,
+        "score_display_4": score_display_4,
+        "score_display_5": score_display_5,
+        "score_display_6": score_display_6,
+        "score_display_7": score_display_7,
+        "score_display_8": score_display_8,
+        "codes_display_1": codes_display_1,
+        "codes_display_2": codes_display_2,
+        "codes_display_3": codes_display_3,
+        "codes_display_4": codes_display_4,
+        "codes_display_5": codes_display_5,
+        "codes_display_6": codes_display_6,
+        "codes_display_7": codes_display_7,
+        "codes_display_8": codes_display_8,
+        "lrc_btn_1": lrc_btn_1,
+        "lrc_btn_2": lrc_btn_2,
+        "lrc_btn_3": lrc_btn_3,
+        "lrc_btn_4": lrc_btn_4,
+        "lrc_btn_5": lrc_btn_5,
+        "lrc_btn_6": lrc_btn_6,
+        "lrc_btn_7": lrc_btn_7,
+        "lrc_btn_8": lrc_btn_8,
+        "lrc_display_1": lrc_display_1,
+        "lrc_display_2": lrc_display_2,
+        "lrc_display_3": lrc_display_3,
+        "lrc_display_4": lrc_display_4,
+        "lrc_display_5": lrc_display_5,
+        "lrc_display_6": lrc_display_6,
+        "lrc_display_7": lrc_display_7,
+        "lrc_display_8": lrc_display_8,
+        "details_accordion_1": details_accordion_1,
+        "details_accordion_2": details_accordion_2,
+        "details_accordion_3": details_accordion_3,
+        "details_accordion_4": details_accordion_4,
+        "details_accordion_5": details_accordion_5,
+        "details_accordion_6": details_accordion_6,
+        "details_accordion_7": details_accordion_7,
+        "details_accordion_8": details_accordion_8,
+        "generated_audio_batch": generated_audio_batch,
+        "generation_info": generation_info,
+    }

code/acestep/gradio_ui/interfaces/training.py ADDED Viewed

	@@ -0,0 +1,562 @@

+"""
+Gradio UI Training Tab Module
+Contains the dataset builder and LoRA training interface components.
+"""
+import os
+import gradio as gr
+from acestep.gradio_ui.i18n import t
+def create_training_section(dit_handler, llm_handler, init_params=None) -> dict:
+    """Create the training tab section with dataset builder and training controls.
+    Args:
+        dit_handler: DiT handler instance
+        llm_handler: LLM handler instance
+        init_params: Dictionary containing initialization parameters and state.
+                    If None, service will not be pre-initialized.
+    Returns:
+        Dictionary of Gradio components for event handling
+    """
+    # Check if running in service mode (hide training tab)
+    service_mode = init_params is not None and init_params.get('service_mode', False)
+    with gr.Tab("🎓 LoRA Training", visible=not service_mode):
+        gr.HTML("""
+        <div style="text-align: center; padding: 10px; margin-bottom: 15px;">
+            <h2>🎵 LoRA Training for ACE-Step</h2>
+            <p>Build datasets from your audio files and train custom LoRA adapters</p>
+        </div>
+        """)
+        with gr.Tabs():
+            # ==================== Dataset Builder Tab ====================
+            with gr.Tab("📁 Dataset Builder"):
+                # ========== Load Existing OR Scan New ==========
+                gr.HTML("""
+                <div style="padding: 10px; margin-bottom: 10px; border: 1px solid #4a4a6a; border-radius: 8px; background: linear-gradient(135deg, #2a2a4a 0%, #1a1a3a 100%);">
+                    <h3 style="margin: 0 0 5px 0;">🚀 Quick Start</h3>
+                    <p style="margin: 0; color: #aaa;">Choose one: <b>Load existing dataset</b> OR <b>Scan new directory</b></p>
+                </div>
+                """)
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        gr.HTML("<h4>📂 Load Existing Dataset</h4>")
+                        with gr.Row():
+                            load_json_path = gr.Textbox(
+                                label="Dataset JSON Path",
+                                placeholder="./datasets/my_lora_dataset.json",
+                                info="Load a previously saved dataset",
+                                scale=3,
+                            )
+                            load_json_btn = gr.Button("📂 Load", variant="primary", scale=1)
+                        load_json_status = gr.Textbox(
+                            label="Load Status",
+                            interactive=False,
+                        )
+                    with gr.Column(scale=1):
+                        gr.HTML("<h4>🔍 Scan New Directory</h4>")
+                        with gr.Row():
+                            audio_directory = gr.Textbox(
+                                label="Audio Directory Path",
+                                placeholder="/path/to/your/audio/folder",
+                                info="Scan for audio files (wav, mp3, flac, ogg, opus)",
+                                scale=3,
+                            )
+                            scan_btn = gr.Button("🔍 Scan", variant="secondary", scale=1)
+                        scan_status = gr.Textbox(
+                            label="Scan Status",
+                            interactive=False,
+                        )
+                gr.HTML("<hr>")
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        # Audio files table
+                        audio_files_table = gr.Dataframe(
+                            headers=["#", "Filename", "Duration", "Labeled", "BPM", "Key", "Caption"],
+                            datatype=["number", "str", "str", "str", "str", "str", "str"],
+                            label="Found Audio Files",
+                            interactive=False,
+                            wrap=True,
+                        )
+                    with gr.Column(scale=1):
+                        gr.HTML("<h3>⚙️ Dataset Settings</h3>")
+                        dataset_name = gr.Textbox(
+                            label="Dataset Name",
+                            value="my_lora_dataset",
+                            placeholder="Enter dataset name",
+                        )
+                        all_instrumental = gr.Checkbox(
+                            label="All Instrumental",
+                            value=True,
+                            info="Check if all tracks are instrumental (no vocals)",
+                        )
+                        need_lyrics = gr.Checkbox(
+                            label="Transcribe Lyrics",
+                            value=False,
+                            info="Attempt to transcribe lyrics (slower)",
+                            interactive=False,  # Disabled for now
+                        )
+                        custom_tag = gr.Textbox(
+                            label="Custom Activation Tag",
+                            placeholder="e.g., 8bit_retro, my_style",
+                            info="Unique tag to activate this LoRA's style",
+                        )
+                        tag_position = gr.Radio(
+                            choices=[
+                                ("Prepend (tag, caption)", "prepend"),
+                                ("Append (caption, tag)", "append"),
+                                ("Replace caption", "replace"),
+                            ],
+                            value="replace",
+                            label="Tag Position",
+                            info="Where to place the custom tag in the caption",
+                        )
+                gr.HTML("<hr><h3>🤖 Step 2: Auto-Label with AI</h3>")
+                with gr.Row():
+                    with gr.Column(scale=3):
+                        gr.Markdown("""
+                        Click the button below to automatically generate metadata for all audio files using AI:
+                        - **Caption**: Music style, genre, mood description
+                        - **BPM**: Beats per minute
+                        - **Key**: Musical key (e.g., C Major, Am)
+                        - **Time Signature**: 4/4, 3/4, etc.
+                        """)
+                        skip_metas = gr.Checkbox(
+                            label="Skip Metas (No LLM)",
+                            value=False,
+                            info="Skip AI labeling. BPM/Key/Time Signature will be N/A, Language will be 'unknown' for instrumental",
+                        )
+                    with gr.Column(scale=1):
+                        auto_label_btn = gr.Button(
+                            "🏷️ Auto-Label All",
+                            variant="primary",
+                            size="lg",
+                        )
+                label_progress = gr.Textbox(
+                    label="Labeling Progress",
+                    interactive=False,
+                    lines=2,
+                )
+                gr.HTML("<hr><h3>👀 Step 3: Preview & Edit</h3>")
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        sample_selector = gr.Slider(
+                            minimum=0,
+                            maximum=0,
+                            step=1,
+                            value=0,
+                            label="Select Sample #",
+                            info="Choose a sample to preview and edit",
+                        )
+                        preview_audio = gr.Audio(
+                            label="Audio Preview",
+                            type="filepath",
+                            interactive=False,
+                        )
+                        preview_filename = gr.Textbox(
+                            label="Filename",
+                            interactive=False,
+                        )
+                    with gr.Column(scale=2):
+                        with gr.Row():
+                            edit_caption = gr.Textbox(
+                                label="Caption",
+                                lines=3,
+                                placeholder="Music description...",
+                            )
+                        with gr.Row():
+                            edit_lyrics = gr.Textbox(
+                                label="Lyrics",
+                                lines=4,
+                                placeholder="[Verse 1]\nLyrics here...\n\n[Chorus]\n...",
+                            )
+                        with gr.Row():
+                            edit_bpm = gr.Number(
+                                label="BPM",
+                                precision=0,
+                            )
+                            edit_keyscale = gr.Textbox(
+                                label="Key",
+                                placeholder="C Major",
+                            )
+                            edit_timesig = gr.Dropdown(
+                                choices=["", "2", "3", "4", "6"],
+                                label="Time Signature",
+                            )
+                            edit_duration = gr.Number(
+                                label="Duration (s)",
+                                precision=1,
+                                interactive=False,
+                            )
+                        with gr.Row():
+                            edit_language = gr.Dropdown(
+                                choices=["instrumental", "en", "zh", "ja", "ko", "es", "fr", "de", "pt", "ru", "unknown"],
+                                value="instrumental",
+                                label="Language",
+                            )
+                            edit_instrumental = gr.Checkbox(
+                                label="Instrumental",
+                                value=True,
+                            )
+                            save_edit_btn = gr.Button("💾 Save Changes", variant="secondary")
+                        edit_status = gr.Textbox(
+                            label="Edit Status",
+                            interactive=False,
+                        )
+                gr.HTML("<hr><h3>💾 Step 4: Save Dataset</h3>")
+                with gr.Row():
+                    with gr.Column(scale=3):
+                        save_path = gr.Textbox(
+                            label="Save Path",
+                            value="./datasets/my_lora_dataset.json",
+                            placeholder="./datasets/dataset_name.json",
+                            info="Path where the dataset JSON will be saved",
+                        )
+                    with gr.Column(scale=1):
+                        save_dataset_btn = gr.Button(
+                            "💾 Save Dataset",
+                            variant="primary",
+                            size="lg",
+                        )
+                save_status = gr.Textbox(
+                    label="Save Status",
+                    interactive=False,
+                    lines=2,
+                )
+                gr.HTML("<hr><h3>⚡ Step 5: Preprocess to Tensors</h3>")
+                gr.Markdown("""
+                **Preprocessing converts your dataset to pre-computed tensors for fast training.**
+                You can either:
+                - Use the dataset from Steps 1-4 above, **OR**
+                - Load an existing dataset JSON file (if you've already saved one)
+                """)
+                with gr.Row():
+                    with gr.Column(scale=3):
+                        load_existing_dataset_path = gr.Textbox(
+                            label="Load Existing Dataset (Optional)",
+                            placeholder="./datasets/my_lora_dataset.json",
+                            info="Path to a previously saved dataset JSON file",
+                        )
+                    with gr.Column(scale=1):
+                        load_existing_dataset_btn = gr.Button(
+                            "📂 Load Dataset",
+                            variant="secondary",
+                            size="lg",
+                        )
+                load_existing_status = gr.Textbox(
+                    label="Load Status",
+                    interactive=False,
+                )
+                gr.Markdown("""
+                This step:
+                - Encodes audio to VAE latents
+                - Encodes captions and lyrics to text embeddings
+                - Runs the condition encoder
+                - Saves all tensors to `.pt` files
+                ⚠️ **This requires the model to be loaded and may take a few minutes.**
+                """)
+                with gr.Row():
+                    with gr.Column(scale=3):
+                        preprocess_output_dir = gr.Textbox(
+                            label="Tensor Output Directory",
+                            value="./datasets/preprocessed_tensors",
+                            placeholder="./datasets/preprocessed_tensors",
+                            info="Directory to save preprocessed tensor files",
+                        )
+                    with gr.Column(scale=1):
+                        preprocess_btn = gr.Button(
+                            "⚡ Preprocess",
+                            variant="primary",
+                            size="lg",
+                        )
+                preprocess_progress = gr.Textbox(
+                    label="Preprocessing Progress",
+                    interactive=False,
+                    lines=3,
+                )
+            # ==================== Training Tab ====================
+            with gr.Tab("🚀 Train LoRA"):
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        gr.HTML("<h3>📊 Preprocessed Dataset Selection</h3>")
+                        gr.Markdown("""
+                        Select the directory containing preprocessed tensor files (`.pt` files).
+                        These are created in the "Dataset Builder" tab using the "Preprocess" button.
+                        """)
+                        training_tensor_dir = gr.Textbox(
+                            label="Preprocessed Tensors Directory",
+                            placeholder="./datasets/preprocessed_tensors",
+                            value="./datasets/preprocessed_tensors",
+                            info="Directory containing preprocessed .pt tensor files",
+                        )
+                        load_dataset_btn = gr.Button("📂 Load Dataset", variant="secondary")
+                        training_dataset_info = gr.Textbox(
+                            label="Dataset Info",
+                            interactive=False,
+                            lines=3,
+                        )
+                    with gr.Column(scale=1):
+                        gr.HTML("<h3>⚙️ LoRA Settings</h3>")
+                        lora_rank = gr.Slider(
+                            minimum=4,
+                            maximum=256,
+                            step=4,
+                            value=64,
+                            label="LoRA Rank (r)",
+                            info="Higher = more capacity, more memory",
+                        )
+                        lora_alpha = gr.Slider(
+                            minimum=4,
+                            maximum=512,
+                            step=4,
+                            value=128,
+                            label="LoRA Alpha",
+                            info="Scaling factor (typically 2x rank)",
+                        )
+                        lora_dropout = gr.Slider(
+                            minimum=0.0,
+                            maximum=0.5,
+                            step=0.05,
+                            value=0.1,
+                            label="LoRA Dropout",
+                        )
+                gr.HTML("<hr><h3>🎛️ Training Parameters</h3>")
+                with gr.Row():
+                    learning_rate = gr.Number(
+                        label="Learning Rate",
+                        value=1e-4,
+                        info="Start with 1e-4, adjust if needed",
+                    )
+                    train_epochs = gr.Slider(
+                        minimum=100,
+                        maximum=4000,
+                        step=100,
+                        value=500,
+                        label="Max Epochs",
+                    )
+                    train_batch_size = gr.Slider(
+                        minimum=1,
+                        maximum=8,
+                        step=1,
+                        value=1,
+                        label="Batch Size",
+                        info="Increase if you have enough VRAM",
+                    )
+                    gradient_accumulation = gr.Slider(
+                        minimum=1,
+                        maximum=16,
+                        step=1,
+                        value=1,
+                        label="Gradient Accumulation",
+                        info="Effective batch = batch_size × accumulation",
+                    )
+                with gr.Row():
+                    save_every_n_epochs = gr.Slider(
+                        minimum=50,
+                        maximum=1000,
+                        step=50,
+                        value=200,
+                        label="Save Every N Epochs",
+                    )
+                    training_shift = gr.Slider(
+                        minimum=1.0,
+                        maximum=5.0,
+                        step=0.5,
+                        value=3.0,
+                        label="Shift",
+                        info="Timestep shift for turbo model",
+                    )
+                    training_seed = gr.Number(
+                        label="Seed",
+                        value=42,
+                        precision=0,
+                    )
+                with gr.Row():
+                    lora_output_dir = gr.Textbox(
+                        label="Output Directory",
+                        value="./lora_output",
+                        placeholder="./lora_output",
+                        info="Directory to save trained LoRA weights",
+                    )
+                gr.HTML("<hr>")
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        start_training_btn = gr.Button(
+                            "🚀 Start Training",
+                            variant="primary",
+                            size="lg",
+                        )
+                    with gr.Column(scale=1):
+                        stop_training_btn = gr.Button(
+                            "⏹️ Stop Training",
+                            variant="stop",
+                            size="lg",
+                        )
+                training_progress = gr.Textbox(
+                    label="Training Progress",
+                    interactive=False,
+                    lines=2,
+                )
+                with gr.Row():
+                    training_log = gr.Textbox(
+                        label="Training Log",
+                        interactive=False,
+                        lines=10,
+                        max_lines=15,
+                        scale=1,
+                    )
+                    training_loss_plot = gr.LinePlot(
+                        x="step",
+                        y="loss",
+                        title="Training Loss",
+                        x_title="Step",
+                        y_title="Loss",
+                        scale=1,
+                    )
+                gr.HTML("<hr><h3>📦 Export LoRA</h3>")
+                with gr.Row():
+                    export_path = gr.Textbox(
+                        label="Export Path",
+                        value="./lora_output/final_lora",
+                        placeholder="./lora_output/my_lora",
+                    )
+                    export_lora_btn = gr.Button("📦 Export LoRA", variant="secondary")
+                export_status = gr.Textbox(
+                    label="Export Status",
+                    interactive=False,
+                )
+    # Store dataset builder state
+    dataset_builder_state = gr.State(None)
+    training_state = gr.State({"is_training": False, "should_stop": False})
+    return {
+        # Dataset Builder - Load or Scan
+        "load_json_path": load_json_path,
+        "load_json_btn": load_json_btn,
+        "load_json_status": load_json_status,
+        "audio_directory": audio_directory,
+        "scan_btn": scan_btn,
+        "scan_status": scan_status,
+        "audio_files_table": audio_files_table,
+        "dataset_name": dataset_name,
+        "all_instrumental": all_instrumental,
+        "need_lyrics": need_lyrics,
+        "custom_tag": custom_tag,
+        "tag_position": tag_position,
+        "skip_metas": skip_metas,
+        "auto_label_btn": auto_label_btn,
+        "label_progress": label_progress,
+        "sample_selector": sample_selector,
+        "preview_audio": preview_audio,
+        "preview_filename": preview_filename,
+        "edit_caption": edit_caption,
+        "edit_lyrics": edit_lyrics,
+        "edit_bpm": edit_bpm,
+        "edit_keyscale": edit_keyscale,
+        "edit_timesig": edit_timesig,
+        "edit_duration": edit_duration,
+        "edit_language": edit_language,
+        "edit_instrumental": edit_instrumental,
+        "save_edit_btn": save_edit_btn,
+        "edit_status": edit_status,
+        "save_path": save_path,
+        "save_dataset_btn": save_dataset_btn,
+        "save_status": save_status,
+        # Preprocessing
+        "load_existing_dataset_path": load_existing_dataset_path,
+        "load_existing_dataset_btn": load_existing_dataset_btn,
+        "load_existing_status": load_existing_status,
+        "preprocess_output_dir": preprocess_output_dir,
+        "preprocess_btn": preprocess_btn,
+        "preprocess_progress": preprocess_progress,
+        "dataset_builder_state": dataset_builder_state,
+        # Training
+        "training_tensor_dir": training_tensor_dir,
+        "load_dataset_btn": load_dataset_btn,
+        "training_dataset_info": training_dataset_info,
+        "lora_rank": lora_rank,
+        "lora_alpha": lora_alpha,
+        "lora_dropout": lora_dropout,
+        "learning_rate": learning_rate,
+        "train_epochs": train_epochs,
+        "train_batch_size": train_batch_size,
+        "gradient_accumulation": gradient_accumulation,
+        "save_every_n_epochs": save_every_n_epochs,
+        "training_shift": training_shift,
+        "training_seed": training_seed,
+        "lora_output_dir": lora_output_dir,
+        "start_training_btn": start_training_btn,
+        "stop_training_btn": stop_training_btn,
+        "training_progress": training_progress,
+        "training_log": training_log,
+        "training_loss_plot": training_loss_plot,
+        "export_path": export_path,
+        "export_lora_btn": export_lora_btn,
+        "export_status": export_status,
+        "training_state": training_state,
+    }

code/acestep/handler.py ADDED Viewed

The diff for this file is too large to render. See raw diff

code/acestep/inference.py ADDED Viewed

	@@ -0,0 +1,1164 @@

+"""
+ACE-Step Inference API Module
+This module provides a standardized inference interface for music generation,
+designed for third-party integration. It offers both a simplified API and
+backward-compatible Gradio UI support.
+"""
+import math
+import os
+import tempfile
+from typing import Optional, Union, List, Dict, Any, Tuple
+from dataclasses import dataclass, field, asdict
+from loguru import logger
+from acestep.audio_utils import AudioSaver, generate_uuid_from_params
+@dataclass
+class GenerationParams:
+    """Per-request parameters describing what music to generate and how.
+    Every field has a default, so callers only set what they need.
+    Attributes:
+        # Text Inputs
+        caption: A short text prompt describing the desired music (main prompt). < 512 characters
+        lyrics: Lyrics for the music. Use "[Instrumental]" for instrumental songs. < 4096 characters
+        instrumental: If True, generate instrumental music regardless of lyrics.
+        # Music Metadata
+        bpm: BPM (beats per minute), e.g., 120. Set to None for automatic estimation. 30 ~ 300
+        keyscale: Musical key (e.g., "C Major", "Am"). Leave empty for auto-detection. A-G, #/♭, major/minor
+        timesignature: Time signature (2 for '2/4', 3 for '3/4', 4 for '4/4', 6 for '6/8'). Leave empty for auto-detection.
+            NOTE(review): the field is annotated as str, so these values are presumably passed as strings - confirm.
+        vocal_language: Language code for vocals, e.g., "en", "zh", "ja", or "unknown". see acestep/constants.py:VALID_LANGUAGES
+        duration: Target audio length in seconds. If <0 or None, model chooses automatically (default -1.0). 10 ~ 600
+        # Generation Parameters
+        inference_steps: Number of diffusion steps (e.g., 8 for turbo, 32–100 for base model).
+        guidance_scale: CFG (classifier-free guidance) strength. Higher means following the prompt more strictly. Only supported for non-turbo models.
+        seed: Integer seed for reproducibility. -1 means use random seed each time.
+        # Advanced DiT Parameters
+        use_adg: Whether to use Adaptive Dual Guidance (only works for base model).
+        cfg_interval_start: Start ratio (0.0–1.0) to apply CFG.
+        cfg_interval_end: End ratio (0.0–1.0) to apply CFG.
+        shift: Timestep shift factor (default 1.0). When != 1.0, applies t = shift * t / (1 + (shift - 1) * t) to timesteps.
+        infer_method: Diffusion inference method, "ode" or "sde".
+        timesteps: Optional explicit list of timesteps. If provided, overrides inference_steps and shift.
+        # Task-Specific Parameters
+        task_type: Type of generation task. One of: "text2music", "cover", "repaint", "lego", "extract", "complete".
+        reference_audio: Path to a reference audio file for style transfer or cover tasks.
+        src_audio: Path to a source audio file for audio-to-audio tasks.
+        audio_codes: Audio semantic codes as a string (advanced use, for code-control generation).
+        repainting_start: For repaint/lego tasks: start time in seconds for region to repaint.
+        repainting_end: For repaint/lego tasks: end time in seconds for region to repaint (-1 for until end).
+        audio_cover_strength: Strength of reference audio/codes influence (range 0.0–1.0). Set smaller (0.2) for style transfer tasks.
+        instruction: Optional task instruction prompt. If empty, auto-generated by system.
+        # 5Hz Language Model Parameters for CoT reasoning
+        thinking: If True, enable 5Hz Language Model "Chain-of-Thought" reasoning for semantic/music metadata and codes.
+        lm_temperature: Sampling temperature for the LLM (0.0–2.0). Higher = more creative/varied results.
+        lm_cfg_scale: Classifier-free guidance scale for the LLM.
+        lm_top_k: LLM top-k sampling (0 = disabled).
+        lm_top_p: LLM top-p nucleus sampling (1.0 = disabled).
+        lm_negative_prompt: Negative prompt to use for LLM (for control).
+        use_cot_metas: Whether to let LLM generate music metadata via CoT reasoning.
+        use_cot_caption: Whether to let LLM rewrite or format the input caption via CoT reasoning.
+        use_cot_lyrics: Reserved flag for LLM lyric generation; marked as not used yet.
+        use_cot_language: Whether to let LLM detect vocal language via CoT.
+        use_constrained_decoding: Forwarded to the LLM handler as its use_constrained_decoding option.
+        # LM (CoT) Outputs
+        cot_bpm, cot_keyscale, cot_timesignature, cot_duration, cot_vocal_language, cot_caption, cot_lyrics:
+            Values resolved after the LM step. generate_music writes them back onto this object
+            for each field whose user-supplied counterpart is empty/falsy.
+    """
+    # Required Inputs
+    task_type: str = "text2music"
+    instruction: str = "Fill the audio semantic mask based on the given conditions:"
+    # Audio Uploads
+    reference_audio: Optional[str] = None
+    src_audio: Optional[str] = None
+    # LM Codes Hints
+    audio_codes: str = ""
+    # Text Inputs
+    caption: str = ""
+    lyrics: str = ""
+    instrumental: bool = False
+    # Metadata
+    vocal_language: str = "unknown"
+    bpm: Optional[int] = None
+    keyscale: str = ""
+    timesignature: str = ""
+    duration: float = -1.0
+    # Advanced Settings
+    inference_steps: int = 8
+    seed: int = -1
+    guidance_scale: float = 7.0
+    use_adg: bool = False
+    cfg_interval_start: float = 0.0
+    cfg_interval_end: float = 1.0
+    shift: float = 1.0
+    infer_method: str = "ode"  # "ode" or "sde" - diffusion inference method
+    # Custom timesteps (parsed from string like "0.97,0.76,0.615,0.5,0.395,0.28,0.18,0.085,0")
+    # If provided, overrides inference_steps and shift
+    timesteps: Optional[List[float]] = None
+    repainting_start: float = 0.0
+    repainting_end: float = -1
+    audio_cover_strength: float = 1.0
+    # 5Hz Language Model Parameters
+    thinking: bool = True
+    lm_temperature: float = 0.85
+    lm_cfg_scale: float = 2.0
+    lm_top_k: int = 0
+    lm_top_p: float = 0.9
+    lm_negative_prompt: str = "NO USER INPUT"
+    use_cot_metas: bool = True
+    use_cot_caption: bool = True
+    use_cot_lyrics: bool = False  # TODO: not used yet
+    use_cot_language: bool = True
+    use_constrained_decoding: bool = True
+    # Outputs of the LM CoT step; filled in by generate_music, not by the caller.
+    cot_bpm: Optional[int] = None
+    cot_keyscale: str = ""
+    cot_timesignature: str = ""
+    cot_duration: Optional[float] = None
+    cot_vocal_language: str = "unknown"
+    cot_caption: str = ""
+    cot_lyrics: str = ""
+    def to_dict(self) -> Dict[str, Any]:
+        """Return all fields as a plain dict (via dataclasses.asdict) for JSON serialization."""
+        return asdict(self)
+@dataclass
+class GenerationConfig:
+    """Batch-level settings for a generation run (batching, seeding, output format).
+    Attributes:
+        batch_size: Number of audio samples to generate
+        allow_lm_batch: Whether to allow batch processing in LM
+        use_random_seed: Whether to use random seed
+        seeds: Seed(s) for batch generation. Can be:
+            - None: Use random seeds (when use_random_seed=True) or params.seed (when use_random_seed=False)
+            - List[int]: List of seeds, will be padded with random seeds if fewer than batch_size
+            - int: Single seed value (will be converted to list and padded)
+              NOTE(review): the field is annotated Optional[List[int]] and generate_music
+              calls len() on it, so a bare int looks unsupported there - confirm.
+        lm_batch_chunk_size: Batch chunk size for LM processing. generate_music falls back to
+            batch_size when this is not positive.
+        constrained_decoding_debug: Whether to enable constrained decoding debug
+        audio_format: Output audio format, one of "mp3", "wav", "flac". Default: "flac"
+    """
+    batch_size: int = 2
+    allow_lm_batch: bool = False
+    use_random_seed: bool = True
+    seeds: Optional[List[int]] = None
+    lm_batch_chunk_size: int = 8
+    constrained_decoding_debug: bool = False
+    audio_format: str = "flac"  # Default to FLAC for fast saving
+    def to_dict(self) -> Dict[str, Any]:
+        """Return all fields as a plain dict (via dataclasses.asdict) for JSON serialization."""
+        return asdict(self)
+@dataclass
+class GenerationResult:
+    """Outcome of a generate_music call, for both success and failure.
+    On failure generate_music returns an instance with an empty `audios` list,
+    `success=False` and `error` set.
+    Attributes:
+        # Audio Outputs
+        audios: List of audio dictionaries with paths, keys, params
+        status_message: Status message from generation
+        extra_outputs: Extra outputs from generation
+        success: Whether generation completed successfully
+        error: Error message if generation failed, otherwise None
+    """
+    # Audio Outputs
+    audios: List[Dict[str, Any]] = field(default_factory=list)
+    # Generation Information
+    status_message: str = ""
+    extra_outputs: Dict[str, Any] = field(default_factory=dict)
+    # Success Status
+    success: bool = True
+    error: Optional[str] = None
+    def to_dict(self) -> Dict[str, Any]:
+        """Return all fields as a plain dict (via dataclasses.asdict).
+        NOTE(review): whether the result is JSON-serializable depends on what
+        `audios` and `extra_outputs` contain - confirm against the producers.
+        """
+        return asdict(self)
+@dataclass
+class UnderstandResult:
+    """Result of music understanding from audio codes.
+    All metadata fields default to empty/None, so a failed run can be
+    represented with only the status fields populated.
+    Attributes:
+        # Metadata Fields
+        caption: Generated caption describing the music
+        lyrics: Generated or extracted lyrics
+        bpm: Beats per minute (None if not detected)
+        duration: Duration in seconds (None if not detected)
+        keyscale: Musical key (e.g., "C Major")
+        language: Vocal language code (e.g., "en", "zh")
+        timesignature: Time signature (e.g., "4/4")
+        # Status
+        status_message: Status message from understanding
+        success: Whether understanding completed successfully
+        error: Error message if understanding failed, otherwise None
+    """
+    # Metadata Fields
+    caption: str = ""
+    lyrics: str = ""
+    bpm: Optional[int] = None
+    duration: Optional[float] = None
+    keyscale: str = ""
+    language: str = ""
+    timesignature: str = ""
+    # Status
+    status_message: str = ""
+    success: bool = True
+    error: Optional[str] = None
+    def to_dict(self) -> Dict[str, Any]:
+        """Return all fields as a plain dict (via dataclasses.asdict) for JSON serialization."""
+        return asdict(self)
+def _update_metadata_from_lm(
+    metadata: Dict[str, Any],
+    bpm: Optional[int],
+    key_scale: str,
+    time_signature: str,
+    audio_duration: Optional[float],
+    vocal_language: str,
+    caption: str,
+    lyrics: str,
+) -> Tuple[Optional[int], str, str, Optional[float]]:
+    """Update metadata fields from LM output if not provided by user."""
+    if bpm is None and metadata.get('bpm'):
+        bpm_value = metadata.get('bpm')
+        if bpm_value not in ["N/A", ""]:
+            try:
+                bpm = int(bpm_value)
+            except (ValueError, TypeError):
+                pass
+    if not key_scale and metadata.get('keyscale'):
+        key_scale_value = metadata.get('keyscale', metadata.get('key_scale', ""))
+        if key_scale_value != "N/A":
+            key_scale = key_scale_value
+    if not time_signature and metadata.get('timesignature'):
+        time_signature_value = metadata.get('timesignature', metadata.get('time_signature', ""))
+        if time_signature_value != "N/A":
+            time_signature = time_signature_value
+    if audio_duration is None:
+        audio_duration_value = metadata.get('duration', -1)
+        if audio_duration_value not in ["N/A", ""]:
+            try:
+                audio_duration = float(audio_duration_value)
+            except (ValueError, TypeError):
+                pass
+    if not vocal_language and metadata.get('vocal_language'):
+        vocal_language = metadata.get('vocal_language')
+    if not caption and metadata.get('caption'):
+        caption = metadata.get('caption')
+    if not lyrics and metadata.get('lyrics'):
+        lyrics = metadata.get('lyrics')
+    return bpm, key_scale, time_signature, audio_duration, vocal_language, caption, lyrics
+def generate_music(
+    dit_handler,
+    llm_handler,
+    params: GenerationParams,
+    config: GenerationConfig,
+    save_dir: Optional[str] = None,
+    progress=None,
+) -> GenerationResult:
+    """Generate music using ACE-Step model with optional LM reasoning.
+    Args:
+        dit_handler: Initialized DiT model handler (AceStepHandler instance)
+        llm_handler: Initialized LLM handler (LLMHandler instance)
+        params: Generation parameters (GenerationParams instance)
+        config: Generation configuration (GenerationConfig instance)
+    Returns:
+        GenerationResult with generated audio files and metadata
+    """
+    try:
+        # Phase 1: LM-based metadata and code generation (if enabled)
+        audio_code_string_to_use = params.audio_codes
+        lm_generated_metadata = None
+        lm_generated_audio_codes_list = []
+        lm_total_time_costs = {
+            "phase1_time": 0.0,
+            "phase2_time": 0.0,
+            "total_time": 0.0,
+        }
+        # Extract mutable copies of metadata (will be updated by LM if needed)
+        bpm = params.bpm
+        key_scale = params.keyscale
+        time_signature = params.timesignature
+        audio_duration = params.duration
+        dit_input_caption = params.caption
+        dit_input_vocal_language = params.vocal_language
+        dit_input_lyrics = params.lyrics
+        # Determine if we need to generate audio codes
+        # If user has provided audio_codes, we don't need to generate them
+        # Otherwise, check if we need audio codes (lm_dit mode) or just metas (dit mode)
+        user_provided_audio_codes = bool(params.audio_codes and str(params.audio_codes).strip())
+        # Determine infer_type: use "llm_dit" if we need audio codes, "dit" if only metas needed
+        # For now, we use "llm_dit" if batch mode or if user hasn't provided codes
+        # Use "dit" if user has provided codes (only need metas) or if explicitly only need metas
+        # Note: This logic can be refined based on specific requirements
+        need_audio_codes = not user_provided_audio_codes
+        # Determine if we should use chunk-based LM generation (always use chunks for consistency)
+        # Determine actual batch size for chunk processing
+        actual_batch_size = config.batch_size if config.batch_size is not None else 1
+        # Prepare seeds for batch generation
+        # Use config.seed if provided, otherwise fallback to params.seed
+        # Convert config.seed (None, int, or List[int]) to format that prepare_seeds accepts
+        seed_for_generation = ""
+        if config.seeds is not None and len(config.seeds) > 0:
+            if isinstance(config.seeds, list):
+                # Convert List[int] to comma-separated string
+                seed_for_generation = ",".join(str(s) for s in config.seeds)
+        # Use dit_handler.prepare_seeds to handle seed list generation and padding
+        # This will handle all the logic: padding with random seeds if needed, etc.
+        actual_seed_list, _ = dit_handler.prepare_seeds(actual_batch_size, seed_for_generation, config.use_random_seed)
+        # LM-based Chain-of-Thought reasoning
+        # Skip LM for cover/repaint tasks - these tasks use reference/src audio directly
+        # and don't need LM to generate audio codes
+        skip_lm_tasks = {"cover", "repaint"}
+        # Determine if we should use LLM
+        # LLM is needed for:
+        # 1. thinking=True: generate audio codes via LM
+        # 2. use_cot_caption=True: enhance/generate caption via CoT
+        # 3. use_cot_language=True: detect vocal language via CoT
+        # 4. use_cot_metas=True: fill missing metadata via CoT
+        need_lm_for_cot = params.use_cot_caption or params.use_cot_language or params.use_cot_metas
+        use_lm = (params.thinking or need_lm_for_cot) and llm_handler.llm_initialized and params.task_type not in skip_lm_tasks
+        lm_status = []
+        if params.task_type in skip_lm_tasks:
+            logger.info(f"Skipping LM for task_type='{params.task_type}' - using DiT directly")
+        logger.info(f"[generate_music] LLM usage decision: thinking={params.thinking}, "
+                   f"use_cot_caption={params.use_cot_caption}, use_cot_language={params.use_cot_language}, "
+                   f"use_cot_metas={params.use_cot_metas}, need_lm_for_cot={need_lm_for_cot}, "
+                   f"llm_initialized={llm_handler.llm_initialized if llm_handler else False}, use_lm={use_lm}")
+        if use_lm:
+            # Convert sampling parameters - handle None values safely
+            top_k_value = None if not params.lm_top_k or params.lm_top_k == 0 else int(params.lm_top_k)
+            top_p_value = None if not params.lm_top_p or params.lm_top_p >= 1.0 else params.lm_top_p
+            # Build user_metadata from user-provided values
+            user_metadata = {}
+            if bpm is not None:
+                try:
+                    bpm_value = float(bpm)
+                    if bpm_value > 0:
+                        user_metadata['bpm'] = int(bpm_value)
+                except (ValueError, TypeError):
+                    pass
+            if key_scale and key_scale.strip():
+                key_scale_clean = key_scale.strip()
+                if key_scale_clean.lower() not in ["n/a", ""]:
+                    user_metadata['keyscale'] = key_scale_clean
+            if time_signature and time_signature.strip():
+                time_sig_clean = time_signature.strip()
+                if time_sig_clean.lower() not in ["n/a", ""]:
+                    user_metadata['timesignature'] = time_sig_clean
+            if audio_duration is not None:
+                try:
+                    duration_value = float(audio_duration)
+                    if duration_value > 0:
+                        user_metadata['duration'] = int(duration_value)
+                except (ValueError, TypeError):
+                    pass
+            user_metadata_to_pass = user_metadata if user_metadata else None
+            # Determine infer_type based on whether we need audio codes
+            # - "llm_dit": generates both metas and audio codes (two-phase internally)
+            # - "dit": generates only metas (single phase)
+            infer_type = "llm_dit" if need_audio_codes and params.thinking else "dit"
+            # Use chunk size from config, or default to batch_size if not set
+            max_inference_batch_size = int(config.lm_batch_chunk_size) if config.lm_batch_chunk_size > 0 else actual_batch_size
+            num_chunks = math.ceil(actual_batch_size / max_inference_batch_size)
+            all_metadata_list = []
+            all_audio_codes_list = []
+            for chunk_idx in range(num_chunks):
+                chunk_start = chunk_idx * max_inference_batch_size
+                chunk_end = min(chunk_start + max_inference_batch_size, actual_batch_size)
+                chunk_size = chunk_end - chunk_start
+                chunk_seeds = actual_seed_list[chunk_start:chunk_end] if chunk_start < len(actual_seed_list) else None
+                logger.info(f"LM chunk {chunk_idx+1}/{num_chunks} (infer_type={infer_type}) "
+                            f"(size: {chunk_size}, seeds: {chunk_seeds})")
+                # Use the determined infer_type
+                # - "llm_dit" will internally run two phases (metas + codes)
+                # - "dit" will only run phase 1 (metas only)
+                result = llm_handler.generate_with_stop_condition(
+                    caption=params.caption or "",
+                    lyrics=params.lyrics or "",
+                    infer_type=infer_type,
+                    temperature=params.lm_temperature,
+                    cfg_scale=params.lm_cfg_scale,
+                    negative_prompt=params.lm_negative_prompt,
+                    top_k=top_k_value,
+                    top_p=top_p_value,
+                    user_metadata=user_metadata_to_pass,
+                    use_cot_caption=params.use_cot_caption,
+                    use_cot_language=params.use_cot_language,
+                    use_cot_metas=params.use_cot_metas,
+                    use_constrained_decoding=params.use_constrained_decoding,
+                    constrained_decoding_debug=config.constrained_decoding_debug,
+                    batch_size=chunk_size,
+                    seeds=chunk_seeds,
+                    progress=progress,
+                )
+                # Check if LM generation failed
+                if not result.get("success", False):
+                    error_msg = result.get("error", "Unknown LM error")
+                    lm_status.append(f"❌ LM Error: {error_msg}")
+                    # Return early with error
+                    return GenerationResult(
+                        audios=[],
+                        status_message=f"❌ LM generation failed: {error_msg}",
+                        extra_outputs={},
+                        success=False,
+                        error=error_msg,
+                    )
+                # Extract metadata and audio_codes from result dict
+                if chunk_size > 1:
+                    metadata_list = result.get("metadata", [])
+                    audio_codes_list = result.get("audio_codes", [])
+                    all_metadata_list.extend(metadata_list)
+                    all_audio_codes_list.extend(audio_codes_list)
+                else:
+                    metadata = result.get("metadata", {})
+                    audio_codes = result.get("audio_codes", "")
+                    all_metadata_list.append(metadata)
+                    all_audio_codes_list.append(audio_codes)
+                # Collect time costs from LM extra_outputs
+                lm_extra = result.get("extra_outputs", {})
+                lm_chunk_time_costs = lm_extra.get("time_costs", {})
+                if lm_chunk_time_costs:
+                    # Accumulate time costs from all chunks
+                    for key in ["phase1_time", "phase2_time", "total_time"]:
+                        if key in lm_chunk_time_costs:
+                            lm_total_time_costs[key] += lm_chunk_time_costs[key]
+                    time_str = ", ".join([f"{k}: {v:.2f}s" for k, v in lm_chunk_time_costs.items()])
+                    lm_status.append(f"✅ LM chunk {chunk_idx+1}: {time_str}")
+            lm_generated_metadata = all_metadata_list[0] if all_metadata_list else None
+            lm_generated_audio_codes_list = all_audio_codes_list
+            # Set audio_code_string_to_use based on infer_type
+            if infer_type == "llm_dit":
+                # If batch mode, use list; otherwise use single string
+                if actual_batch_size > 1:
+                    audio_code_string_to_use = all_audio_codes_list
+                else:
+                    audio_code_string_to_use = all_audio_codes_list[0] if all_audio_codes_list else ""
+            else:
+                # For "dit" mode, keep user-provided codes or empty
+                audio_code_string_to_use = params.audio_codes
+            # Update metadata from LM if not provided by user
+            if lm_generated_metadata:
+                bpm, key_scale, time_signature, audio_duration, vocal_language, caption, lyrics = _update_metadata_from_lm(
+                    metadata=lm_generated_metadata,
+                    bpm=bpm,
+                    key_scale=key_scale,
+                    time_signature=time_signature,
+                    audio_duration=audio_duration,
+                    vocal_language=dit_input_vocal_language,
+                    caption=dit_input_caption,
+                    lyrics=dit_input_lyrics)
+                if not params.bpm:
+                    params.cot_bpm = bpm
+                if not params.keyscale:
+                    params.cot_keyscale = key_scale
+                if not params.timesignature:
+                    params.cot_timesignature = time_signature
+                if not params.duration:
+                    params.cot_duration = audio_duration
+                if not params.vocal_language:
+                    params.cot_vocal_language = vocal_language
+                if not params.caption:
+                    params.cot_caption = caption
+                if not params.lyrics:
+                    params.cot_lyrics = lyrics
+            # set cot caption and language if needed
+            if params.use_cot_caption:
+                dit_input_caption = lm_generated_metadata.get("caption", dit_input_caption)
+            if params.use_cot_language:
+                dit_input_vocal_language = lm_generated_metadata.get("vocal_language", dit_input_vocal_language)
+        # Phase 2: DiT music generation
+        # Use seed_for_generation (from config.seed or params.seed) instead of params.seed for actual generation
+        result = dit_handler.generate_music(
+            captions=dit_input_caption,
+            lyrics=dit_input_lyrics,
+            bpm=bpm,
+            key_scale=key_scale,
+            time_signature=time_signature,
+            vocal_language=dit_input_vocal_language,
+            inference_steps=params.inference_steps,
+            guidance_scale=params.guidance_scale,
+            use_random_seed=config.use_random_seed,
+            seed=seed_for_generation,  # Use config.seed (or params.seed fallback) instead of params.seed directly
+            reference_audio=params.reference_audio,
+            audio_duration=audio_duration,
+            batch_size=config.batch_size if config.batch_size is not None else 1,
+            src_audio=params.src_audio,
+            audio_code_string=audio_code_string_to_use,
+            repainting_start=params.repainting_start,
+            repainting_end=params.repainting_end,
+            instruction=params.instruction,
+            audio_cover_strength=params.audio_cover_strength,
+            task_type=params.task_type,
+            use_adg=params.use_adg,
+            cfg_interval_start=params.cfg_interval_start,
+            cfg_interval_end=params.cfg_interval_end,
+            shift=params.shift,
+            infer_method=params.infer_method,
+            timesteps=params.timesteps,
+            progress=progress,
+        )
+        # Check if generation failed
+        if not result.get("success", False):
+            return GenerationResult(
+                audios=[],
+                status_message=result.get("status_message", ""),
+                extra_outputs={},
+                success=False,
+                error=result.get("error"),
+            )
+        # Extract results from dit_handler.generate_music dict
+        dit_audios = result.get("audios", [])
+        status_message = result.get("status_message", "")
+        dit_extra_outputs = result.get("extra_outputs", {})
+        # Use the seed list already prepared above (from config.seed or params.seed fallback)
+        # actual_seed_list was computed earlier using dit_handler.prepare_seeds
+        seed_list = actual_seed_list
+        # Get base params dictionary
+        base_params_dict = params.to_dict()
+        # Save audio files using AudioSaver (format from config)
+        audio_format = config.audio_format if config.audio_format else "flac"
+        audio_saver = AudioSaver(default_format=audio_format)
+        # Use handler's temp_dir for saving files
+        if save_dir is not None:
+            os.makedirs(save_dir, exist_ok=True)
+        # Build audios list for GenerationResult with params and save files
+        # Audio saving and UUID generation handled here, outside of handler
+        audios = []
+        for idx, dit_audio in enumerate(dit_audios):
+            # Create a copy of params dict for this audio
+            audio_params = base_params_dict.copy()
+            # Update audio-specific values
+            audio_params["seed"] = seed_list[idx] if idx < len(seed_list) else None
+            # Add audio codes if batch mode
+            if lm_generated_audio_codes_list and idx < len(lm_generated_audio_codes_list):
+                audio_params["audio_codes"] = lm_generated_audio_codes_list[idx]
+            # Get audio tensor and metadata
+            audio_tensor = dit_audio.get("tensor")
+            sample_rate = dit_audio.get("sample_rate", 48000)
+            # Generate UUID for this audio (moved from handler)
+            batch_seed = seed_list[idx] if idx < len(seed_list) else seed_list[0] if seed_list else -1
+            audio_code_str = lm_generated_audio_codes_list[idx] if (
+                lm_generated_audio_codes_list and idx < len(lm_generated_audio_codes_list)) else audio_code_string_to_use
+            if isinstance(audio_code_str, list):
+                audio_code_str = audio_code_str[idx] if idx < len(audio_code_str) else ""
+            audio_key = generate_uuid_from_params(audio_params)
+            # Save audio file (handled outside handler)
+            audio_path = None
+            if audio_tensor is not None and save_dir is not None:
+                try:
+                    audio_file = os.path.join(save_dir, f"{audio_key}.{audio_format}")
+                    audio_path = audio_saver.save_audio(audio_tensor,
+                                                        audio_file,
+                                                        sample_rate=sample_rate,
+                                                        format=audio_format,
+                                                        channels_first=True)
+                except Exception as e:
+                    logger.error(f"[generate_music] Failed to save audio file: {e}")
+                    audio_path = ""  # Fallback to empty path
+            audio_dict = {
+                "path": audio_path or "",  # File path (saved here, not in handler)
+                "tensor": audio_tensor,  # Audio tensor [channels, samples], CPU, float32
+                "key": audio_key,
+                "sample_rate": sample_rate,
+                "params": audio_params,
+            }
+            audios.append(audio_dict)
+        # Merge extra_outputs: include dit_extra_outputs (latents, masks) and add LM metadata
+        extra_outputs = dit_extra_outputs.copy()
+        extra_outputs["lm_metadata"] = lm_generated_metadata
+        # Merge time_costs from both LM and DiT into a unified dictionary
+        unified_time_costs = {}
+        # Add LM time costs (if LM was used)
+        if use_lm and lm_total_time_costs:
+            for key, value in lm_total_time_costs.items():
+                unified_time_costs[f"lm_{key}"] = value
+        # Add DiT time costs (if available)
+        dit_time_costs = dit_extra_outputs.get("time_costs", {})
+        if dit_time_costs:
+            for key, value in dit_time_costs.items():
+                unified_time_costs[f"dit_{key}"] = value
+        # Calculate total pipeline time
+        if unified_time_costs:
+            lm_total = unified_time_costs.get("lm_total_time", 0.0)
+            dit_total = unified_time_costs.get("dit_total_time_cost", 0.0)
+            unified_time_costs["pipeline_total_time"] = lm_total + dit_total
+        # Update extra_outputs with unified time_costs
+        extra_outputs["time_costs"] = unified_time_costs
+        if lm_status:
+            status_message = "\n".join(lm_status) + "\n" + status_message
+        else:
+            status_message = status_message
+        # Create and return GenerationResult
+        return GenerationResult(
+            audios=audios,
+            status_message=status_message,
+            extra_outputs=extra_outputs,
+            success=True,
+            error=None,
+        )
+    except Exception as e:
+        logger.exception("Music generation failed")
+        return GenerationResult(
+            audios=[],
+            status_message=f"Error: {str(e)}",
+            extra_outputs={},
+            success=False,
+            error=str(e),
+        )
def _parse_lm_bpm(value: Any) -> Optional[int]:
    """Convert an LM-reported BPM to ``int``.

    Returns ``None`` when the value is absent (``None``, ``""``, ``"N/A"``)
    or cannot be interpreted as a finite number.
    """
    if value is None or value == 'N/A' or value == '':
        return None
    try:
        return int(value)
    except (ValueError, TypeError, OverflowError):
        pass
    try:
        # int() rejects decimal strings such as "120.0"; go through float so
        # a usable tempo is not silently discarded.
        return int(float(value))
    except (ValueError, TypeError, OverflowError):
        return None


def _parse_lm_duration(value: Any) -> Optional[float]:
    """Convert an LM-reported duration to ``float``; ``None`` when absent or unparsable."""
    if value is None or value == 'N/A' or value == '':
        return None
    try:
        return float(value)
    except (ValueError, TypeError):
        return None


def _clean_lm_text(value: Any) -> Any:
    """Map the "no value" markers ``None`` and ``"N/A"`` to an empty string."""
    if value is None or value == 'N/A':
        return ''
    return value


def _normalize_lm_metadata(metadata: Dict[str, Any], default_lyrics: str = '') -> Dict[str, Any]:
    """Turn a raw LM metadata dict into keyword arguments for a result dataclass.

    Args:
        metadata: Dict returned by the LLM handler.
        default_lyrics: Lyrics to use when the LM returned none.

    Returns:
        Dict with the keys ``caption``, ``lyrics``, ``bpm``, ``duration``,
        ``keyscale``, ``language`` and ``timesignature``.
    """
    caption = metadata.get('caption')
    lyrics = metadata.get('lyrics')
    # Either key may carry the language; prefer "language" when it has a value.
    language = metadata.get('language')
    if language is None:
        language = metadata.get('vocal_language')
    return {
        'caption': '' if caption is None else caption,
        'lyrics': default_lyrics if lyrics is None else lyrics,
        'bpm': _parse_lm_bpm(metadata.get('bpm')),
        'duration': _parse_lm_duration(metadata.get('duration')),
        'keyscale': _clean_lm_text(metadata.get('keyscale')),
        'language': _clean_lm_text(language),
        'timesignature': _clean_lm_text(metadata.get('timesignature')),
    }


def understand_music(
    llm_handler,
    audio_codes: str,
    temperature: float = 0.85,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    repetition_penalty: float = 1.0,
    use_constrained_decoding: bool = True,
    constrained_decoding_debug: bool = False,
) -> UnderstandResult:
    """Understand music from audio codes using the 5Hz Language Model.

    Analyzes audio semantic codes and produces metadata about the music:
    caption, lyrics, BPM, duration, key scale, language and time signature.

    If ``audio_codes`` is empty or whitespace, ``"NO USER INPUT"`` is sent to
    the LM instead, which makes it generate a sample example rather than
    analyze existing codes.

    Note: cfg_scale and negative_prompt are not supported in understand mode.

    Args:
        llm_handler: Initialized LLM handler (LLMHandler instance)
        audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
                     Use empty string or "NO USER INPUT" to generate a sample example.
        temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
        top_k: Top-K sampling (None or 0 = disabled)
        top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
        repetition_penalty: Repetition penalty (1.0 = no penalty)
        use_constrained_decoding: Whether to use FSM-based constrained decoding for metadata
        constrained_decoding_debug: Whether to enable debug logging for constrained decoding

    Returns:
        UnderstandResult with parsed metadata fields and status. Failures are
        reported through ``success=False`` / ``error``; no exception escapes.

    Example:
        >>> result = understand_music(llm_handler, audio_codes="<|audio_code_123|>...")
        >>> if result.success:
        ...     print(f"Caption: {result.caption}")
        ...     print(f"BPM: {result.bpm}")
        ...     print(f"Lyrics: {result.lyrics}")
    """
    if not llm_handler.llm_initialized:
        return UnderstandResult(
            status_message="5Hz LM not initialized. Please initialize it first.",
            success=False,
            error="LLM not initialized",
        )

    # Empty codes switch the LM into "generate a sample example" mode.
    if not audio_codes or not audio_codes.strip():
        audio_codes = "NO USER INPUT"

    try:
        metadata, status = llm_handler.understand_audio_from_codes(
            audio_codes=audio_codes,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            use_constrained_decoding=use_constrained_decoding,
            constrained_decoding_debug=constrained_decoding_debug,
        )

        # Empty metadata is how the handler signals failure.
        if not metadata:
            return UnderstandResult(
                status_message=status or "Failed to understand audio codes",
                success=False,
                error=status or "Empty metadata returned",
            )

        return UnderstandResult(
            status_message=status,
            success=True,
            error=None,
            **_normalize_lm_metadata(metadata),
        )
    except Exception as e:
        logger.exception("Music understanding failed")
        return UnderstandResult(
            status_message=f"Error: {str(e)}",
            success=False,
            error=str(e),
        )


@dataclass
class CreateSampleResult:
    """Result of creating a music sample from a natural language query.

    Used by the "Simple Mode" / "Inspiration Mode" feature where users provide
    a natural language description and the LLM generates a complete sample
    with caption, lyrics, and metadata.

    Attributes:
        caption: Generated detailed music description/caption
        lyrics: Generated lyrics (or "[Instrumental]" for instrumental music)
        bpm: Beats per minute (None if not generated)
        duration: Duration in seconds (None if not generated)
        keyscale: Musical key (e.g., "C Major")
        language: Vocal language code (e.g., "en", "zh")
        timesignature: Time signature (e.g., "4")
        instrumental: Whether this is an instrumental piece
        status_message: Status message from sample creation
        success: Whether sample creation completed successfully
        error: Error message if sample creation failed
    """

    # Metadata Fields
    caption: str = ""
    lyrics: str = ""
    bpm: Optional[int] = None
    duration: Optional[float] = None
    keyscale: str = ""
    language: str = ""
    timesignature: str = ""
    instrumental: bool = False

    # Status
    status_message: str = ""
    success: bool = True
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert result to dictionary for JSON serialization."""
        return asdict(self)


def create_sample(
    llm_handler,
    query: str,
    instrumental: bool = False,
    vocal_language: Optional[str] = None,
    temperature: float = 0.85,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    repetition_penalty: float = 1.0,
    use_constrained_decoding: bool = True,
    constrained_decoding_debug: bool = False,
) -> CreateSampleResult:
    """Create a music sample from a natural language query using the 5Hz Language Model.

    This is the "Simple Mode" / "Inspiration Mode" feature that takes a user's
    natural language description of music and generates a complete sample including:
    - Detailed caption/description
    - Lyrics (unless instrumental)
    - Metadata (BPM, duration, key, language, time signature)

    Note: cfg_scale and negative_prompt are not supported in create_sample mode.

    Args:
        llm_handler: Initialized LLM handler (LLMHandler instance)
        query: User's natural language music description (e.g., "a soft Bengali love song")
        instrumental: Whether to generate instrumental music (no vocals). Also used
                      as the result's ``instrumental`` flag when the LM does not report one.
        vocal_language: Allowed vocal language for constrained decoding (e.g., "en", "zh").
                       If provided, the model will be constrained to generate lyrics in this language.
                       If None or "unknown", no language constraint is applied.
        temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
        top_k: Top-K sampling (None or 0 = disabled)
        top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
        repetition_penalty: Repetition penalty (1.0 = no penalty)
        use_constrained_decoding: Whether to use FSM-based constrained decoding
        constrained_decoding_debug: Whether to enable debug logging

    Returns:
        CreateSampleResult with generated sample fields and status. Failures are
        reported through ``success=False`` / ``error``; no exception escapes.

    Example:
        >>> result = create_sample(llm_handler, "a soft Bengali love song for a quiet evening", vocal_language="bn")
        >>> if result.success:
        ...     print(f"Caption: {result.caption}")
        ...     print(f"Lyrics: {result.lyrics}")
        ...     print(f"BPM: {result.bpm}")
    """
    if not llm_handler.llm_initialized:
        return CreateSampleResult(
            status_message="5Hz LM not initialized. Please initialize it first.",
            success=False,
            error="LLM not initialized",
        )

    try:
        metadata, status = llm_handler.create_sample_from_query(
            query=query,
            instrumental=instrumental,
            vocal_language=vocal_language,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            use_constrained_decoding=use_constrained_decoding,
            constrained_decoding_debug=constrained_decoding_debug,
        )

        # Empty metadata is how the handler signals failure.
        if not metadata:
            return CreateSampleResult(
                status_message=status or "Failed to create sample",
                success=False,
                error=status or "Empty metadata returned",
            )

        return CreateSampleResult(
            instrumental=metadata.get('instrumental', instrumental),
            status_message=status,
            success=True,
            error=None,
            **_normalize_lm_metadata(metadata),
        )
    except Exception as e:
        logger.exception("Sample creation failed")
        return CreateSampleResult(
            status_message=f"Error: {str(e)}",
            success=False,
            error=str(e),
        )


@dataclass
class FormatSampleResult:
    """Result of formatting user-provided caption and lyrics.

    Used by the "Format" feature where users provide caption and lyrics, and
    the LLM formats them into structured music metadata and an enhanced
    description.

    Attributes:
        caption: Enhanced/formatted music description/caption
        lyrics: Formatted lyrics (may be same as input or reformatted)
        bpm: Beats per minute (None if not detected)
        duration: Duration in seconds (None if not detected)
        keyscale: Musical key (e.g., "C Major")
        language: Vocal language code (e.g., "en", "zh")
        timesignature: Time signature (e.g., "4")
        status_message: Status message from formatting
        success: Whether formatting completed successfully
        error: Error message if formatting failed
    """

    # Metadata Fields
    caption: str = ""
    lyrics: str = ""
    bpm: Optional[int] = None
    duration: Optional[float] = None
    keyscale: str = ""
    language: str = ""
    timesignature: str = ""

    # Status
    status_message: str = ""
    success: bool = True
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert result to dictionary for JSON serialization."""
        return asdict(self)


def format_sample(
    llm_handler,
    caption: str,
    lyrics: str,
    user_metadata: Optional[Dict[str, Any]] = None,
    temperature: float = 0.85,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    repetition_penalty: float = 1.0,
    use_constrained_decoding: bool = True,
    constrained_decoding_debug: bool = False,
) -> FormatSampleResult:
    """Format user-provided caption and lyrics using the 5Hz Language Model.

    Takes user input (caption and lyrics) and generates structured music
    metadata including an enhanced caption, BPM, duration, key, language,
    and time signature.

    If user_metadata is provided, it is forwarded to the LLM handler to
    constrain the decoding.

    Note: cfg_scale and negative_prompt are not supported in format mode.

    Args:
        llm_handler: Initialized LLM handler (LLMHandler instance)
        caption: User's caption/description (e.g., "Latin pop, reggaeton")
        lyrics: User's lyrics with structure tags. Returned unchanged when the
                LM output contains no lyrics.
        user_metadata: Optional dict with user-provided metadata to constrain decoding.
                      Supported keys: bpm, duration, keyscale, timesignature, language
        temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
        top_k: Top-K sampling (None or 0 = disabled)
        top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
        repetition_penalty: Repetition penalty (1.0 = no penalty)
        use_constrained_decoding: Whether to use FSM-based constrained decoding for metadata
        constrained_decoding_debug: Whether to enable debug logging for constrained decoding

    Returns:
        FormatSampleResult with formatted metadata fields and status. Failures are
        reported through ``success=False`` / ``error``; no exception escapes.

    Example:
        >>> result = format_sample(llm_handler, "Latin pop, reggaeton", "[Verse 1]\\nHola mundo...")
        >>> if result.success:
        ...     print(f"Caption: {result.caption}")
        ...     print(f"BPM: {result.bpm}")
        ...     print(f"Lyrics: {result.lyrics}")
    """
    if not llm_handler.llm_initialized:
        return FormatSampleResult(
            status_message="5Hz LM not initialized. Please initialize it first.",
            success=False,
            error="LLM not initialized",
        )

    try:
        metadata, status = llm_handler.format_sample_from_input(
            caption=caption,
            lyrics=lyrics,
            user_metadata=user_metadata,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            use_constrained_decoding=use_constrained_decoding,
            constrained_decoding_debug=constrained_decoding_debug,
        )

        # Empty metadata is how the handler signals failure.
        if not metadata:
            return FormatSampleResult(
                status_message=status or "Failed to format input",
                success=False,
                error=status or "Empty metadata returned",
            )

        # The user's own lyrics are the fallback when the LM returns none.
        return FormatSampleResult(
            status_message=status,
            success=True,
            error=None,
            **_normalize_lm_metadata(metadata, default_lyrics=lyrics),
        )
    except Exception as e:
        logger.exception("Format sample failed")
        return FormatSampleResult(
            status_message=f"Error: {str(e)}",
            success=False,
            error=str(e),
        )

code/acestep/llm_inference.py ADDED Viewed

The diff for this file is too large to render. See raw diff

code/acestep/local_cache.py ADDED Viewed

	@@ -0,0 +1,129 @@

+"""Local cache module to replace Redis
+Uses diskcache as backend, provides Redis-compatible API.
+Supports persistent storage and TTL expiration.
+"""
+import json
+import os
+from typing import Any, Optional
+from threading import Lock
+try:
+    from diskcache import Cache
+    HAS_DISKCACHE = True
+except ImportError:
+    HAS_DISKCACHE = False
class LocalCache:
    """Process-wide local cache exposing a small Redis-like API.

    Backed by ``diskcache.Cache``. The class is a singleton: the first
    construction decides the cache directory, and later constructions return
    the same instance and ignore their ``cache_dir`` argument.
    """

    _instance = None
    _lock = Lock()

    def __new__(cls, cache_dir: Optional[str] = None):
        """Return the shared instance, creating it on first use."""
        if cls._instance is None:
            with cls._lock:
                # Re-check under the lock so only one thread creates the instance.
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    cls._instance._initialized = False
        return cls._instance

    def __init__(self, cache_dir: Optional[str] = None):
        """Open the backing cache once.

        Args:
            cache_dir: Directory for the cache files. Defaults to
                ``<parent of this package>/.cache/local_redis``.

        Raises:
            ImportError: If ``diskcache`` is not installed.
        """
        # __init__ runs on every LocalCache(...) call; only the first does work.
        if getattr(self, '_initialized', False):
            return

        if not HAS_DISKCACHE:
            raise ImportError(
                "diskcache not installed. Run: pip install diskcache"
            )

        if cache_dir is None:
            cache_dir = os.path.join(
                os.path.dirname(os.path.dirname(__file__)),
                ".cache",
                "local_redis"
            )

        os.makedirs(cache_dir, exist_ok=True)
        self._cache = Cache(cache_dir)
        self._initialized = True

    def set(self, name: str, value: Any, ex: Optional[int] = None) -> bool:
        """Store a value under ``name``.

        Args:
            name: Key name
            value: Value; dicts and lists are stored as their JSON text
            ex: Expiration time in seconds (None = no expiry)

        Returns:
            bool: Always True
        """
        if isinstance(value, (dict, list)):
            value = json.dumps(value, ensure_ascii=False)
        self._cache.set(name, value, expire=ex)
        return True

    def get(self, name: str) -> Optional[str]:
        """Return the stored value, or None when the key is missing."""
        return self._cache.get(name)

    def delete(self, name: str) -> int:
        """Delete a key and return the number of deleted items (0 or 1)."""
        return 1 if self._cache.delete(name) else 0

    def exists(self, name: str) -> bool:
        """Return whether the key is present in the cache."""
        return name in self._cache

    def keys(self, pattern: str = "*") -> list:
        """Return the keys matching a glob-style ``pattern``.

        ``"*"`` returns every key. Any other pattern is matched
        case-sensitively with shell-style wildcards (``*``, ``?``, ``[...]``),
        so ``"job:*"`` is a prefix match and a pattern without wildcards
        matches only the identical key. Non-string keys never match a
        non-``"*"`` pattern.
        """
        import fnmatch

        if pattern == "*":
            return list(self._cache.iterkeys())
        return [
            key for key in self._cache.iterkeys()
            if isinstance(key, str) and fnmatch.fnmatchcase(key, pattern)
        ]

    def expire(self, name: str, seconds: int) -> bool:
        """Set the key's expiration to ``seconds`` from now.

        Returns:
            bool: True if the key exists and was updated, False otherwise
        """
        # touch() only updates the expiry, leaving the stored value alone.
        return bool(self._cache.touch(name, expire=seconds))

    def ttl(self, name: str) -> int:
        """Return the remaining time to live in whole seconds.

        Returns:
            int: -2 if the key does not exist, -1 if it exists without an
            expiry, otherwise the remaining seconds (never negative).
        """
        import time

        # A private sentinel distinguishes "missing" from a stored None.
        missing = object()
        value, expire_at = self._cache.get(name, default=missing, expire_time=True)
        if value is missing:
            return -2
        if expire_at is None:
            return -1
        return max(0, int(expire_at - time.time()))

    def close(self):
        """Close the backing cache if it was opened."""
        if hasattr(self, '_cache'):
            self._cache.close()
# Module-level handle, created on the first get_local_cache() call
_local_cache: Optional[LocalCache] = None


def get_local_cache(cache_dir: Optional[str] = None) -> LocalCache:
    """Return the module's LocalCache, creating it on first use.

    ``cache_dir`` is only passed on when the cache is first created; later
    calls return the existing instance regardless of the argument.
    """
    global _local_cache
    if _local_cache is not None:
        return _local_cache
    _local_cache = LocalCache(cache_dir)
    return _local_cache

code/acestep/test_time_scaling.py ADDED Viewed

	@@ -0,0 +1,410 @@

+"""
+Test-Time Scaling Module
+Implements perplexity-based scoring for generated audio codes
+"""
+import torch
+import torch.nn.functional as F
+from typing import Tuple, Optional, Dict, Any, List
+from loguru import logger
+import yaml
+import math
+import re
def pmi_score(log_prob_conditional: float, log_prob_unconditional: float) -> float:
    """
    Compute the Pointwise Mutual Information (PMI) between condition and codes.

    PMI = log P(condition|codes) - log P(condition)
        = log [P(codes|condition) / P(codes)]

    Subtracting the unconditional term removes the bias from P(condition), so
    the score measures how much the codes improve prediction of the condition.

    Args:
        log_prob_conditional: Average log probability of condition given codes
        log_prob_unconditional: Average log probability of condition without codes

    Returns:
        PMI score (higher is better, can be positive or negative)
        - Positive: codes improve prediction → good match
        - Zero: codes don't help → no correlation
        - Negative: codes hurt prediction → poor match
    """
    information_gain = log_prob_conditional - log_prob_unconditional
    return information_gain
def pmi_to_normalized_score(pmi: float, scale: float = 0.1) -> float:
    """
    Convert PMI score to normalized [0, 1] range using sigmoid function.

    score = sigmoid(PMI / scale) = 1 / (1 + exp(-PMI / scale))

    The sigmoid is evaluated in a numerically stable form, so arbitrarily
    large positive or negative PMI values saturate to 1.0 / 0.0 instead of
    raising ``OverflowError``.

    Args:
        pmi: PMI score (can be positive or negative)
        scale: Scale parameter to control sensitivity (default 0.1)
               - Smaller scale: more sensitive to PMI changes
               - Larger scale: less sensitive to PMI changes

    Returns:
        Normalized score in [0, 1] range, where:
        - PMI > 0 → score > 0.5 (good match)
        - PMI = 0 → score = 0.5 (neutral)
        - PMI < 0 → score < 0.5 (poor match)

    Raises:
        ZeroDivisionError: If ``scale`` is 0.

    Examples (scale=1.0):
        PMI=2.0  → score≈0.88  (excellent)
        PMI=1.0  → score≈0.73  (good)
        PMI=0.0  → score=0.50  (neutral)
        PMI=-1.0 → score≈0.27  (poor)
        PMI=-2.0 → score≈0.12  (bad)
    """
    z = pmi / scale
    if z >= 0:
        # exp(-z) <= 1 here, so it cannot overflow.
        return 1.0 / (1.0 + math.exp(-z))
    # For negative z use the equivalent form exp(z) / (1 + exp(z)); exp(z)
    # underflows harmlessly to 0.0 where exp(-z) would overflow.
    exp_z = math.exp(z)
    return exp_z / (1.0 + exp_z)
def _get_logits_and_target_for_scoring(llm_handler, formatted_prompt: str,
                                       target_text: str) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Run one teacher-forced forward pass and return the logits that predict
    each target token, together with the target token IDs.

    Args:
        llm_handler: The handler containing the model and tokenizer.
        formatted_prompt: The input context.
        target_text: The text we want to calculate probability/recall for.

    Returns:
        Tuple of (target_logits, target_ids)
        - target_logits: Logits used to predict the target tokens,
          ``[target_len, vocab_size]``.
        - target_ids: The ground truth token IDs of the target, ``[target_len]``.
        Both are empty tensors when no target token can be scored (empty or
        fully truncated target). The two always have the same length.
    """
    model = llm_handler.get_hf_model_for_scoring()
    tokenizer = llm_handler.llm_tokenizer
    device = llm_handler.device if llm_handler.llm_backend == "pt" else next(model.parameters()).device

    # 1. Tokenize the prompt alone, only to learn where the target starts.
    #    Special tokens are included so the offset matches the full encoding.
    prompt_tokens = tokenizer(formatted_prompt, return_tensors="pt", add_special_tokens=True)
    prompt_len = prompt_tokens['input_ids'].shape[1]
    # The token at position 0 has no preceding position to predict it, so it
    # can never be scored; start at 1 when the prompt encodes to nothing.
    target_start = max(prompt_len, 1)

    # 2. Tokenize prompt + target as one string so that subword merging at
    #    the boundary is handled by the tokenizer.
    full_tokens = tokenizer(
        formatted_prompt + target_text,
        return_tensors="pt",
        padding=False,
        truncation=True,
        add_special_tokens=True,
    ).to(device)
    input_ids = full_tokens['input_ids']

    # Nothing to score: the target was empty or truncated away entirely.
    if input_ids.shape[1] <= target_start:
        return torch.empty(0, device=device), torch.empty(0, dtype=torch.long, device=device)

    # 3. Forward pass (teacher forcing)
    with torch.no_grad():
        with llm_handler._load_model_context():
            outputs = model(input_ids=input_ids, attention_mask=full_tokens['attention_mask'])
            all_logits = outputs.logits  # [1, seq_len, vocab_size]

    # 4. The logit that predicts input_ids[i] sits at position i - 1, so the
    #    target tokens [target_start:] pair with logits [target_start - 1:-1].
    target_logits = all_logits[0, target_start - 1:-1, :]  # [target_len, vocab_size]
    target_ids = input_ids[0, target_start:]  # [target_len]
    return target_logits, target_ids
+# ==============================================================================
+# Scoring Logic
+# ==============================================================================
+def _calculate_topk_recall(llm_handler,
+                           formatted_prompt: str,
+                           target_text: str,
+                           topk: int = 10) -> Tuple[float, Dict[int, float]]:
+    """
+    Calculate top-k recall for target text given prompt.
+    Checks if the ground truth token is within the top-k probabilities at each step.
+    """
+    # Use the fixed helper to get aligned logits/labels
+    pred_logits, target_ids = _get_logits_and_target_for_scoring(llm_handler, formatted_prompt, target_text)
+    if target_ids.shape[0] == 0:
+        return 0.0, {}
+    target_len = target_ids.shape[0]
+    # Get top-k indices for all positions at once
+    # topk_indices: [target_len, topk]
+    _, topk_indices = torch.topk(pred_logits, k=min(topk, pred_logits.shape[-1]), dim=-1)
+    recall_per_k = {}
+    position_scores = []
+    # Convert to list for faster CPU iteration
+    target_ids_list = target_ids.tolist()
+    topk_indices_list = topk_indices.tolist()
+    for k in range(1, topk + 1):
+        hits = 0
+        for pos in range(target_len):
+            gt_token = target_ids_list[pos]
+            # Check the top-k slice
+            topk_at_pos = topk_indices_list[pos][:k]
+            if gt_token in topk_at_pos:
+                hits += 1
+                # Calculate position-weighted score only once (when k=topk)
+                if k == topk:
+                    rank = topk_at_pos.index(gt_token) + 1
+                    # Rank 1 = 1.0, Rank k = small positive
+                    position_weight = 1.0 - (rank - 1) / topk
+                    position_scores.append(position_weight)
+        recall_per_k[k] = hits / target_len if target_len > 0 else 0.0
+    # Fill scores for positions where GT was NOT in top-k
+    while len(position_scores) < target_len:
+        position_scores.append(0.0)
+    average_recall = sum(position_scores) / len(position_scores) if position_scores else 0.0
+    return average_recall, recall_per_k
+def _calculate_metadata_recall(llm_handler,
+                               formatted_prompt: str,
+                               fields_dict: Dict[str, Any],
+                               topk: int = 10) -> Dict[str, float]:
+    """
+    Args:
+        fields_dict: Dictionary of {field_name: field_value}
+    """
+    if not fields_dict:
+        return {}
+    field_scores = {}
+    for field_name in sorted(fields_dict.keys()):
+        # Construct target text for this specific field
+        # e.g. <think>\nbpm: 120\n</think>\n
+        field_yaml = yaml.dump({field_name: fields_dict[field_name]}, allow_unicode=True, sort_keys=True).strip()
+        field_target_text = f"<think>\n{field_yaml}\n</think>\n"
+        # Calculate recall using the robust logic
+        avg_score, _ = _calculate_topk_recall(llm_handler, formatted_prompt, field_target_text, topk=topk)
+        field_scores[field_name] = avg_score
+        logger.debug(f"Recall for {field_name}: {avg_score:.4f}")
+    return field_scores
+def _calculate_log_prob(
+        llm_handler,
+        formatted_prompt: str,
+        target_text: str,
+        temperature: float = 1.0  # Kept for API compatibility, but ignored for scoring
+) -> float:
+    """
+    Calculate average log probability of target text given prompt.
+    """
+    pred_logits, target_ids = _get_logits_and_target_for_scoring(llm_handler, formatted_prompt, target_text)
+    if target_ids.shape[0] == 0:
+        return float('-inf')
+    # FIX: Do not divide by temperature.
+    # Log-probability for PMI/Perplexity should be exact.
+    # Calculate log probabilities (log_softmax)
+    log_probs = F.log_softmax(pred_logits, dim=-1)  # [target_len, vocab_size]
+    # Gather log probabilities of the ground truth tokens
+    target_log_probs = log_probs[torch.arange(target_ids.shape[0]), target_ids]
+    # Return average log probability
+    mean_log_prob = target_log_probs.mean().item()
+    return mean_log_prob
+def calculate_reward_score(
+    scores: Dict[str, float],
+    weights_config: Optional[Dict[str, float]] = None
+) -> Tuple[float, str]:
+    """
+    Reward Model Calculator: Computes a final reward based on user priorities.
+    Priority Logic:
+        1. Caption (Highest): The overall vibe/style must match.
+        2. Lyrics (Medium): Content accuracy is important but secondary to vibe.
+        3. Metadata (Lowest): Technical constraints (BPM, Key) allow for slight deviations.
+    Strategy: Dynamic Weighted Sum
+    - Metadata fields are aggregated into a single 'metadata' score first.
+    - Weights are dynamically renormalized if any component (e.g., lyrics) is missing.
+    Args:
+        scores: Dictionary of raw scores (0.0 - 1.0) from the evaluation module.
+        weights_config: Optional custom weights. Defaults to:
+                        Caption (50%), Lyrics (30%), Metadata (20%).
+    Returns:
+        final_reward: The calculated reward score (0.0 - 1.0).
+        explanation: A formatted string explaining how the score was derived.
+    """
+    # 1. Default Preference Configuration
+    # These weights determine the relative importance of each component.
+    if weights_config is None:
+        weights_config = {
+            'caption': 0.50,  # High priority: Style/Vibe
+            'lyrics':  0.30,  # Medium priority: Content
+            'metadata': 0.20  # Low priority: Technical details
+        }
+    # 2. Extract and Group Scores
+    # Caption and Lyrics are standalone high-level features.
+    caption_score = scores.get('caption')
+    lyrics_score = scores.get('lyrics')
+    # Metadata fields (bpm, key, duration, etc.) are aggregated.
+    # We treat them as a single "Technical Score" to prevent them from
+    # diluting the weight of Caption/Lyrics simply by having many fields.
+    meta_scores_list = [
+        val for key, val in scores.items()
+        if key not in ['caption', 'lyrics']
+    ]
+    # Calculate average of all metadata fields (if any exist)
+    meta_aggregate_score = None
+    if meta_scores_list:
+        meta_aggregate_score = sum(meta_scores_list) / len(meta_scores_list)
+    # 3. specific Active Components & Dynamic Weighting
+    # We only include components that actually exist in this generation.
+    active_components = {}
+    if caption_score is not None:
+        active_components['caption'] = (caption_score, weights_config['caption'])
+    if lyrics_score is not None:
+        active_components['lyrics'] = (lyrics_score, weights_config['lyrics'])
+    if meta_aggregate_score is not None:
+        active_components['metadata'] = (meta_aggregate_score, weights_config['metadata'])
+    # 4. Calculate Final Weighted Score
+    total_base_weight = sum(w for _, w in active_components.values())
+    total_score = 0.0
+    breakdown_lines = []
+    if total_base_weight == 0:
+        return 0.0, "❌ No valid scores available to calculate reward."
+    # Sort by weight (importance) for display
+    sorted_components = sorted(active_components.items(), key=lambda x: x[1][1], reverse=True)
+    for name, (score, base_weight) in sorted_components:
+        # Renormalize weight: If lyrics are missing, caption/metadata weights scale up proportionately.
+        normalized_weight = base_weight / total_base_weight
+        weighted_contribution = score * normalized_weight
+        total_score += weighted_contribution
+        breakdown_lines.append(
+            f"  • {name.title():<8} | Score: {score:.4f} | Weight: {normalized_weight:.2f} "
+            f"-> Contrib: +{weighted_contribution:.4f}"
+        )
+    return total_score, "\n".join(breakdown_lines)
+# ==============================================================================
+# Main Public API
+# ==============================================================================
+def calculate_pmi_score_per_condition(
+    llm_handler,
+    audio_codes: str,
+    caption: str = "",
+    lyrics: str = "",
+    metadata: Optional[Dict[str, Any]] = None,
+    temperature: float = 1.0,
+    topk: int = 10,
+    score_scale: float = 0.1,
+) -> Tuple[Dict[str, float], float, str]:
+    """
+    Calculate quality score separately for each condition.
+    - Metadata: Uses Top-k Recall.
+    - Caption/Lyrics: Uses PMI (Normalized).
+    """
+    if not llm_handler.llm_initialized:
+        return {}, 0.0, "❌ LLM not initialized"
+    if not audio_codes or not audio_codes.strip():
+        return {}, 0.0, "❌ No audio codes provided"
+    if "caption" not in metadata:
+        metadata['caption'] = caption
+    formatted_prompt = llm_handler.build_formatted_prompt_for_understanding(audio_codes=audio_codes, is_negative_prompt=False)
+    prompt_uncond = llm_handler.build_formatted_prompt_for_understanding(audio_codes="NO USER INPUT", is_negative_prompt=False)
+    try:
+        # 1. Calculate Recall for Metadata Fields
+        if metadata and isinstance(metadata, dict):
+            scores = {}
+            # Define which fields use which metric
+            metadata_recall_keys = ['bpm', 'duration', 'genres', 'keyscale', 'language', 'timesignature']
+            metadata_pmi_keys = ['caption']
+            for key in metadata_recall_keys:
+                if key in metadata and metadata[key] is not None:
+                    recall_metadata = {key: metadata[key]}
+                    field_scores = _calculate_metadata_recall(llm_handler, formatted_prompt, recall_metadata, topk=topk)
+                    scores.update(field_scores)
+            # 2. Calculate PMI for Caption
+            for key in metadata_pmi_keys:
+                if key in metadata and metadata[key] is not None:
+                    cot_yaml = yaml.dump({key: metadata[key]}, allow_unicode=True, sort_keys=True).strip()
+                    target_text = f"<think>\n{cot_yaml}\n</think>\n"
+                    log_prob_cond = _calculate_log_prob(llm_handler, formatted_prompt, target_text)
+                    log_prob_uncond = _calculate_log_prob(llm_handler, prompt_uncond, target_text)
+                    pmi_normalized = pmi_to_normalized_score(log_prob_cond - log_prob_uncond, scale=score_scale)
+                    scores[key] = pmi_normalized
+        # 3. Calculate PMI for Lyrics
+        if lyrics:
+            target_text = f"<think>\n</think>\n# Lyric\n{lyrics}\n"
+            log_prob_cond = _calculate_log_prob(llm_handler, formatted_prompt, target_text)
+            prompt_uncond = llm_handler.build_formatted_prompt_for_understanding(audio_codes="NO USER INPUT", is_negative_prompt=False)
+            log_prob_uncond = _calculate_log_prob(llm_handler, prompt_uncond, target_text)
+            scores['lyrics'] = pmi_to_normalized_score(log_prob_cond - log_prob_uncond, scale=score_scale)
+        if not scores:
+            return {}, 0.0, "❌ No conditions to evaluate"
+        # 4. Global Score
+        global_score = sum(scores.values()) / len(scores)
+        global_score, breakdown_lines = calculate_reward_score(scores)
+        # Status Message
+        status_lines = [breakdown_lines, "\n✅ Per-condition scores (0-1):"]
+        for key, score in sorted(scores.items()):
+            metric = "Top-k Recall" if key in metadata_recall_keys else "PMI (Norm)"
+            status_lines.append(f"  {key}: {score:.4f} ({metric})")
+        status = "\n".join(status_lines)
+        logger.info(f"Calculated scores: {global_score:.4f}\n{status}")
+        return scores, global_score, status
+    except Exception as e:
+        import traceback
+        error_msg = f"❌ Error: {str(e)}"
+        logger.error(error_msg)
+        logger.error(traceback.format_exc())
+        return {}, float('-inf'), error_msg

code/acestep/third_parts/nano-vllm/LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2025 Xingkai Yu
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

code/acestep/third_parts/nano-vllm/README.md ADDED Viewed

	@@ -0,0 +1,66 @@

+<p align="center">
+<img width="300" src="assets/logo.png">
+</p>
+<p align="center">
+<a href="https://trendshift.io/repositories/15323" target="_blank"><img src="https://trendshift.io/api/badge/repositories/15323" alt="GeeeekExplorer%2Fnano-vllm | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
+</p>
+# Nano-vLLM
+A lightweight vLLM implementation built from scratch.
+## Key Features
+* 🚀 **Fast offline inference** - Comparable inference speeds to vLLM
+* 📖 **Readable codebase** - Clean implementation in ~ 1,200 lines of Python code
+* ⚡ **Optimization Suite** - Prefix caching, Tensor Parallelism, Torch compilation, CUDA graph, etc.
+## Installation
+```bash
+pip install git+https://github.com/GeeeekExplorer/nano-vllm.git
+```
+## Model Download
+To download the model weights manually, use the following command:
+```bash
+huggingface-cli download --resume-download Qwen/Qwen3-0.6B \
+  --local-dir ~/huggingface/Qwen3-0.6B/ \
+  --local-dir-use-symlinks False
+```
+## Quick Start
+See `example.py` for usage. The API mirrors vLLM's interface with minor differences in the `LLM.generate` method:
+```python
+from nanovllm import LLM, SamplingParams
+llm = LLM("/YOUR/MODEL/PATH", enforce_eager=True, tensor_parallel_size=1)
+sampling_params = SamplingParams(temperature=0.6, max_tokens=256)
+prompts = ["Hello, Nano-vLLM."]
+outputs = llm.generate(prompts, sampling_params)
+outputs[0]["text"]
+```
+## Benchmark
+See `bench.py` for benchmark.
+**Test Configuration:**
+- Hardware: RTX 4070 Laptop (8GB)
+- Model: Qwen3-0.6B
+- Total Requests: 256 sequences
+- Input Length: Randomly sampled between 100–1024 tokens
+- Output Length: Randomly sampled between 100–1024 tokens
+**Performance Results:**
+| Inference Engine | Output Tokens | Time (s) | Throughput (tokens/s) |
+|----------------|-------------|----------|-----------------------|
+| vLLM           | 133,966     | 98.37    | 1361.84               |
+| Nano-vLLM      | 133,966     | 93.41    | 1434.13               |
+## Star History
+[![Star History Chart](https://api.star-history.com/svg?repos=GeeeekExplorer/nano-vllm&type=Date)](https://www.star-history.com/#GeeeekExplorer/nano-vllm&Date)

code/acestep/third_parts/nano-vllm/assets/logo.png ADDED Viewed

Git LFS Details

SHA256: 03ec4039dc248e97e9943694d3ccfb52c1a73a6dab94c4cd6fd4288e08de98c8
Pointer size: 131 Bytes
Size of remote file: 397 kB

code/acestep/third_parts/nano-vllm/bench.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import os
+import time
+from random import randint, seed
+from nanovllm import LLM, SamplingParams
+# from vllm import LLM, SamplingParams
+def main():
+    seed(0)
+    num_seqs = 256
+    max_input_len = 1024
+    max_ouput_len = 1024
+    path = os.path.expanduser("~/huggingface/Qwen3-0.6B/")
+    llm = LLM(path, enforce_eager=False, max_model_len=4096)
+    prompt_token_ids = [[randint(0, 10000) for _ in range(randint(100, max_input_len))] for _ in range(num_seqs)]
+    sampling_params = [SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=randint(100, max_ouput_len)) for _ in range(num_seqs)]
+    # uncomment the following line for vllm
+    # prompt_token_ids = [dict(prompt_token_ids=p) for p in prompt_token_ids]
+    llm.generate(["Benchmark: "], SamplingParams())
+    t = time.time()
+    llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
+    t = (time.time() - t)
+    total_tokens = sum(sp.max_tokens for sp in sampling_params)
+    throughput = total_tokens / t
+    print(f"Total: {total_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
+if __name__ == "__main__":
+    main()

code/acestep/third_parts/nano-vllm/example.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import os
+from nanovllm import LLM, SamplingParams
+from transformers import AutoTokenizer
+def main():
+    path = os.path.expanduser("~/huggingface/Qwen3-0.6B/")
+    tokenizer = AutoTokenizer.from_pretrained(path)
+    llm = LLM(path, enforce_eager=True, tensor_parallel_size=1)
+    sampling_params = SamplingParams(temperature=0.6, max_tokens=256)
+    prompts = [
+        "introduce yourself",
+        "list all prime numbers within 100",
+    ]
+    prompts = [
+        tokenizer.apply_chat_template(
+            [{"role": "user", "content": prompt}],
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+        for prompt in prompts
+    ]
+    outputs = llm.generate(prompts, sampling_params)
+    for prompt, output in zip(prompts, outputs):
+        print("\n")
+        print(f"Prompt: {prompt!r}")
+        print(f"Completion: {output['text']!r}")
+if __name__ == "__main__":
+    main()

code/acestep/third_parts/nano-vllm/nanovllm/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from nanovllm.llm import LLM
2	+ from nanovllm.sampling_params import SamplingParams

code/acestep/third_parts/nano-vllm/nanovllm/config.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import os
+from dataclasses import dataclass
+from transformers import AutoConfig
+@dataclass
+class Config:
+    model: str
+    max_num_batched_tokens: int = 16384
+    max_num_seqs: int = 512
+    max_model_len: int = 4096
+    gpu_memory_utilization: float = 0.9
+    tensor_parallel_size: int = 1
+    enforce_eager: bool = False
+    hf_config: AutoConfig | None = None
+    eos: int = -1
+    kvcache_block_size: int = 256
+    num_kvcache_blocks: int = -1
+    def __post_init__(self):
+        assert os.path.isdir(self.model)
+        assert self.kvcache_block_size % 256 == 0
+        assert 1 <= self.tensor_parallel_size <= 8
+        self.hf_config = AutoConfig.from_pretrained(self.model)
+        self.max_model_len = min(self.max_model_len, self.hf_config.max_position_embeddings)
+        assert self.max_num_batched_tokens >= self.max_model_len

code/acestep/third_parts/nano-vllm/nanovllm/engine/block_manager.py ADDED Viewed

	@@ -0,0 +1,112 @@

+from collections import deque
+import xxhash
+import numpy as np
+from nanovllm.engine.sequence import Sequence
+class Block:
+    def __init__(self, block_id):
+        self.block_id = block_id
+        self.ref_count = 0
+        self.hash = -1
+        self.token_ids = []
+    def update(self, hash: int, token_ids: list[int]):
+        self.hash = hash
+        self.token_ids = token_ids
+    def reset(self):
+        self.ref_count = 1
+        self.hash = -1
+        self.token_ids = []
+class BlockManager:
+    """
+    Hands out fixed-size cache blocks to sequences and reuses blocks whose
+    token content was registered before, keyed by a chained content hash.
+    """
+    def __init__(self, num_blocks: int, block_size: int):
+        """
+        Args:
+            num_blocks: Total number of blocks to manage.
+            block_size: Number of tokens in a full block.
+        """
+        self.block_size = block_size
+        self.blocks: list[Block] = [Block(i) for i in range(num_blocks)]
+        # Content hash -> id of the block last registered under that hash.
+        # Nothing in this class removes entries, so a lookup can return a block
+        # whose content has since been reset; allocate() re-checks the tokens.
+        self.hash_to_block_id: dict[int, int] = dict()
+        # Ids available for allocation; new blocks are taken from the left end.
+        self.free_block_ids: deque[int] = deque(range(num_blocks))
+        self.used_block_ids: set[int] = set()
+    @classmethod
+    def compute_hash(cls, token_ids: list[int], prefix: int = -1):
+        """
+        Return the xxh64 integer digest of a block's token ids.
+        When `prefix` is not -1 it is mixed in first (as 8 little-endian bytes),
+        which lets callers chain a block's hash to the hash of the block before it.
+        """
+        h = xxhash.xxh64()
+        if prefix != -1:
+            h.update(prefix.to_bytes(8, "little"))
+        h.update(np.array(token_ids).tobytes())
+        return h.intdigest()
+    def _allocate_block(self, block_id: int) -> Block:
+        """Reset an unreferenced block, move its id from free to used, and return it."""
+        block = self.blocks[block_id]
+        assert block.ref_count == 0
+        block.reset()
+        self.free_block_ids.remove(block_id)
+        self.used_block_ids.add(block_id)
+        return self.blocks[block_id]
+    def _deallocate_block(self, block_id: int) -> None:
+        """Move an unreferenced block's id from used back to the end of the free queue."""
+        assert self.blocks[block_id].ref_count == 0
+        self.used_block_ids.remove(block_id)
+        self.free_block_ids.append(block_id)
+    def can_allocate(self, seq: Sequence) -> bool:
+        """Return True when there are at least `seq.num_blocks` free blocks."""
+        return len(self.free_block_ids) >= seq.num_blocks
+    def allocate(self, seq: Sequence):
+        """
+        Fill the (empty) block table of `seq`, one entry per block.
+        Leading full blocks whose chained hash and tokens match a registered
+        block are reused and counted in `seq.num_cached_tokens`; from the first
+        mismatch onwards every block is taken fresh from the free queue.
+        """
+        assert not seq.block_table
+        h = -1
+        # Sticky flag: once one block misses, all later blocks are treated as misses.
+        cache_miss = False
+        for i in range(seq.num_blocks):
+            token_ids = seq.block(i)
+            # Only full blocks are hashed (chained on the previous hash); a partial block gets -1.
+            h = self.compute_hash(token_ids, h) if len(token_ids) == self.block_size else -1
+            block_id = self.hash_to_block_id.get(h, -1)
+            # The stored tokens are compared as well, so a colliding or stale hash entry is not reused.
+            if block_id == -1 or self.blocks[block_id].token_ids != token_ids:
+                cache_miss = True
+            if cache_miss:
+                block_id = self.free_block_ids[0]
+                block = self._allocate_block(block_id)
+            else:
+                seq.num_cached_tokens += self.block_size
+                if block_id in self.used_block_ids:
+                    # Another sequence already holds this block: share it.
+                    block = self.blocks[block_id]
+                    block.ref_count += 1
+                else:
+                    # Matching block is currently unused: claim it again. _allocate_block()
+                    # resets it; hash and tokens are restored by update() just below.
+                    block = self._allocate_block(block_id)
+            if h != -1:
+                block.update(h, token_ids)
+                self.hash_to_block_id[h] = block_id
+            seq.block_table.append(block_id)
+    def deallocate(self, seq: Sequence):
+        """Drop the sequence's reference on each of its blocks (last block first) and clear its table."""
+        for block_id in reversed(seq.block_table):
+            block = self.blocks[block_id]
+            block.ref_count -= 1
+            if block.ref_count == 0:
+                self._deallocate_block(block_id)
+        seq.num_cached_tokens = 0
+        seq.block_table.clear()
+    def can_append(self, seq: Sequence) -> bool:
+        """
+        Return True when the free queue can cover the sequence's next block need.
+        The right-hand side is a bool used as an int: one free block is required
+        only when `len(seq) % block_size == 1`, otherwise none.
+        """
+        return len(self.free_block_ids) >= (len(seq) % self.block_size == 1)
+    def may_append(self, seq: Sequence):
+        """
+        Update block bookkeeping for the current length of `seq`.
+        NOTE(review): presumably called after a token was appended to `seq` — confirm against the scheduler.
+        """
+        block_table = seq.block_table
+        last_block = self.blocks[block_table[-1]]
+        if len(seq) % self.block_size == 1:
+            # Length is one past a block boundary: the last block must already be
+            # hashed, and a new block is taken from the free queue.
+            assert last_block.hash != -1
+            block_id = self.free_block_ids[0]
+            self._allocate_block(block_id)
+            block_table.append(block_id)
+        elif len(seq) % self.block_size == 0:
+            # Last block is exactly full: hash it (chained on the previous block's
+            # hash, if any) and register it for reuse.
+            assert last_block.hash == -1
+            token_ids = seq.block(seq.num_blocks-1)
+            prefix = self.blocks[block_table[-2]].hash if len(block_table) > 1 else -1
+            h = self.compute_hash(token_ids, prefix)
+            last_block.update(h, token_ids)
+            self.hash_to_block_id[h] = last_block.block_id
+        else:
+            # Last block is partially filled and must not have a hash yet.
+            assert last_block.hash == -1

code/acestep/third_parts/nano-vllm/nanovllm/engine/llm_engine.py ADDED Viewed

	@@ -0,0 +1,124 @@

+import atexit
+from dataclasses import fields
+from time import perf_counter
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer
+import torch.multiprocessing as mp
+from nanovllm.config import Config
+from nanovllm.sampling_params import SamplingParams
+from nanovllm.engine.sequence import Sequence
+from nanovllm.engine.scheduler import Scheduler
+from nanovllm.engine.model_runner import ModelRunner
+class LLMEngine:
+    """
+    Drives generation: owns the rank-0 ModelRunner, spawns one worker process
+    per additional tensor-parallel rank, and feeds sequences through the Scheduler.
+    """
+    def __init__(self, model, **kwargs):
+        """
+        Args:
+            model: Model path, passed to Config as its first argument.
+            **kwargs: Keys matching Config fields are forwarded to Config; an
+                optional "tokenizer" entry is used instead of loading one from
+                the model path. Other keys are ignored.
+        """
+        config_fields = {field.name for field in fields(Config)}
+        config_kwargs = {k: v for k, v in kwargs.items() if k in config_fields}
+        config = Config(model, **config_kwargs)
+        self.ps = []
+        self.events = []
+        ctx = mp.get_context("spawn")
+        # Ranks 1..tensor_parallel_size-1 each run a ModelRunner in their own
+        # spawned process; rank 0 runs in this process and receives all events.
+        for i in range(1, config.tensor_parallel_size):
+            event = ctx.Event()
+            process = ctx.Process(target=ModelRunner, args=(config, i, event))
+            process.start()
+            self.ps.append(process)
+            self.events.append(event)
+        self.model_runner = ModelRunner(config, 0, self.events)
+        tokenizer = kwargs.get("tokenizer", None)
+        if tokenizer is not None:
+            self.tokenizer = tokenizer
+        else:
+            self.tokenizer = AutoTokenizer.from_pretrained(config.model, use_fast=True)
+        config.eos = self.tokenizer.eos_token_id
+        self.scheduler = Scheduler(config)
+        # Shut the runner and worker processes down at interpreter exit.
+        atexit.register(self.exit)
+    def exit(self):
+        """
+        Ask the model runner to exit, drop it, and wait for the worker processes.
+        NOTE(review): a second call would fail on the deleted `model_runner` attribute.
+        """
+        self.model_runner.call("exit")
+        del self.model_runner
+        for p in self.ps:
+            p.join()
+    def add_request(self, prompt: str | list[int], sampling_params: SamplingParams, unconditional_prompt: str | list[int] | None = None):
+        """
+        Queue one prompt for generation.
+        String prompts are tokenized first. When `sampling_params.cfg_scale` is
+        above 1.0, a linked pair of sequences (conditional + unconditional) is
+        queued; otherwise a single sequence is queued.
+        """
+        if isinstance(prompt, str):
+            prompt = self.tokenizer.encode(prompt)
+        # For CFG: if cfg_scale > 1.0, create both conditional and unconditional sequences
+        if sampling_params.cfg_scale > 1.0:
+            if unconditional_prompt is None:
+                # Try to construct unconditional prompt by replacing user input with "NO USER INPUT"
+                # This is a fallback - ideally users should provide unconditional_prompt
+                # NOTE(review): both branches below are identical; the unconditional
+                # prompt simply falls back to the (already tokenized) conditional prompt.
+                if isinstance(prompt, list):
+                    # For now, just use the same prompt (user should provide unconditional_prompt)
+                    # TODO: Implement automatic "NO USER INPUT" replacement if possible
+                    unconditional_prompt = prompt
+                else:
+                    unconditional_prompt = prompt
+            if isinstance(unconditional_prompt, str):
+                unconditional_prompt = self.tokenizer.encode(unconditional_prompt)
+            # Create unconditional sequence first (so we can reference it from conditional)
+            uncond_seq = Sequence(unconditional_prompt, sampling_params, is_unconditional=True)
+            # Create conditional sequence with reference to unconditional
+            cond_seq = Sequence(prompt, sampling_params, is_unconditional=False, conditional_seq=uncond_seq)
+            uncond_seq.paired_seq = cond_seq  # Link them bidirectionally
+            # Add both sequences to scheduler
+            self.scheduler.add(cond_seq)
+            self.scheduler.add(uncond_seq)
+        else:
+            seq = Sequence(prompt, sampling_params)
+            self.scheduler.add(seq)
+    def step(self):
+        """
+        Run one scheduling + model step.
+        Returns:
+            (outputs, num_tokens): `outputs` is a list of (seq_id, completion_token_ids)
+            for sequences that finished in this step. `num_tokens` is the total
+            sequence length for a prefill step, or MINUS the number of
+            non-unconditional sequences for a decode step (the sign tells the
+            caller which kind of step ran).
+        """
+        seqs, is_prefill = self.scheduler.schedule()
+        token_ids = self.model_runner.call("run", seqs, is_prefill)
+        self.scheduler.postprocess(seqs, token_ids)
+        # Only output conditional sequences (unconditional sequences are just for CFG computation)
+        output_seqs = [seq for seq in seqs if seq.is_finished and (seq.cfg_scale <= 1.0 or not seq.is_unconditional)]
+        outputs = [(seq.seq_id, seq.completion_token_ids) for seq in output_seqs]
+        num_tokens = sum(len(seq) for seq in seqs) if is_prefill else -len([s for s in seqs if not s.is_unconditional])
+        return outputs, num_tokens
+    def is_finished(self):
+        """Return whether the scheduler reports that all work is finished."""
+        return self.scheduler.is_finished()
+    def generate(
+        self,
+        prompts: list[str] | list[list[int]],
+        sampling_params: SamplingParams | list[SamplingParams],
+        use_tqdm: bool = True,
+        unconditional_prompts: list[str] | list[list[int]] | None = None,
+    ) -> list[dict]:
+        """
+        Generate completions for all prompts and block until every one is done.
+        Args:
+            prompts: Prompt strings or pre-tokenized prompts.
+            sampling_params: One SamplingParams for all prompts, or one per prompt.
+            use_tqdm: Show a progress bar with prefill/decode throughput.
+            unconditional_prompts: Optional per-prompt unconditional prompts,
+                forwarded to add_request.
+        Returns:
+            One dict per finished sequence, ordered by sequence id, with keys
+            "text" (decoded completion) and "token_ids".
+        """
+        if use_tqdm:
+            pbar = tqdm(total=len(prompts), desc="Generating", dynamic_ncols=True)
+        if not isinstance(sampling_params, list):
+            sampling_params = [sampling_params] * len(prompts)
+        if unconditional_prompts is None:
+            unconditional_prompts = [None] * len(prompts)
+        for prompt, sp, uncond_prompt in zip(prompts, sampling_params, unconditional_prompts):
+            self.add_request(prompt, sp, uncond_prompt)
+        outputs = {}
+        prefill_throughput = decode_throughput = 0.
+        while not self.is_finished():
+            t = perf_counter()
+            output, num_tokens = self.step()
+            if use_tqdm:
+                # step() reports prefill steps as a positive count and decode steps as a negative one.
+                if num_tokens > 0:
+                    prefill_throughput = num_tokens / (perf_counter() - t)
+                else:
+                    decode_throughput = -num_tokens / (perf_counter() - t)
+                pbar.set_postfix({
+                    "Prefill": f"{int(prefill_throughput)}tok/s",
+                    "Decode": f"{int(decode_throughput)}tok/s",
+                })
+            for seq_id, token_ids in output:
+                outputs[seq_id] = token_ids
+                if use_tqdm:
+                    pbar.update(1)
+        # Sequences may finish in any order; sort by id before building the result list.
+        outputs = [outputs[seq_id] for seq_id in sorted(outputs.keys())]
+        outputs = [{"text": self.tokenizer.decode(token_ids), "token_ids": token_ids} for token_ids in outputs]
+        if use_tqdm:
+            pbar.close()
+        return outputs

code/acestep/third_parts/nano-vllm/nanovllm/engine/model_runner.py ADDED Viewed

	@@ -0,0 +1,529 @@

+import pickle
+import torch
+import torch.distributed as dist
+from multiprocessing.synchronize import Event
+from multiprocessing.shared_memory import SharedMemory
+import sys
+from nanovllm.config import Config
+from nanovllm.engine.sequence import Sequence
+from nanovllm.models.qwen3 import Qwen3ForCausalLM
+from nanovllm.layers.sampler import Sampler
+from nanovllm.utils.context import set_context, get_context, reset_context
+from nanovllm.utils.loader import load_model
+import socket
+def find_available_port(start_port: int = 2333, max_attempts: int = 100) -> int:
+    """Find an available port starting from start_port.
+    Args:
+        start_port: The starting port number to check
+        max_attempts: Maximum number of ports to try
+    Returns:
+        An available port number
+    Raises:
+        RuntimeError: If no available port is found within max_attempts
+    """
+    for i in range(max_attempts):
+        port = start_port + i
+        try:
+            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+                s.bind(('localhost', port))
+                return port
+        except OSError:
+            # Port is in use, try next one
+            continue
+    raise RuntimeError(f"Could not find an available port starting from {start_port} after {max_attempts} attempts")
+class ModelRunner:
+    def __init__(self, config: Config, rank: int, event: Event | list[Event]):
+        """Join the process group, load the model and set up buffers/caches for one rank.
+        Args:
+            config: Engine configuration. Fields read here include hf_config,
+                kvcache_block_size, enforce_eager, tensor_parallel_size and model.
+            rank: Index of this worker; also passed to torch.cuda.set_device().
+            event: Stored on the instance. write_shm() iterates over it and
+                read_shm() waits on it, so rank 0 is expected to receive a list
+                of events and every other rank a single event.
+        Note:
+            With tensor_parallel_size > 1, ranks other than 0 call loop() at the
+            end of this constructor and only return from it after an "exit"
+            command has been processed.
+        """
+        # Enable capturing scalar outputs to avoid graph breaks from Tensor.item() calls
+        torch._dynamo.config.capture_scalar_outputs = True
+        self.config = config
+        hf_config = config.hf_config
+        self.block_size = config.kvcache_block_size
+        self.enforce_eager = config.enforce_eager
+        self.world_size = config.tensor_parallel_size
+        self.rank = rank
+        self.event = event
+        # NOTE(review): each rank probes for a free port on its own; with
+        # world_size > 1 the ranks presumably need to agree on the same port
+        # for the TCP rendezvous below - confirm this cannot diverge.
+        dist_port = find_available_port()
+        print(f"[debug]dist_port: {dist_port}")
+        # Use gloo backend on Windows, nccl on Linux/other platforms
+        backend = "gloo" if sys.platform == "win32" else "nccl"
+        dist.init_process_group(backend, f"tcp://127.0.0.1:{dist_port}", world_size=self.world_size, rank=rank)
+        torch.cuda.set_device(rank)
+        # Remember the process-wide default dtype so it can be restored below.
+        default_dtype = torch.get_default_dtype()
+        # Use dtype instead of deprecated torch_dtype
+        config_dtype = getattr(hf_config, 'dtype', getattr(hf_config, 'torch_dtype', torch.float32))
+        # While these defaults are active, tensors created without an explicit
+        # device/dtype (model weights, KV cache, graph buffers) use CUDA and
+        # the model dtype.
+        torch.set_default_dtype(config_dtype)
+        torch.set_default_device("cuda")
+        self.model = Qwen3ForCausalLM(hf_config)
+        load_model(self.model, config.model)
+        self.sampler = Sampler()
+        # Pre-allocate buffers for sampling (optimization: avoid repeated tensor creation)
+        # Must be called before warmup_model() since it uses these buffers
+        self._allocate_sample_buffers()
+        self.warmup_model()
+        self.allocate_kv_cache()
+        if not self.enforce_eager:
+            self.capture_cudagraph()
+        # Restore the defaults that were overridden above.
+        torch.set_default_device("cpu")
+        torch.set_default_dtype(default_dtype)
+        if self.world_size > 1:
+            if rank == 0:
+                # Rank 0 creates the 1 MiB shared-memory segment before the
+                # barrier; the other ranks attach to it by name after the barrier.
+                self.shm = SharedMemory(name="nanovllm", create=True, size=2**20)
+                dist.barrier()
+            else:
+                dist.barrier()
+                self.shm = SharedMemory(name="nanovllm")
+                # Worker ranks stay here serving commands until "exit" arrives.
+                self.loop()
+    def _allocate_sample_buffers(self):
+        """Pre-allocate reusable buffers for sampling to avoid repeated tensor creation."""
+        max_bs = self.config.max_num_seqs
+        max_tokens = self.config.max_num_batched_tokens
+        max_num_blocks = (self.config.max_model_len + self.block_size - 1) // self.block_size
+        # Pre-allocate pinned memory buffers on CPU for fast transfer
+        # Must explicitly specify device="cpu" since default device may be "cuda"
+        self._cpu_temperatures = torch.zeros(max_bs, dtype=torch.float32, device="cpu", pin_memory=True)
+        self._cpu_cfg_scales = torch.zeros(max_bs, dtype=torch.float32, device="cpu", pin_memory=True)
+        self._cpu_top_ks = torch.zeros(max_bs, dtype=torch.int32, device="cpu", pin_memory=True)
+        self._cpu_top_ps = torch.zeros(max_bs, dtype=torch.float32, device="cpu", pin_memory=True)
+        self._cpu_repetition_penalties = torch.zeros(max_bs, dtype=torch.float32, device="cpu", pin_memory=True)
+        # Pre-allocate decode buffers on CPU with pinned memory
+        self._cpu_input_ids = torch.zeros(max_bs, dtype=torch.int64, device="cpu", pin_memory=True)
+        self._cpu_positions = torch.zeros(max_bs, dtype=torch.int64, device="cpu", pin_memory=True)
+        self._cpu_slot_mapping = torch.zeros(max_bs, dtype=torch.int32, device="cpu", pin_memory=True)
+        self._cpu_context_lens = torch.zeros(max_bs, dtype=torch.int32, device="cpu", pin_memory=True)
+        # Pre-allocate prefill buffers on CPU with pinned memory (optimization to avoid repeated tensor creation)
+        self._cpu_prefill_input_ids = torch.zeros(max_tokens, dtype=torch.int64, device="cpu", pin_memory=True)
+        self._cpu_prefill_positions = torch.zeros(max_tokens, dtype=torch.int64, device="cpu", pin_memory=True)
+        self._cpu_prefill_cu_seqlens = torch.zeros(max_bs + 1, dtype=torch.int32, device="cpu", pin_memory=True)
+        self._cpu_prefill_slot_mapping = torch.zeros(max_tokens, dtype=torch.int32, device="cpu", pin_memory=True)
+        # Pre-allocate block tables buffer (shared by both decode and prefill)
+        self._cpu_block_tables = torch.zeros(max_bs, max_num_blocks, dtype=torch.int32, device="cpu", pin_memory=True)
+        # Pre-allocate buffer for sequence token IDs (used in logits processor and sampler)
+        # Max length is max_model_len since sequences can be that long
+        self._seq_token_ids_buffer = torch.zeros(max_bs, self.config.max_model_len, dtype=torch.int64, device="cpu", pin_memory=True)
+    def exit(self):
+        if self.world_size > 1:
+            self.shm.close()
+            dist.barrier()
+            if self.rank == 0:
+                self.shm.unlink()
+        if not self.enforce_eager:
+            del self.graphs, self.graph_pool
+        torch.cuda.synchronize()
+        dist.destroy_process_group()
+    def loop(self):
+        while True:
+            method_name, args = self.read_shm()
+            self.call(method_name, *args)
+            if method_name == "exit":
+                break
+    def read_shm(self):
+        assert self.world_size > 1 and self.rank > 0
+        self.event.wait()
+        n = int.from_bytes(self.shm.buf[0:4], "little")
+        method_name, *args = pickle.loads(self.shm.buf[4:n+4])
+        self.event.clear()
+        return method_name, args
+    def write_shm(self, method_name, *args):
+        assert self.world_size > 1 and self.rank == 0
+        data = pickle.dumps([method_name, *args])
+        n = len(data)
+        self.shm.buf[0:4] = n.to_bytes(4, "little")
+        self.shm.buf[4:n+4] = data
+        for event in self.event:
+            event.set()
+    def call(self, method_name, *args):
+        if self.world_size > 1 and self.rank == 0:
+            self.write_shm(method_name, *args)
+        method = getattr(self, method_name, None)
+        return method(*args)
+    def warmup_model(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_peak_memory_stats()
+        max_num_batched_tokens, max_model_len = self.config.max_num_batched_tokens, self.config.max_model_len
+        num_seqs = min(max_num_batched_tokens // max_model_len, self.config.max_num_seqs)
+        seqs = [Sequence([0] * max_model_len) for _ in range(num_seqs)]
+        self.run(seqs, True)
+        torch.cuda.empty_cache()
+    def allocate_kv_cache(self):
+        config = self.config
+        hf_config = config.hf_config
+        free, total = torch.cuda.mem_get_info()
+        current = torch.cuda.memory_stats()["allocated_bytes.all.current"]
+        num_kv_heads = hf_config.num_key_value_heads // self.world_size
+        head_dim = getattr(hf_config, "head_dim", hf_config.hidden_size // hf_config.num_attention_heads)
+        # Use dtype instead of deprecated torch_dtype
+        config_dtype = getattr(hf_config, 'dtype', getattr(hf_config, 'torch_dtype', torch.float32))
+        block_bytes = 2 * hf_config.num_hidden_layers * self.block_size * num_kv_heads * head_dim * config_dtype.itemsize
+        # Calculate available memory for KV cache
+        # After warmup_model, empty_cache has been called, so current represents model memory only
+        # Use free memory but respect the gpu_memory_utilization limit
+        target_total_usage = total * config.gpu_memory_utilization
+        available_for_kv_cache = min(free * 0.9, target_total_usage - current)
+        # Ensure we have positive memory available
+        if available_for_kv_cache <= 0:
+            available_for_kv_cache = free * 0.5  # Fallback to 50% of free memory
+        config.num_kvcache_blocks = max(1, int(available_for_kv_cache) // block_bytes)
+        if config.num_kvcache_blocks <= 0:
+            raise RuntimeError(
+                f"Insufficient GPU memory for KV cache. "
+                f"Free: {free / 1024**3:.2f} GB, Current: {current / 1024**3:.2f} GB, "
+                f"Available for KV: {available_for_kv_cache / 1024**3:.2f} GB, "
+                f"Block size: {block_bytes / 1024**2:.2f} MB"
+            )
+        self.kv_cache = torch.empty(2, hf_config.num_hidden_layers, config.num_kvcache_blocks, self.block_size, num_kv_heads, head_dim)
+        layer_id = 0
+        for module in self.model.modules():
+            if hasattr(module, "k_cache") and hasattr(module, "v_cache"):
+                module.k_cache = self.kv_cache[0, layer_id]
+                module.v_cache = self.kv_cache[1, layer_id]
+                layer_id += 1
+    def prepare_block_tables(self, seqs: list[Sequence]):
+        max_len = max(len(seq.block_table) for seq in seqs)
+        block_tables = [seq.block_table + [-1] * (max_len - len(seq.block_table)) for seq in seqs]
+        block_tables = torch.tensor(block_tables, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
+        return block_tables
+    def prepare_prefill(self, seqs: list[Sequence]):
+        input_ids = []
+        positions = []
+        cu_seqlens_q = [0]
+        cu_seqlens_k = [0]
+        max_seqlen_q = 0
+        max_seqlen_k = 0
+        slot_mapping = []
+        block_tables = None
+        for seq in seqs:
+            seqlen = len(seq)
+            input_ids.extend(seq[seq.num_cached_tokens:])
+            positions.extend(list(range(seq.num_cached_tokens, seqlen)))
+            seqlen_q = seqlen - seq.num_cached_tokens
+            seqlen_k = seqlen
+            cu_seqlens_q.append(cu_seqlens_q[-1] + seqlen_q)
+            cu_seqlens_k.append(cu_seqlens_k[-1] + seqlen_k)
+            max_seqlen_q = max(seqlen_q, max_seqlen_q)
+            max_seqlen_k = max(seqlen_k, max_seqlen_k)
+            if not seq.block_table:    # warmup
+                continue
+            for i in range(seq.num_cached_blocks, seq.num_blocks):
+                start = seq.block_table[i] * self.block_size
+                if i != seq.num_blocks - 1:
+                    end = start + self.block_size
+                else:
+                    end = start + seq.last_block_num_tokens
+                slot_mapping.extend(list(range(start, end)))
+        if cu_seqlens_k[-1] > cu_seqlens_q[-1]:    # prefix cache
+            block_tables = self.prepare_block_tables(seqs)
+        input_ids = torch.tensor(input_ids, dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
+        positions = torch.tensor(positions, dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
+        cu_seqlens_q = torch.tensor(cu_seqlens_q, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
+        cu_seqlens_k = torch.tensor(cu_seqlens_k, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
+        slot_mapping = torch.tensor(slot_mapping, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
+        set_context(True, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, slot_mapping, None, block_tables)
+        return input_ids, positions
+    def prepare_decode(self, seqs: list[Sequence]):
+        """Optimized decode preparation using pre-allocated buffers."""
+        bs = len(seqs)
+        # Use pre-allocated CPU buffers
+        for i, seq in enumerate(seqs):
+            self._cpu_input_ids[i] = seq.last_token
+            self._cpu_positions[i] = len(seq) - 1
+            self._cpu_context_lens[i] = len(seq)
+            self._cpu_slot_mapping[i] = seq.block_table[-1] * self.block_size + seq.last_block_num_tokens - 1
+        # Transfer to GPU using sliced views
+        input_ids = self._cpu_input_ids[:bs].cuda(non_blocking=True)
+        positions = self._cpu_positions[:bs].cuda(non_blocking=True)
+        slot_mapping = self._cpu_slot_mapping[:bs].cuda(non_blocking=True)
+        context_lens = self._cpu_context_lens[:bs].cuda(non_blocking=True)
+        block_tables = self.prepare_block_tables(seqs)
+        set_context(False, slot_mapping=slot_mapping, context_lens=context_lens, block_tables=block_tables)
+        return input_ids, positions
+    def prepare_sample(self, seqs: list[Sequence], is_cfg_batch: bool = False):
+        """Optimized sample preparation using pre-allocated buffers."""
+        if is_cfg_batch:
+            num_seqs = len(seqs) // 2
+            target_seqs = seqs[:num_seqs]
+        else:
+            num_seqs = len(seqs)
+            target_seqs = seqs
+        # Fill pre-allocated CPU buffers
+        top_ks_is_zero = True
+        top_ps_is_one = True
+        repetition_penalties_is_one = True
+        for i, seq in enumerate(target_seqs):
+            self._cpu_temperatures[i] = seq.temperature
+            self._cpu_cfg_scales[i] = seq.cfg_scale
+            self._cpu_top_ks[i] = seq.top_k if seq.top_k is not None else 0
+            if seq.top_k is not None and seq.top_k > 0:
+                top_ks_is_zero = False
+            self._cpu_top_ps[i] = seq.top_p if seq.top_p is not None else 1.0
+            if seq.top_p is not None and seq.top_p == 1.0:
+                top_ps_is_one = False
+            self._cpu_repetition_penalties[i] = seq.repetition_penalty if seq.repetition_penalty is not None else 1.0
+            if seq.repetition_penalty is not None and seq.repetition_penalty == 1.0:
+                repetition_penalties_is_one = False
+        # Transfer to GPU using sliced views (single batched transfer)
+        temperatures = self._cpu_temperatures[:num_seqs].cuda(non_blocking=True)
+        cfg_scales = self._cpu_cfg_scales[:num_seqs].cuda(non_blocking=True)
+        top_ks = self._cpu_top_ks[:num_seqs].cuda(non_blocking=True) if not top_ks_is_zero else None
+        top_ps = self._cpu_top_ps[:num_seqs].cuda(non_blocking=True) if not top_ps_is_one else None
+        repetition_penalties = self._cpu_repetition_penalties[:num_seqs].cuda(non_blocking=True) if not repetition_penalties_is_one else None
+        return temperatures, cfg_scales, top_ks, top_ps, repetition_penalties
+    @torch.inference_mode()
+    def run_model(self, input_ids: torch.Tensor, positions: torch.Tensor, is_prefill: bool):
+        if is_prefill or self.enforce_eager or input_ids.size(0) > 512:
+            return self.model.compute_logits(self.model(input_ids, positions))
+        else:
+            bs = input_ids.size(0)
+            context = get_context()
+            # Check if block_tables size exceeds pre-allocated buffer size
+            # This can happen when conditional and unconditional sequences have different lengths
+            # in CFG mode, causing block_tables to have more columns than expected
+            max_num_blocks = self.graph_vars["block_tables"].size(1)
+            if context.block_tables.size(1) > max_num_blocks:
+                # Fall back to eager mode when block_tables is too large for CUDA graph
+                return self.model.compute_logits(self.model(input_ids, positions))
+            graph = self.graphs[next(x for x in self.graph_bs if x >= bs)]
+            graph_vars = self.graph_vars
+            graph_vars["input_ids"][:bs] = input_ids
+            graph_vars["positions"][:bs] = positions
+            graph_vars["slot_mapping"].fill_(-1)
+            graph_vars["slot_mapping"][:bs] = context.slot_mapping
+            graph_vars["context_lens"].zero_()
+            graph_vars["context_lens"][:bs] = context.context_lens
+            # Clear block_tables first to ensure no stale data from previous runs
+            graph_vars["block_tables"][:bs].fill_(-1)
+            graph_vars["block_tables"][:bs, :context.block_tables.size(1)] = context.block_tables
+            graph.replay()
+            return self.model.compute_logits(graph_vars["outputs"][:bs])
+    def run(self, seqs: list[Sequence], is_prefill: bool) -> list[int]:
+        """Run model forward and sampling. For CFG sequences, batch is structured as:
+        [cond_seq1, cond_seq2, ..., uncond_seq1, uncond_seq2, ...]
+        where uncond_seqi is the paired unconditional sequence of cond_seqi."""
+        # Check if this is a CFG batch (contains paired conditional and unconditional sequences)
+        is_cfg_batch = seqs[0].cfg_scale > 1.0 and seqs[0].paired_seq is not None
+        if is_cfg_batch:
+            # CFG batch: seqs = [cond_seq1, cond_seq2, ..., uncond_seq1, uncond_seq2, ...]
+            num_cond = len(seqs) // 2
+            cond_seqs = seqs[:num_cond]
+            # uncond_seqs = seqs[num_cond:]
+            # Prepare inputs for both conditional and unconditional (they're already in the batch)
+            input_ids, positions = (self.prepare_prefill(seqs) if is_prefill else self.prepare_decode(seqs))
+            sample_params = self.prepare_sample(seqs, is_cfg_batch=True) if self.rank == 0 else None
+            if sample_params is not None:
+                temperatures, cfg_scales, top_ks, top_ps, repetition_penalties = sample_params
+            else:
+                temperatures = cfg_scales = top_ks = top_ps = repetition_penalties = None
+            # Run model forward (processes entire batch: cond + uncond)
+            logits_all = self.run_model(input_ids, positions, is_prefill)
+            reset_context()
+            if self.rank == 0:
+                # Split logits: first half is conditional, second half is unconditional
+                logits_cond = logits_all[:num_cond]
+                logits_uncond = logits_all[num_cond:]
+                # Apply repetition penalty to conditional logits (before CFG)
+                if repetition_penalties is not None:
+                    for i, seq in enumerate(cond_seqs):
+                        penalty = repetition_penalties[i].item()
+                        if penalty != 1.0:
+                            # Only penalize completion tokens (not prompt tokens)
+                            completion_tokens = torch.tensor(seq.completion_token_ids, device=logits_cond.device)
+                            if len(completion_tokens) > 0:
+                                # Create token mask: mark tokens that appeared in completion
+                                token_mask = torch.zeros(logits_cond.shape[1], dtype=torch.bool, device=logits_cond.device)
+                                token_mask[completion_tokens] = True
+                                # Apply standard repetition penalty formula (matching transformers implementation):
+                                # For tokens in completion: if score < 0 then score * penalty, else score / penalty
+                                penalty_scores = torch.where(
+                                    logits_cond[i] < 0,
+                                    logits_cond[i] * penalty,
+                                    logits_cond[i] / penalty
+                                )
+                                # Only apply penalty to tokens that appeared in completion
+                                logits_cond[i] = torch.where(token_mask, penalty_scores, logits_cond[i])
+                # Apply CFG formula: logits_cfg = logits_uncond + cfg_scale * (logits_cond - logits_uncond)
+                cfg_scales_tensor = cfg_scales.unsqueeze(1)  # [num_cond, 1]
+                logits_cfg = logits_uncond + cfg_scales_tensor * (logits_cond - logits_uncond)
+                # Apply logits processor for constrained decoding (if any sequence has one)
+                for i, seq in enumerate(cond_seqs):
+                    if seq.logits_processor is not None:
+                        # Create input_ids tensor for this sequence
+                        seq_input_ids = torch.tensor([seq.token_ids], device=logits_cfg.device)
+                        # Apply processor to this sequence's logits
+                        logits_cfg[i:i+1] = seq.logits_processor(seq_input_ids, logits_cfg[i:i+1])
+                # Prepare input_ids for sampler (for repetition penalty, though we already applied it)
+                # cond_input_ids = torch.tensor([seq.token_ids for seq in cond_seqs], device=logits_cfg.device)
+                # Sample from CFG logits
+                token_ids_cfg = self.sampler(
+                    logits_cfg,
+                    temperatures,
+                    top_ks=top_ks if top_ks is not None else None,
+                    top_ps=top_ps if top_ps is not None else None,
+                    repetition_penalties=None,  # Already applied above
+                    # input_ids=cond_input_ids,
+                ).tolist()
+                # Update logits processor state after sampling
+                for i, seq in enumerate(cond_seqs):
+                    if seq.logits_processor_update_state is not None:
+                        seq.logits_processor_update_state(token_ids_cfg[i])
+                # Return token_ids (will be applied to both conditional and unconditional sequences)
+                return token_ids_cfg
+            else:
+                return None
+        else:
+            # Normal batch (non-CFG)
+            input_ids, positions = (self.prepare_prefill(seqs) if is_prefill
+                                   else self.prepare_decode(seqs))
+            sample_params = self.prepare_sample(seqs, is_cfg_batch=False) if self.rank == 0 else None
+            if sample_params is not None:
+                temperatures, cfg_scales, top_ks, top_ps, repetition_penalties = sample_params
+            else:
+                temperatures = cfg_scales = top_ks = top_ps = repetition_penalties = None
+            logits = self.run_model(input_ids, positions, is_prefill)
+            reset_context()
+            if self.rank == 0:
+                # Apply repetition penalty to logits
+                if repetition_penalties is not None:
+                    for i, seq in enumerate(seqs):
+                        penalty = repetition_penalties[i].item()
+                        if penalty != 1.0:
+                            # Only penalize completion tokens (not prompt tokens)
+                            completion_tokens = torch.tensor(seq.completion_token_ids, device=logits.device)
+                            if len(completion_tokens) > 0:
+                                # Create token mask: mark tokens that appeared in completion
+                                token_mask = torch.zeros(logits.shape[1], dtype=torch.bool, device=logits.device)
+                                token_mask[completion_tokens] = True
+                                # Apply standard repetition penalty formula (matching transformers implementation):
+                                # For tokens in completion: if score < 0 then score * penalty, else score / penalty
+                                penalty_scores = torch.where(
+                                    logits[i] < 0,
+                                    logits[i] * penalty,
+                                    logits[i] / penalty
+                                )
+                                # Only apply penalty to tokens that appeared in completion
+                                logits[i] = torch.where(token_mask, penalty_scores, logits[i])
+                # Apply logits processor for constrained decoding (if any sequence has one)
+                # Clone logits to avoid in-place update issues in inference mode
+                logits = logits.clone()
+                for i, seq in enumerate(seqs):
+                    if seq.logits_processor is not None:
+                        # Create input_ids tensor for this sequence
+                        seq_input_ids = torch.tensor([seq.token_ids], device=logits.device)
+                        # Apply processor to this sequence's logits (clone to avoid inference mode issues)
+                        processed = seq.logits_processor(seq_input_ids, logits[i:i+1].clone())
+                        logits[i] = processed[0]
+                # Prepare input_ids for sampler
+                # seq_input_ids = torch.tensor([seq.token_ids for seq in seqs], device=logits.device)
+                token_ids = self.sampler(
+                    logits,
+                    temperatures,
+                    top_ks=top_ks if top_ks is not None else None,
+                    top_ps=top_ps if top_ps is not None else None,
+                    repetition_penalties=None,  # Already applied above
+                    # input_ids=seq_input_ids,
+                ).tolist()
+                # Update logits processor state after sampling
+                for i, seq in enumerate(seqs):
+                    if seq.logits_processor_update_state is not None:
+                        seq.logits_processor_update_state(token_ids[i])
+                return token_ids
+            else:
+                return None
+    @torch.inference_mode()
+    def capture_cudagraph(self):
+        """Capture one CUDA graph of the decode forward pass per supported batch size.
+        Populates self.graph_bs (captured batch sizes), self.graphs (batch size
+        -> graph), self.graph_pool and self.graph_vars (the static input/output
+        tensors that run_model() fills before replaying a graph).
+        """
+        config = self.config
+        hf_config = config.hf_config
+        # Graphs are captured for at most 512 sequences.
+        max_bs = min(self.config.max_num_seqs, 512)
+        # Ceiling division: blocks needed for a sequence of max_model_len tokens.
+        max_num_blocks = (config.max_model_len + self.block_size - 1) // self.block_size
+        # Static buffers shared by all graphs; each graph works on a [:bs] slice.
+        input_ids = torch.zeros(max_bs, dtype=torch.int64)
+        positions = torch.zeros(max_bs, dtype=torch.int64)
+        slot_mapping = torch.zeros(max_bs, dtype=torch.int32)
+        context_lens = torch.zeros(max_bs, dtype=torch.int32)
+        block_tables = torch.zeros(max_bs, max_num_blocks, dtype=torch.int32)
+        outputs = torch.zeros(max_bs, hf_config.hidden_size)
+        # Batch sizes 1, 2, 4, 8 and then every multiple of 16 up to max_bs.
+        self.graph_bs = [1, 2, 4, 8] + list(range(16, max_bs + 1, 16))
+        self.graphs = {}
+        self.graph_pool = None
+        # Largest batch size first; the pool of the first captured graph is
+        # passed to every later capture.
+        for bs in reversed(self.graph_bs):
+            graph = torch.cuda.CUDAGraph()
+            set_context(False, slot_mapping=slot_mapping[:bs], context_lens=context_lens[:bs], block_tables=block_tables[:bs])
+            outputs[:bs] = self.model(input_ids[:bs], positions[:bs])    # warmup
+            with torch.cuda.graph(graph, self.graph_pool):
+                outputs[:bs] = self.model(input_ids[:bs], positions[:bs])    # capture
+            if self.graph_pool is None:
+                self.graph_pool = graph.pool()
+            self.graphs[bs] = graph
+            torch.cuda.synchronize()
+            reset_context()
+        self.graph_vars = dict(
+            input_ids=input_ids,
+            positions=positions,
+            slot_mapping=slot_mapping,
+            context_lens=context_lens,
+            block_tables=block_tables,
+            outputs=outputs,
+        )

code/acestep/third_parts/nano-vllm/nanovllm/engine/scheduler.py ADDED Viewed

	@@ -0,0 +1,222 @@

+from collections import deque
+from nanovllm.config import Config
+from nanovllm.engine.sequence import Sequence, SequenceStatus
+from nanovllm.engine.block_manager import BlockManager
+class Scheduler:
+    def __init__(self, config: Config):
+        self.max_num_seqs = config.max_num_seqs
+        self.max_num_batched_tokens = config.max_num_batched_tokens
+        self.eos = config.eos
+        self.block_manager = BlockManager(config.num_kvcache_blocks, config.kvcache_block_size)
+        self.waiting: deque[Sequence] = deque()
+        self.running: deque[Sequence] = deque()
+    def is_finished(self):
+        return not self.waiting and not self.running
+    def add(self, seq: Sequence):
+        self.waiting.append(seq)
+    def schedule(self) -> tuple[list[Sequence], bool]:
+        # prefill
+        scheduled_seqs = []
+        num_seqs = 0
+        num_batched_tokens = 0
+        processed_seqs = set()  # Track processed sequences to handle CFG pairs
+        while self.waiting and num_seqs < self.max_num_seqs:
+            seq = self.waiting[0]
+            # For CFG sequences, ensure conditional and unconditional are scheduled together
+            if seq.cfg_scale > 1.0 and seq.paired_seq is not None and not seq.is_unconditional:
+                # This is a conditional sequence, need to schedule its paired unconditional sequence too
+                paired_seq = seq.paired_seq
+                if paired_seq.status != SequenceStatus.WAITING:
+                    # Paired sequence not in waiting, skip this conditional sequence for now
+                    break
+                # Calculate tokens for both sequences
+                total_tokens = (len(seq) - seq.num_cached_tokens) + (len(paired_seq) - paired_seq.num_cached_tokens)
+                can_allocate_both = (self.block_manager.can_allocate(seq) and
+                                    self.block_manager.can_allocate(paired_seq))
+                if num_batched_tokens + total_tokens > self.max_num_batched_tokens or not can_allocate_both:
+                    break
+                # Schedule both sequences: conditional first, then unconditional
+                for s in [seq, paired_seq]:
+                    num_seqs += 1
+                    self.block_manager.allocate(s)
+                    num_batched_tokens += len(s) - s.num_cached_tokens
+                    s.status = SequenceStatus.RUNNING
+                    self.waiting.remove(s)
+                    self.running.append(s)
+                    scheduled_seqs.append(s)
+                    processed_seqs.add(s.seq_id)
+            else:
+                # Normal sequence or unconditional sequence (already processed with its conditional)
+                if seq.seq_id in processed_seqs:
+                    # Skip if already processed as part of a CFG pair
+                    self.waiting.popleft()
+                    continue
+                if num_batched_tokens + len(seq) > self.max_num_batched_tokens or not self.block_manager.can_allocate(seq):
+                    break
+                num_seqs += 1
+                self.block_manager.allocate(seq)
+                num_batched_tokens += len(seq) - seq.num_cached_tokens
+                seq.status = SequenceStatus.RUNNING
+                self.waiting.popleft()
+                self.running.append(seq)
+                scheduled_seqs.append(seq)
+        if scheduled_seqs:
+            # For CFG batches, ensure conditional sequences come before their unconditional pairs
+            cfg_cond_seqs = [s for s in scheduled_seqs if s.cfg_scale > 1.0 and not s.is_unconditional]
+            cfg_uncond_seqs = [s for s in scheduled_seqs if s.is_unconditional]
+            non_cfg_seqs = [s for s in scheduled_seqs if s.cfg_scale <= 1.0]
+            # Reorder: non-CFG, then CFG conditional, then CFG unconditional
+            scheduled_seqs = non_cfg_seqs + cfg_cond_seqs + cfg_uncond_seqs
+            return scheduled_seqs, True
+        # decode
+        processed_seqs = set()
+        temp_running = list(self.running)  # Work with a copy
+        while temp_running and num_seqs < self.max_num_seqs:
+            seq = temp_running.pop(0)
+            # For CFG sequences, ensure conditional and unconditional are scheduled together
+            if seq.cfg_scale > 1.0 and seq.paired_seq is not None and not seq.is_unconditional:
+                paired_seq = seq.paired_seq
+                if paired_seq not in temp_running:
+                    # Paired sequence not available, skip for now
+                    continue
+                # Remove paired_seq from temp_running
+                temp_running.remove(paired_seq)
+                # Check if both can append
+                can_append_both = (self.block_manager.can_append(seq) and
+                                  self.block_manager.can_append(paired_seq))
+                if not can_append_both:
+                    # Try preempting other sequences
+                    preempted = False
+                    while not can_append_both and temp_running:
+                        other_seq = temp_running.pop(0)
+                        if other_seq != seq and other_seq != paired_seq:
+                            self.preempt(other_seq)
+                            can_append_both = (self.block_manager.can_append(seq) and
+                                              self.block_manager.can_append(paired_seq))
+                            preempted = True
+                        else:
+                            temp_running.append(other_seq)
+                            break
+                    if not can_append_both:
+                        # Can't schedule this pair right now
+                        temp_running.append(seq)
+                        temp_running.append(paired_seq)
+                        continue
+                # Schedule both sequences
+                for s in [seq, paired_seq]:
+                    num_seqs += 1
+                    self.block_manager.may_append(s)
+                    scheduled_seqs.append(s)
+                    processed_seqs.add(s.seq_id)
+                    # Remove from actual running list if scheduled
+                    if s in self.running:
+                        self.running.remove(s)
+            else:
+                # Normal sequence or unconditional (already processed)
+                if seq.seq_id in processed_seqs:
+                    continue
+                while not self.block_manager.can_append(seq):
+                    if temp_running:
+                        other_seq = temp_running.pop(0)
+                        if other_seq != seq:
+                            self.preempt(other_seq)
+                        else:
+                            temp_running.append(other_seq)
+                            break
+                    else:
+                        self.preempt(seq)
+                        if seq in self.running:
+                            self.running.remove(seq)
+                        break
+                else:
+                    num_seqs += 1
+                    self.block_manager.may_append(seq)
+                    scheduled_seqs.append(seq)
+                    if seq in self.running:
+                        self.running.remove(seq)
+        assert scheduled_seqs
+        # For CFG batches in decode, ensure conditional sequences come before unconditional
+        cfg_cond_seqs = [s for s in scheduled_seqs if s.cfg_scale > 1.0 and not s.is_unconditional]
+        cfg_uncond_seqs = [s for s in scheduled_seqs if s.is_unconditional]
+        non_cfg_seqs = [s for s in scheduled_seqs if s.cfg_scale <= 1.0]
+        scheduled_seqs = non_cfg_seqs + cfg_cond_seqs + cfg_uncond_seqs
+        self.running.extendleft(reversed(scheduled_seqs))
+        return scheduled_seqs, False
+    def preempt(self, seq: Sequence):
+        seq.status = SequenceStatus.WAITING
+        self.block_manager.deallocate(seq)
+        self.waiting.appendleft(seq)
+    def postprocess(self, seqs: list[Sequence], token_ids: list[int]) -> list[bool]:
+        # Check if this is a CFG batch
+        is_cfg_batch = False
+        if len(seqs) > 0 and seqs[0].cfg_scale > 1.0 and seqs[0].paired_seq is not None:
+            num_cond = len(seqs) // 2
+            is_cfg_batch = (num_cond > 0 and
+                           not seqs[0].is_unconditional and
+                           seqs[num_cond].is_unconditional)
+        if is_cfg_batch:
+            # CFG batch: seqs = [cond_seq1, cond_seq2, ..., uncond_seq1, uncond_seq2, ...]
+            # token_ids correspond to conditional sequences only (sampled from CFG logits)
+            num_cond = len(seqs) // 2
+            cond_seqs = seqs[:num_cond]
+            uncond_seqs = seqs[num_cond:]
+            # Apply the same sampled token to both conditional and unconditional sequences
+            for i, (cond_seq, uncond_seq, token_id) in enumerate(zip(cond_seqs, uncond_seqs, token_ids)):
+                cond_seq.append_token(token_id)
+                uncond_seq.append_token(token_id)  # Same token for unconditional
+                # Check if either sequence is finished
+                cond_finished = ((not cond_seq.ignore_eos and token_id == self.eos) or
+                                cond_seq.num_completion_tokens == cond_seq.max_tokens)
+                uncond_finished = ((not uncond_seq.ignore_eos and token_id == self.eos) or
+                                  uncond_seq.num_completion_tokens == uncond_seq.max_tokens)
+                if cond_finished or uncond_finished:
+                    # Mark both as finished
+                    cond_seq.status = SequenceStatus.FINISHED
+                    uncond_seq.status = SequenceStatus.FINISHED
+                    self.block_manager.deallocate(cond_seq)
+                    self.block_manager.deallocate(uncond_seq)
+                    if cond_seq in self.running:
+                        self.running.remove(cond_seq)
+                    if uncond_seq in self.running:
+                        self.running.remove(uncond_seq)
+        else:
+            # Normal batch
+            for seq, token_id in zip(seqs, token_ids):
+                seq.append_token(token_id)
+                if (not seq.ignore_eos and token_id == self.eos) or seq.num_completion_tokens == seq.max_tokens:
+                    seq.status = SequenceStatus.FINISHED
+                    self.block_manager.deallocate(seq)
+                    self.running.remove(seq)

code/acestep/third_parts/nano-vllm/nanovllm/engine/sequence.py ADDED Viewed

	@@ -0,0 +1,96 @@

+from copy import copy
+from enum import Enum, auto
+from itertools import count
+from typing import Optional, Callable, Any
+from nanovllm.sampling_params import SamplingParams
+class SequenceStatus(Enum):
+    WAITING = auto()
+    RUNNING = auto()
+    FINISHED = auto()
+class Sequence:
+    block_size = 256
+    counter = count()
+    def __init__(self, token_ids: list[int], sampling_params = SamplingParams(), is_unconditional: bool = False, conditional_seq = None):
+        self.seq_id = next(Sequence.counter)
+        self.status = SequenceStatus.WAITING
+        self.token_ids = copy(token_ids)
+        self.last_token = token_ids[-1]
+        self.num_tokens = len(self.token_ids)
+        self.num_prompt_tokens = len(token_ids)
+        self.num_cached_tokens = 0
+        self.block_table = []
+        self.temperature = sampling_params.temperature
+        self.max_tokens = sampling_params.max_tokens
+        self.ignore_eos = sampling_params.ignore_eos
+        self.cfg_scale = sampling_params.cfg_scale
+        self.top_k = sampling_params.top_k
+        self.top_p = sampling_params.top_p
+        self.repetition_penalty = sampling_params.repetition_penalty
+        # For CFG: mark if this is an unconditional sequence
+        self.is_unconditional = is_unconditional
+        # For CFG: reference to the corresponding conditional sequence (if this is unconditional)
+        # For conditional sequences, this points to the unconditional sequence
+        self.paired_seq = conditional_seq  # For conditional seq, points to uncond; for uncond seq, points to cond
+        # For constrained decoding: logits processor and state update callback
+        self.logits_processor: Optional[Any] = sampling_params.logits_processor
+        self.logits_processor_update_state: Optional[Callable[[int], None]] = sampling_params.logits_processor_update_state
+    def __len__(self):
+        return self.num_tokens
+    def __getitem__(self, key):
+        return self.token_ids[key]
+    @property
+    def is_finished(self):
+        return self.status == SequenceStatus.FINISHED
+    @property
+    def num_completion_tokens(self):
+        return self.num_tokens - self.num_prompt_tokens
+    @property
+    def prompt_token_ids(self):
+        return self.token_ids[:self.num_prompt_tokens]
+    @property
+    def completion_token_ids(self):
+        return self.token_ids[self.num_prompt_tokens:]
+    @property
+    def num_cached_blocks(self):
+        return self.num_cached_tokens // self.block_size
+    @property
+    def num_blocks(self):
+        return (self.num_tokens + self.block_size - 1) // self.block_size
+    @property
+    def last_block_num_tokens(self):
+        return self.num_tokens - (self.num_blocks - 1) * self.block_size
+    def block(self, i):
+        assert 0 <= i < self.num_blocks
+        return self.token_ids[i*self.block_size: (i+1)*self.block_size]
+    def append_token(self, token_id: int):
+        self.token_ids.append(token_id)
+        self.last_token = token_id
+        self.num_tokens += 1
+    def __getstate__(self):
+        return (self.num_tokens, self.num_prompt_tokens, self.num_cached_tokens, self.block_table,
+                self.token_ids if self.num_completion_tokens == 0 else self.last_token)
+    def __setstate__(self, state):
+        self.num_tokens, self.num_prompt_tokens, self.num_cached_tokens, self.block_table = state[:-1]
+        if self.num_completion_tokens == 0:
+            self.token_ids = state[-1]
+        else:
+            self.last_token = state[-1]

code/acestep/third_parts/nano-vllm/nanovllm/layers/activation.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import torch
+from torch import nn
+import torch.nn.functional as F
+class SiluAndMul(nn.Module):
+    def __init__(self):
+        super().__init__()
+    @torch.compile
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, y = x.chunk(2, -1)
+        return F.silu(x) * y

code/acestep/third_parts/nano-vllm/nanovllm/layers/attention.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import torch
+from torch import nn
+import triton
+import triton.language as tl
+from flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache
+from nanovllm.utils.context import get_context
+# Triton kernel that copies one key row and one value row into the caches.
+# Each program instance handles the row selected by its program id along axis 0:
+# it reads that row's destination slot from ``slot_mapping_ptr``, loads ``D``
+# consecutive elements from the key and value inputs, and stores them at
+# offset ``slot * D`` in the key and value caches.
+@triton.jit
+def store_kvcache_kernel(
+    key_ptr,
+    key_stride,
+    value_ptr,
+    value_stride,
+    k_cache_ptr,
+    v_cache_ptr,
+    slot_mapping_ptr,
+    D: tl.constexpr,
+):
+    # Index of the row handled by this program instance.
+    idx = tl.program_id(0)
+    slot = tl.load(slot_mapping_ptr + idx)
+    # A slot of -1 means "do not store this row".
+    if slot == -1: return
+    # Source offsets: row start (idx * stride) plus D consecutive elements.
+    key_offsets = idx * key_stride + tl.arange(0, D)
+    value_offsets = idx * value_stride + tl.arange(0, D)
+    key = tl.load(key_ptr + key_offsets)
+    value = tl.load(value_ptr + value_offsets)
+    # Destination offsets: each cache slot spans D consecutive elements.
+    cache_offsets = slot * D + tl.arange(0, D)
+    tl.store(k_cache_ptr + cache_offsets, key)
+    tl.store(v_cache_ptr + cache_offsets, value)
+def store_kvcache(key: torch.Tensor, value: torch.Tensor, k_cache: torch.Tensor, v_cache: torch.Tensor, slot_mapping: torch.Tensor):
+    N, num_heads, head_dim = key.shape
+    D = num_heads * head_dim
+    assert key.stride(-1) == 1 and value.stride(-1) == 1
+    assert key.stride(1) == head_dim and value.stride(1) == head_dim
+    assert k_cache.stride(1) == D and v_cache.stride(1) == D
+    assert slot_mapping.numel() == N
+    store_kvcache_kernel[(N,)](key, key.stride(0), value, value.stride(0), k_cache, v_cache, slot_mapping, D)
+class Attention(nn.Module):
+    def __init__(
+        self,
+        num_heads,
+        head_dim,
+        scale,
+        num_kv_heads,
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.scale = scale
+        self.num_kv_heads = num_kv_heads
+        self.k_cache = self.v_cache = torch.tensor([])
+    def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
+        context = get_context()
+        k_cache, v_cache = self.k_cache, self.v_cache
+        if k_cache.numel() and v_cache.numel():
+            store_kvcache(k, v, k_cache, v_cache, context.slot_mapping)
+        if context.is_prefill:
+            if context.block_tables is not None:    # prefix cache
+                k, v = k_cache, v_cache
+            o = flash_attn_varlen_func(q, k, v,
+                                       max_seqlen_q=context.max_seqlen_q, cu_seqlens_q=context.cu_seqlens_q,
+                                       max_seqlen_k=context.max_seqlen_k, cu_seqlens_k=context.cu_seqlens_k,
+                                       softmax_scale=self.scale, causal=True, block_table=context.block_tables)
+        else:    # decode
+            o = flash_attn_with_kvcache(q.unsqueeze(1), k_cache, v_cache,
+                                        cache_seqlens=context.context_lens, block_table=context.block_tables,
+                                        softmax_scale=self.scale, causal=True)
+        return o

code/acestep/third_parts/nano-vllm/nanovllm/layers/embed_head.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import torch
+from torch import nn
+import torch.nn.functional as F
+import torch.distributed as dist
+from nanovllm.utils.context import get_context
+class VocabParallelEmbedding(nn.Module):
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+    ):
+        super().__init__()
+        self.tp_rank = dist.get_rank()
+        self.tp_size = dist.get_world_size()
+        assert num_embeddings % self.tp_size == 0
+        self.num_embeddings = num_embeddings
+        self.num_embeddings_per_partition = self.num_embeddings // self.tp_size
+        self.vocab_start_idx = self.num_embeddings_per_partition * self.tp_rank
+        self.vocab_end_idx = self.vocab_start_idx + self.num_embeddings_per_partition
+        self.weight = nn.Parameter(torch.empty(self.num_embeddings_per_partition, embedding_dim))
+        self.weight.weight_loader = self.weight_loader
+    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
+        param_data = param.data
+        shard_size = param_data.size(0)
+        start_idx = self.tp_rank * shard_size
+        loaded_weight = loaded_weight.narrow(0, start_idx, shard_size)
+        param_data.copy_(loaded_weight)
+    def forward(self, x: torch.Tensor):
+        if self.tp_size > 1:
+            mask = (x >= self.vocab_start_idx) & (x < self.vocab_end_idx)
+            x = mask * (x - self.vocab_start_idx)
+        y = F.embedding(x, self.weight)
+        if self.tp_size > 1:
+            y = mask.unsqueeze(1) * y
+            dist.all_reduce(y)
+        return y
+class ParallelLMHead(VocabParallelEmbedding):
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        bias: bool = False,
+    ):
+        assert not bias
+        super().__init__(num_embeddings, embedding_dim)
+    def forward(self, x: torch.Tensor):
+        context = get_context()
+        if context.is_prefill:
+            last_indices = context.cu_seqlens_q[1:] - 1
+            x = x[last_indices].contiguous()
+        logits = F.linear(x, self.weight)
+        if self.tp_size > 1:
+            all_logits = [torch.empty_like(logits) for _ in range(self.tp_size)] if self.tp_rank == 0 else None
+            dist.gather(logits, all_logits, 0)
+            logits = torch.cat(all_logits, -1) if self.tp_rank == 0 else None
+        return logits

code/acestep/third_parts/nano-vllm/nanovllm/layers/layernorm.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import torch
+from torch import nn
+class RMSNorm(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+    @torch.compile
+    def rms_forward(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        orig_dtype = x.dtype
+        x = x.float()
+        var = x.pow(2).mean(dim=-1, keepdim=True)
+        x.mul_(torch.rsqrt(var + self.eps))
+        x = x.to(orig_dtype).mul_(self.weight)
+        return x
+    @torch.compile
+    def add_rms_forward(
+        self,
+        x: torch.Tensor,
+        residual: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        orig_dtype = x.dtype
+        x = x.float().add_(residual.float())
+        residual = x.to(orig_dtype)
+        var = x.pow(2).mean(dim=-1, keepdim=True)
+        x.mul_(torch.rsqrt(var + self.eps))
+        x = x.to(orig_dtype).mul_(self.weight)
+        return x, residual
+    def forward(
+        self,
+        x: torch.Tensor,
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        if residual is None:
+            return self.rms_forward(x)
+        else:
+            return self.add_rms_forward(x, residual)

code/acestep/third_parts/nano-vllm/nanovllm/layers/linear.py ADDED Viewed

	@@ -0,0 +1,153 @@

+import torch
+from torch import nn
+import torch.nn.functional as F
+import torch.distributed as dist
+def divide(numerator, denominator):
+    assert numerator % denominator == 0
+    return numerator // denominator
+class LinearBase(nn.Module):
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        bias: bool = False,
+        tp_dim: int | None = None,
+    ):
+        super().__init__()
+        self.tp_dim = tp_dim
+        self.tp_rank = dist.get_rank()
+        self.tp_size = dist.get_world_size()
+        self.weight = nn.Parameter(torch.empty(output_size, input_size))
+        self.weight.weight_loader = self.weight_loader
+        if bias:
+            self.bias = nn.Parameter(torch.empty(output_size))
+            self.bias.weight_loader = self.weight_loader
+        else:
+            self.register_parameter("bias", None)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        raise NotImplementedError
+class ReplicatedLinear(LinearBase):
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        bias: bool = False,
+    ):
+        super().__init__(input_size, output_size, bias)
+    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
+        param.data.copy_(loaded_weight)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return F.linear(x, self.weight, self.bias)
+class ColumnParallelLinear(LinearBase):
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        bias: bool = False,
+    ):
+        tp_size = dist.get_world_size()
+        super().__init__(input_size, divide(output_size, tp_size), bias, 0)
+    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
+        param_data = param.data
+        shard_size = param_data.size(self.tp_dim)
+        start_idx = self.tp_rank * shard_size
+        loaded_weight = loaded_weight.narrow(self.tp_dim, start_idx, shard_size)
+        param_data.copy_(loaded_weight)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return F.linear(x, self.weight, self.bias)
+class MergedColumnParallelLinear(ColumnParallelLinear):
+    def __init__(
+        self,
+        input_size: int,
+        output_sizes: list[int],
+        bias: bool = False,
+    ):
+        self.output_sizes = output_sizes
+        super().__init__(input_size, sum(output_sizes), bias)
+    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, loaded_shard_id: int):
+        param_data = param.data
+        shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size
+        shard_size = self.output_sizes[loaded_shard_id] // self.tp_size
+        param_data = param_data.narrow(self.tp_dim, shard_offset, shard_size)
+        loaded_weight = loaded_weight.chunk(self.tp_size, self.tp_dim)[self.tp_rank]
+        param_data.copy_(loaded_weight)
+class QKVParallelLinear(ColumnParallelLinear):
+    def __init__(
+        self,
+        hidden_size: int,
+        head_size: int,
+        total_num_heads: int,
+        total_num_kv_heads: int | None = None,
+        bias: bool = False,
+    ):
+        tp_size = dist.get_world_size()
+        total_num_kv_heads = total_num_kv_heads or total_num_heads
+        self.head_size = head_size
+        self.num_heads = divide(total_num_heads, tp_size)
+        self.num_kv_heads = divide(total_num_kv_heads, tp_size)
+        output_size = (total_num_heads + 2 * total_num_kv_heads) * self.head_size
+        super().__init__(hidden_size, output_size, bias)
+    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, loaded_shard_id: str):
+        param_data = param.data
+        assert loaded_shard_id in ["q", "k", "v"]
+        if loaded_shard_id == "q":
+            shard_size = self.num_heads * self.head_size
+            shard_offset = 0
+        elif loaded_shard_id == "k":
+            shard_size = self.num_kv_heads * self.head_size
+            shard_offset = self.num_heads * self.head_size
+        else:
+            shard_size = self.num_kv_heads * self.head_size
+            shard_offset = self.num_heads * self.head_size + self.num_kv_heads * self.head_size
+        param_data = param_data.narrow(self.tp_dim, shard_offset, shard_size)
+        loaded_weight = loaded_weight.chunk(self.tp_size, self.tp_dim)[self.tp_rank]
+        param_data.copy_(loaded_weight)
+class RowParallelLinear(LinearBase):
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        bias: bool = False,
+    ):
+        tp_size = dist.get_world_size()
+        super().__init__(divide(input_size, tp_size), output_size, bias, 1)
+    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
+        param_data = param.data
+        shard_size = param_data.size(self.tp_dim)
+        start_idx = self.tp_rank * shard_size
+        loaded_weight = loaded_weight.narrow(self.tp_dim, start_idx, shard_size)
+        param_data.copy_(loaded_weight)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        y = F.linear(x, self.weight, self.bias if self.tp_rank == 0 else None)
+        if self.tp_size > 1:
+            dist.all_reduce(y)
+        return y