Spaces:
Sleeping
Sleeping
Merge pull request #7 from ace-step/add_rewrite_lyrics
Browse files
- .gitignore +2 -1
- acestep/api_server.py +4 -1
- acestep/gradio_ui/events/__init__.py +35 -0
- acestep/gradio_ui/events/generation_handlers.py +147 -43
- acestep/gradio_ui/events/results_handlers.py +22 -7
- acestep/gradio_ui/i18n/en.json +8 -2
- acestep/gradio_ui/i18n/ja.json +8 -2
- acestep/gradio_ui/i18n/zh.json +8 -2
- acestep/gradio_ui/interfaces/generation.py +44 -18
- acestep/handler.py +2 -0
- acestep/inference.py +181 -19
- acestep/llm_inference.py +217 -25
- examples/simple_mode/example_01.json +1 -1
- examples/simple_mode/example_02.json +1 -1
- examples/simple_mode/example_03.json +1 -1
- examples/simple_mode/example_04.json +1 -1
- examples/simple_mode/example_05.json +1 -1
- examples/simple_mode/example_06.json +1 -1
- examples/simple_mode/example_07.json +1 -1
- examples/simple_mode/example_08.json +1 -1
- examples/simple_mode/example_09.json +1 -1
- examples/simple_mode/example_10.json +1 -1
.gitignore
CHANGED
|
@@ -220,4 +220,5 @@ discord_bot/
|
|
| 220 |
feishu_bot/
|
| 221 |
tmp*
|
| 222 |
torchinductor_root/
|
| 223 |
-
scripts/
|
|
|
|
|
|
| 220 |
feishu_bot/
|
| 221 |
tmp*
|
| 222 |
torchinductor_root/
|
| 223 |
+
scripts/
|
| 224 |
+
checkpoints_legacy/
|
acestep/api_server.py
CHANGED
|
@@ -94,6 +94,7 @@ class GenerateMusicRequest(BaseModel):
|
|
| 94 |
use_adg: bool = False
|
| 95 |
cfg_interval_start: float = 0.0
|
| 96 |
cfg_interval_end: float = 1.0
|
|
|
|
| 97 |
|
| 98 |
audio_format: str = "mp3"
|
| 99 |
use_tiled_decode: bool = True
|
|
@@ -535,10 +536,10 @@ def create_app() -> FastAPI:
|
|
| 535 |
|
| 536 |
if sample_mode:
|
| 537 |
print("[api_server] Sample mode: generating random caption/lyrics via LM")
|
|
|
|
| 538 |
sample_metadata, sample_status = llm.understand_audio_from_codes(
|
| 539 |
audio_codes="NO USER INPUT",
|
| 540 |
temperature=req.lm_temperature,
|
| 541 |
-
negative_prompt=req.lm_negative_prompt,
|
| 542 |
top_k=lm_top_k if lm_top_k > 0 else None,
|
| 543 |
top_p=lm_top_p if lm_top_p < 1.0 else None,
|
| 544 |
repetition_penalty=req.lm_repetition_penalty,
|
|
@@ -584,6 +585,7 @@ def create_app() -> FastAPI:
|
|
| 584 |
use_adg=req.use_adg,
|
| 585 |
cfg_interval_start=req.cfg_interval_start,
|
| 586 |
cfg_interval_end=req.cfg_interval_end,
|
|
|
|
| 587 |
repainting_start=req.repainting_start,
|
| 588 |
repainting_end=req.repainting_end if req.repainting_end else -1,
|
| 589 |
audio_cover_strength=req.audio_cover_strength,
|
|
@@ -854,6 +856,7 @@ def create_app() -> FastAPI:
|
|
| 854 |
use_adg=_to_bool(get("use_adg"), False),
|
| 855 |
cfg_interval_start=_to_float(get("cfg_interval_start"), 0.0) or 0.0,
|
| 856 |
cfg_interval_end=_to_float(get("cfg_interval_end"), 1.0) or 1.0,
|
|
|
|
| 857 |
audio_format=str(get("audio_format", "mp3") or "mp3"),
|
| 858 |
use_tiled_decode=_to_bool(_get_any("use_tiled_decode", "useTiledDecode"), True),
|
| 859 |
lm_model_path=str(get("lm_model_path") or "").strip() or None,
|
|
|
|
| 94 |
use_adg: bool = False
|
| 95 |
cfg_interval_start: float = 0.0
|
| 96 |
cfg_interval_end: float = 1.0
|
| 97 |
+
infer_method: str = "ode" # "ode" or "sde" - diffusion inference method
|
| 98 |
|
| 99 |
audio_format: str = "mp3"
|
| 100 |
use_tiled_decode: bool = True
|
|
|
|
| 536 |
|
| 537 |
if sample_mode:
|
| 538 |
print("[api_server] Sample mode: generating random caption/lyrics via LM")
|
| 539 |
+
# Note: understand_audio_from_codes does not support cfg_scale or negative_prompt
|
| 540 |
sample_metadata, sample_status = llm.understand_audio_from_codes(
|
| 541 |
audio_codes="NO USER INPUT",
|
| 542 |
temperature=req.lm_temperature,
|
|
|
|
| 543 |
top_k=lm_top_k if lm_top_k > 0 else None,
|
| 544 |
top_p=lm_top_p if lm_top_p < 1.0 else None,
|
| 545 |
repetition_penalty=req.lm_repetition_penalty,
|
|
|
|
| 585 |
use_adg=req.use_adg,
|
| 586 |
cfg_interval_start=req.cfg_interval_start,
|
| 587 |
cfg_interval_end=req.cfg_interval_end,
|
| 588 |
+
infer_method=req.infer_method,
|
| 589 |
repainting_start=req.repainting_start,
|
| 590 |
repainting_end=req.repainting_end if req.repainting_end else -1,
|
| 591 |
audio_cover_strength=req.audio_cover_strength,
|
|
|
|
| 856 |
use_adg=_to_bool(get("use_adg"), False),
|
| 857 |
cfg_interval_start=_to_float(get("cfg_interval_start"), 0.0) or 0.0,
|
| 858 |
cfg_interval_end=_to_float(get("cfg_interval_end"), 1.0) or 1.0,
|
| 859 |
+
infer_method=str(_get_any("infer_method", "inferMethod", default="ode") or "ode"),
|
| 860 |
audio_format=str(get("audio_format", "mp3") or "mp3"),
|
| 861 |
use_tiled_decode=_to_bool(_get_any("use_tiled_decode", "useTiledDecode"), True),
|
| 862 |
lm_model_path=str(get("lm_model_path") or "").strip() or None,
|
acestep/gradio_ui/events/__init__.py
CHANGED
|
@@ -190,6 +190,37 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
|
|
| 190 |
outputs=[generation_section["lyrics"]]
|
| 191 |
)
|
| 192 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
# ========== Simple/Custom Mode Toggle ==========
|
| 194 |
generation_section["generation_mode"].change(
|
| 195 |
fn=gen_h.handle_generation_mode_change,
|
|
@@ -245,6 +276,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
|
|
| 245 |
generation_section["audio_duration"],
|
| 246 |
generation_section["key_scale"],
|
| 247 |
generation_section["vocal_language"],
|
|
|
|
| 248 |
generation_section["time_signature"],
|
| 249 |
generation_section["instrumental_checkbox"],
|
| 250 |
generation_section["caption_accordion"],
|
|
@@ -279,6 +311,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
|
|
| 279 |
generation_section["cfg_interval_start"],
|
| 280 |
generation_section["cfg_interval_end"],
|
| 281 |
generation_section["shift"],
|
|
|
|
| 282 |
generation_section["audio_format"],
|
| 283 |
generation_section["lm_temperature"],
|
| 284 |
generation_section["lm_cfg_scale"],
|
|
@@ -476,6 +509,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
|
|
| 476 |
generation_section["cfg_interval_start"],
|
| 477 |
generation_section["cfg_interval_end"],
|
| 478 |
generation_section["shift"],
|
|
|
|
| 479 |
generation_section["audio_format"],
|
| 480 |
generation_section["lm_temperature"],
|
| 481 |
generation_section["think_checkbox"],
|
|
@@ -662,6 +696,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
|
|
| 662 |
generation_section["cfg_interval_start"],
|
| 663 |
generation_section["cfg_interval_end"],
|
| 664 |
generation_section["shift"],
|
|
|
|
| 665 |
generation_section["audio_format"],
|
| 666 |
generation_section["lm_temperature"],
|
| 667 |
generation_section["think_checkbox"],
|
|
|
|
| 190 |
outputs=[generation_section["lyrics"]]
|
| 191 |
)
|
| 192 |
|
| 193 |
+
# ========== Format Button ==========
|
| 194 |
+
# Note: cfg_scale and negative_prompt are not supported in format mode
|
| 195 |
+
generation_section["format_btn"].click(
|
| 196 |
+
fn=lambda caption, lyrics, bpm, duration, key_scale, time_sig, temp, top_k, top_p, debug: gen_h.handle_format_sample(
|
| 197 |
+
llm_handler, caption, lyrics, bpm, duration, key_scale, time_sig, temp, top_k, top_p, debug
|
| 198 |
+
),
|
| 199 |
+
inputs=[
|
| 200 |
+
generation_section["captions"],
|
| 201 |
+
generation_section["lyrics"],
|
| 202 |
+
generation_section["bpm"],
|
| 203 |
+
generation_section["audio_duration"],
|
| 204 |
+
generation_section["key_scale"],
|
| 205 |
+
generation_section["time_signature"],
|
| 206 |
+
generation_section["lm_temperature"],
|
| 207 |
+
generation_section["lm_top_k"],
|
| 208 |
+
generation_section["lm_top_p"],
|
| 209 |
+
generation_section["constrained_decoding_debug"],
|
| 210 |
+
],
|
| 211 |
+
outputs=[
|
| 212 |
+
generation_section["captions"],
|
| 213 |
+
generation_section["lyrics"],
|
| 214 |
+
generation_section["bpm"],
|
| 215 |
+
generation_section["audio_duration"],
|
| 216 |
+
generation_section["key_scale"],
|
| 217 |
+
generation_section["vocal_language"],
|
| 218 |
+
generation_section["time_signature"],
|
| 219 |
+
results_section["is_format_caption_state"],
|
| 220 |
+
results_section["status_output"],
|
| 221 |
+
]
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
# ========== Simple/Custom Mode Toggle ==========
|
| 225 |
generation_section["generation_mode"].change(
|
| 226 |
fn=gen_h.handle_generation_mode_change,
|
|
|
|
| 276 |
generation_section["audio_duration"],
|
| 277 |
generation_section["key_scale"],
|
| 278 |
generation_section["vocal_language"],
|
| 279 |
+
generation_section["simple_vocal_language"],
|
| 280 |
generation_section["time_signature"],
|
| 281 |
generation_section["instrumental_checkbox"],
|
| 282 |
generation_section["caption_accordion"],
|
|
|
|
| 311 |
generation_section["cfg_interval_start"],
|
| 312 |
generation_section["cfg_interval_end"],
|
| 313 |
generation_section["shift"],
|
| 314 |
+
generation_section["infer_method"],
|
| 315 |
generation_section["audio_format"],
|
| 316 |
generation_section["lm_temperature"],
|
| 317 |
generation_section["lm_cfg_scale"],
|
|
|
|
| 509 |
generation_section["cfg_interval_start"],
|
| 510 |
generation_section["cfg_interval_end"],
|
| 511 |
generation_section["shift"],
|
| 512 |
+
generation_section["infer_method"],
|
| 513 |
generation_section["audio_format"],
|
| 514 |
generation_section["lm_temperature"],
|
| 515 |
generation_section["think_checkbox"],
|
|
|
|
| 696 |
generation_section["cfg_interval_start"],
|
| 697 |
generation_section["cfg_interval_end"],
|
| 698 |
generation_section["shift"],
|
| 699 |
+
generation_section["infer_method"],
|
| 700 |
generation_section["audio_format"],
|
| 701 |
generation_section["lm_temperature"],
|
| 702 |
generation_section["think_checkbox"],
|
acestep/gradio_ui/events/generation_handlers.py
CHANGED
|
@@ -13,7 +13,7 @@ from acestep.constants import (
|
|
| 13 |
TASK_TYPES_BASE,
|
| 14 |
)
|
| 15 |
from acestep.gradio_ui.i18n import t
|
| 16 |
-
from acestep.inference import understand_music, create_sample
|
| 17 |
|
| 18 |
|
| 19 |
def load_metadata(file_obj):
|
|
@@ -86,6 +86,7 @@ def load_metadata(file_obj):
|
|
| 86 |
track_name = metadata.get('track_name')
|
| 87 |
complete_track_classes = metadata.get('complete_track_classes', [])
|
| 88 |
shift = metadata.get('shift', 3.0) # Default 3.0 for base models
|
|
|
|
| 89 |
instrumental = metadata.get('instrumental', False) # Added: read instrumental
|
| 90 |
|
| 91 |
gr.Info(t("messages.params_loaded", filename=os.path.basename(filepath)))
|
|
@@ -93,7 +94,7 @@ def load_metadata(file_obj):
|
|
| 93 |
return (
|
| 94 |
task_type, captions, lyrics, vocal_language, bpm, key_scale, time_signature,
|
| 95 |
audio_duration, batch_size, inference_steps, guidance_scale, seed, random_seed,
|
| 96 |
-
use_adg, cfg_interval_start, cfg_interval_end, shift, audio_format,
|
| 97 |
lm_temperature, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
|
| 98 |
use_cot_metas, use_cot_caption, use_cot_language, audio_cover_strength,
|
| 99 |
think, audio_codes, repainting_start, repainting_end,
|
|
@@ -103,10 +104,10 @@ def load_metadata(file_obj):
|
|
| 103 |
|
| 104 |
except json.JSONDecodeError as e:
|
| 105 |
gr.Warning(t("messages.invalid_json", error=str(e)))
|
| 106 |
-
return [None] *
|
| 107 |
except Exception as e:
|
| 108 |
gr.Warning(t("messages.load_error", error=str(e)))
|
| 109 |
-
return [None] *
|
| 110 |
|
| 111 |
|
| 112 |
def load_random_example(task_type: str):
|
|
@@ -256,7 +257,7 @@ def sample_example_smart(llm_handler, task_type: str, constrained_decoding_debug
|
|
| 256 |
|
| 257 |
def load_random_simple_description():
|
| 258 |
"""Load a random description from the simple_mode examples directory.
|
| 259 |
-
|
| 260 |
Returns:
|
| 261 |
Tuple of (description, instrumental, vocal_language) for updating UI components
|
| 262 |
"""
|
|
@@ -265,39 +266,39 @@ def load_random_simple_description():
|
|
| 265 |
current_file = os.path.abspath(__file__)
|
| 266 |
# This file is in acestep/gradio_ui/events/, need 4 levels up to reach project root
|
| 267 |
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file))))
|
| 268 |
-
|
| 269 |
# Construct the examples directory path
|
| 270 |
examples_dir = os.path.join(project_root, "examples", "simple_mode")
|
| 271 |
-
|
| 272 |
# Check if directory exists
|
| 273 |
if not os.path.exists(examples_dir):
|
| 274 |
gr.Warning(t("messages.simple_examples_not_found"))
|
| 275 |
return gr.update(), gr.update(), gr.update()
|
| 276 |
-
|
| 277 |
# Find all JSON files in the directory
|
| 278 |
json_files = glob.glob(os.path.join(examples_dir, "*.json"))
|
| 279 |
-
|
| 280 |
if not json_files:
|
| 281 |
gr.Warning(t("messages.simple_examples_empty"))
|
| 282 |
return gr.update(), gr.update(), gr.update()
|
| 283 |
-
|
| 284 |
# Randomly select one file
|
| 285 |
selected_file = random.choice(json_files)
|
| 286 |
-
|
| 287 |
# Read and parse JSON
|
| 288 |
try:
|
| 289 |
with open(selected_file, 'r', encoding='utf-8') as f:
|
| 290 |
data = json.load(f)
|
| 291 |
-
|
| 292 |
# Extract fields
|
| 293 |
description = data.get('description', '')
|
| 294 |
instrumental = data.get('instrumental', False)
|
| 295 |
-
vocal_language = data.get('vocal_language',
|
| 296 |
-
|
| 297 |
-
# Ensure vocal_language is a
|
| 298 |
-
if isinstance(vocal_language,
|
| 299 |
-
vocal_language =
|
| 300 |
-
|
| 301 |
gr.Info(t("messages.simple_example_loaded", filename=os.path.basename(selected_file)))
|
| 302 |
return description, instrumental, vocal_language
|
| 303 |
|
|
@@ -564,7 +565,7 @@ def handle_instrumental_checkbox(instrumental_checked, current_lyrics):
|
|
| 564 |
def handle_simple_instrumental_change(is_instrumental: bool):
|
| 565 |
"""
|
| 566 |
Handle simple mode instrumental checkbox changes.
|
| 567 |
-
When checked: set vocal_language to
|
| 568 |
When unchecked: enable vocal_language editing.
|
| 569 |
|
| 570 |
Args:
|
|
@@ -574,7 +575,7 @@ def handle_simple_instrumental_change(is_instrumental: bool):
|
|
| 574 |
gr.update for simple_vocal_language dropdown
|
| 575 |
"""
|
| 576 |
if is_instrumental:
|
| 577 |
-
return gr.update(value=
|
| 578 |
else:
|
| 579 |
return gr.update(interactive=True)
|
| 580 |
|
|
@@ -653,7 +654,7 @@ def handle_create_sample(
|
|
| 653 |
llm_handler,
|
| 654 |
query: str,
|
| 655 |
instrumental: bool,
|
| 656 |
-
vocal_language:
|
| 657 |
lm_temperature: float,
|
| 658 |
lm_top_k: int,
|
| 659 |
lm_top_p: float,
|
|
@@ -671,7 +672,7 @@ def handle_create_sample(
|
|
| 671 |
llm_handler: LLM handler instance
|
| 672 |
query: User's natural language music description
|
| 673 |
instrumental: Whether to generate instrumental music
|
| 674 |
-
vocal_language:
|
| 675 |
lm_temperature: LLM temperature for generation
|
| 676 |
lm_top_k: LLM top-k sampling
|
| 677 |
lm_top_p: LLM top-p sampling
|
|
@@ -695,27 +696,6 @@ def handle_create_sample(
|
|
| 695 |
- is_format_caption_state (True)
|
| 696 |
- status_output
|
| 697 |
"""
|
| 698 |
-
# Validate query
|
| 699 |
-
if not query or not query.strip():
|
| 700 |
-
gr.Warning(t("messages.empty_query"))
|
| 701 |
-
return (
|
| 702 |
-
gr.update(), # captions - no change
|
| 703 |
-
gr.update(), # lyrics - no change
|
| 704 |
-
gr.update(), # bpm - no change
|
| 705 |
-
gr.update(), # audio_duration - no change
|
| 706 |
-
gr.update(), # key_scale - no change
|
| 707 |
-
gr.update(), # vocal_language - no change
|
| 708 |
-
gr.update(), # time_signature - no change
|
| 709 |
-
gr.update(), # instrumental_checkbox - no change
|
| 710 |
-
gr.update(), # caption_accordion - no change
|
| 711 |
-
gr.update(), # lyrics_accordion - no change
|
| 712 |
-
gr.update(interactive=False), # generate_btn - keep disabled
|
| 713 |
-
False, # simple_sample_created - still False
|
| 714 |
-
gr.update(), # think_checkbox - no change
|
| 715 |
-
gr.update(), # is_format_caption_state - no change
|
| 716 |
-
t("messages.empty_query"), # status_output
|
| 717 |
-
)
|
| 718 |
-
|
| 719 |
# Check if LLM is initialized
|
| 720 |
if not llm_handler.llm_initialized:
|
| 721 |
gr.Warning(t("messages.lm_not_initialized"))
|
|
@@ -765,6 +745,7 @@ def handle_create_sample(
|
|
| 765 |
gr.update(), # audio_duration - no change
|
| 766 |
gr.update(), # key_scale - no change
|
| 767 |
gr.update(), # vocal_language - no change
|
|
|
|
| 768 |
gr.update(), # time_signature - no change
|
| 769 |
gr.update(), # instrumental_checkbox - no change
|
| 770 |
gr.update(), # caption_accordion - no change
|
|
@@ -786,6 +767,7 @@ def handle_create_sample(
|
|
| 786 |
result.duration if result.duration and result.duration > 0 else -1, # audio_duration
|
| 787 |
result.keyscale, # key_scale
|
| 788 |
result.language, # vocal_language
|
|
|
|
| 789 |
result.timesignature, # time_signature
|
| 790 |
result.instrumental, # instrumental_checkbox
|
| 791 |
gr.update(open=True), # caption_accordion - expand
|
|
@@ -798,3 +780,125 @@ def handle_create_sample(
|
|
| 798 |
)
|
| 799 |
|
| 800 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
TASK_TYPES_BASE,
|
| 14 |
)
|
| 15 |
from acestep.gradio_ui.i18n import t
|
| 16 |
+
from acestep.inference import understand_music, create_sample, format_sample
|
| 17 |
|
| 18 |
|
| 19 |
def load_metadata(file_obj):
|
|
|
|
| 86 |
track_name = metadata.get('track_name')
|
| 87 |
complete_track_classes = metadata.get('complete_track_classes', [])
|
| 88 |
shift = metadata.get('shift', 3.0) # Default 3.0 for base models
|
| 89 |
+
infer_method = metadata.get('infer_method', 'ode') # Default 'ode' for diffusion inference
|
| 90 |
instrumental = metadata.get('instrumental', False) # Added: read instrumental
|
| 91 |
|
| 92 |
gr.Info(t("messages.params_loaded", filename=os.path.basename(filepath)))
|
|
|
|
| 94 |
return (
|
| 95 |
task_type, captions, lyrics, vocal_language, bpm, key_scale, time_signature,
|
| 96 |
audio_duration, batch_size, inference_steps, guidance_scale, seed, random_seed,
|
| 97 |
+
use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method, audio_format,
|
| 98 |
lm_temperature, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
|
| 99 |
use_cot_metas, use_cot_caption, use_cot_language, audio_cover_strength,
|
| 100 |
think, audio_codes, repainting_start, repainting_end,
|
|
|
|
| 104 |
|
| 105 |
except json.JSONDecodeError as e:
|
| 106 |
gr.Warning(t("messages.invalid_json", error=str(e)))
|
| 107 |
+
return [None] * 35 + [False]
|
| 108 |
except Exception as e:
|
| 109 |
gr.Warning(t("messages.load_error", error=str(e)))
|
| 110 |
+
return [None] * 35 + [False]
|
| 111 |
|
| 112 |
|
| 113 |
def load_random_example(task_type: str):
|
|
|
|
| 257 |
|
| 258 |
def load_random_simple_description():
|
| 259 |
"""Load a random description from the simple_mode examples directory.
|
| 260 |
+
|
| 261 |
Returns:
|
| 262 |
Tuple of (description, instrumental, vocal_language) for updating UI components
|
| 263 |
"""
|
|
|
|
| 266 |
current_file = os.path.abspath(__file__)
|
| 267 |
# This file is in acestep/gradio_ui/events/, need 4 levels up to reach project root
|
| 268 |
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file))))
|
| 269 |
+
|
| 270 |
# Construct the examples directory path
|
| 271 |
examples_dir = os.path.join(project_root, "examples", "simple_mode")
|
| 272 |
+
|
| 273 |
# Check if directory exists
|
| 274 |
if not os.path.exists(examples_dir):
|
| 275 |
gr.Warning(t("messages.simple_examples_not_found"))
|
| 276 |
return gr.update(), gr.update(), gr.update()
|
| 277 |
+
|
| 278 |
# Find all JSON files in the directory
|
| 279 |
json_files = glob.glob(os.path.join(examples_dir, "*.json"))
|
| 280 |
+
|
| 281 |
if not json_files:
|
| 282 |
gr.Warning(t("messages.simple_examples_empty"))
|
| 283 |
return gr.update(), gr.update(), gr.update()
|
| 284 |
+
|
| 285 |
# Randomly select one file
|
| 286 |
selected_file = random.choice(json_files)
|
| 287 |
+
|
| 288 |
# Read and parse JSON
|
| 289 |
try:
|
| 290 |
with open(selected_file, 'r', encoding='utf-8') as f:
|
| 291 |
data = json.load(f)
|
| 292 |
+
|
| 293 |
# Extract fields
|
| 294 |
description = data.get('description', '')
|
| 295 |
instrumental = data.get('instrumental', False)
|
| 296 |
+
vocal_language = data.get('vocal_language', 'unknown')
|
| 297 |
+
|
| 298 |
+
# Ensure vocal_language is a string
|
| 299 |
+
if isinstance(vocal_language, list):
|
| 300 |
+
vocal_language = vocal_language[0] if vocal_language else 'unknown'
|
| 301 |
+
|
| 302 |
gr.Info(t("messages.simple_example_loaded", filename=os.path.basename(selected_file)))
|
| 303 |
return description, instrumental, vocal_language
|
| 304 |
|
|
|
|
| 565 |
def handle_simple_instrumental_change(is_instrumental: bool):
|
| 566 |
"""
|
| 567 |
Handle simple mode instrumental checkbox changes.
|
| 568 |
+
When checked: set vocal_language to "unknown" and disable editing.
|
| 569 |
When unchecked: enable vocal_language editing.
|
| 570 |
|
| 571 |
Args:
|
|
|
|
| 575 |
gr.update for simple_vocal_language dropdown
|
| 576 |
"""
|
| 577 |
if is_instrumental:
|
| 578 |
+
return gr.update(value="unknown", interactive=False)
|
| 579 |
else:
|
| 580 |
return gr.update(interactive=True)
|
| 581 |
|
|
|
|
| 654 |
llm_handler,
|
| 655 |
query: str,
|
| 656 |
instrumental: bool,
|
| 657 |
+
vocal_language: str,
|
| 658 |
lm_temperature: float,
|
| 659 |
lm_top_k: int,
|
| 660 |
lm_top_p: float,
|
|
|
|
| 672 |
llm_handler: LLM handler instance
|
| 673 |
query: User's natural language music description
|
| 674 |
instrumental: Whether to generate instrumental music
|
| 675 |
+
vocal_language: Preferred vocal language for constrained decoding
|
| 676 |
lm_temperature: LLM temperature for generation
|
| 677 |
lm_top_k: LLM top-k sampling
|
| 678 |
lm_top_p: LLM top-p sampling
|
|
|
|
| 696 |
- is_format_caption_state (True)
|
| 697 |
- status_output
|
| 698 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 699 |
# Check if LLM is initialized
|
| 700 |
if not llm_handler.llm_initialized:
|
| 701 |
gr.Warning(t("messages.lm_not_initialized"))
|
|
|
|
| 745 |
gr.update(), # audio_duration - no change
|
| 746 |
gr.update(), # key_scale - no change
|
| 747 |
gr.update(), # vocal_language - no change
|
| 748 |
+
gr.update(), # simple vocal_language - no change
|
| 749 |
gr.update(), # time_signature - no change
|
| 750 |
gr.update(), # instrumental_checkbox - no change
|
| 751 |
gr.update(), # caption_accordion - no change
|
|
|
|
| 767 |
result.duration if result.duration and result.duration > 0 else -1, # audio_duration
|
| 768 |
result.keyscale, # key_scale
|
| 769 |
result.language, # vocal_language
|
| 770 |
+
result.language, # simple vocal_language
|
| 771 |
result.timesignature, # time_signature
|
| 772 |
result.instrumental, # instrumental_checkbox
|
| 773 |
gr.update(open=True), # caption_accordion - expand
|
|
|
|
| 780 |
)
|
| 781 |
|
| 782 |
|
| 783 |
+
def handle_format_sample(
|
| 784 |
+
llm_handler,
|
| 785 |
+
caption: str,
|
| 786 |
+
lyrics: str,
|
| 787 |
+
bpm,
|
| 788 |
+
audio_duration,
|
| 789 |
+
key_scale: str,
|
| 790 |
+
time_signature: str,
|
| 791 |
+
lm_temperature: float,
|
| 792 |
+
lm_top_k: int,
|
| 793 |
+
lm_top_p: float,
|
| 794 |
+
constrained_decoding_debug: bool = False,
|
| 795 |
+
):
|
| 796 |
+
"""
|
| 797 |
+
Handle the Format button click to format caption and lyrics.
|
| 798 |
+
|
| 799 |
+
Takes user-provided caption and lyrics, and uses the LLM to generate
|
| 800 |
+
structured music metadata and an enhanced description.
|
| 801 |
+
|
| 802 |
+
Note: cfg_scale and negative_prompt are not supported in format mode.
|
| 803 |
+
|
| 804 |
+
Args:
|
| 805 |
+
llm_handler: LLM handler instance
|
| 806 |
+
caption: User's caption/description
|
| 807 |
+
lyrics: User's lyrics
|
| 808 |
+
bpm: User-provided BPM (optional, for constrained decoding)
|
| 809 |
+
audio_duration: User-provided duration (optional, for constrained decoding)
|
| 810 |
+
key_scale: User-provided key scale (optional, for constrained decoding)
|
| 811 |
+
time_signature: User-provided time signature (optional, for constrained decoding)
|
| 812 |
+
lm_temperature: LLM temperature for generation
|
| 813 |
+
lm_top_k: LLM top-k sampling
|
| 814 |
+
lm_top_p: LLM top-p sampling
|
| 815 |
+
constrained_decoding_debug: Whether to enable debug logging
|
| 816 |
+
|
| 817 |
+
Returns:
|
| 818 |
+
Tuple of updates for:
|
| 819 |
+
- captions
|
| 820 |
+
- lyrics
|
| 821 |
+
- bpm
|
| 822 |
+
- audio_duration
|
| 823 |
+
- key_scale
|
| 824 |
+
- vocal_language
|
| 825 |
+
- time_signature
|
| 826 |
+
- is_format_caption_state
|
| 827 |
+
- status_output
|
| 828 |
+
"""
|
| 829 |
+
# Check if LLM is initialized
|
| 830 |
+
if not llm_handler.llm_initialized:
|
| 831 |
+
gr.Warning(t("messages.lm_not_initialized"))
|
| 832 |
+
return (
|
| 833 |
+
gr.update(), # captions - no change
|
| 834 |
+
gr.update(), # lyrics - no change
|
| 835 |
+
gr.update(), # bpm - no change
|
| 836 |
+
gr.update(), # audio_duration - no change
|
| 837 |
+
gr.update(), # key_scale - no change
|
| 838 |
+
gr.update(), # vocal_language - no change
|
| 839 |
+
gr.update(), # time_signature - no change
|
| 840 |
+
gr.update(), # is_format_caption_state - no change
|
| 841 |
+
t("messages.lm_not_initialized"), # status_output
|
| 842 |
+
)
|
| 843 |
+
|
| 844 |
+
# Build user_metadata from provided values for constrained decoding
|
| 845 |
+
user_metadata = {}
|
| 846 |
+
if bpm is not None and bpm > 0:
|
| 847 |
+
user_metadata['bpm'] = int(bpm)
|
| 848 |
+
if audio_duration is not None and audio_duration > 0:
|
| 849 |
+
user_metadata['duration'] = int(audio_duration)
|
| 850 |
+
if key_scale and key_scale.strip():
|
| 851 |
+
user_metadata['keyscale'] = key_scale.strip()
|
| 852 |
+
if time_signature and time_signature.strip():
|
| 853 |
+
user_metadata['timesignature'] = time_signature.strip()
|
| 854 |
+
|
| 855 |
+
# Only pass user_metadata if we have at least one field
|
| 856 |
+
user_metadata_to_pass = user_metadata if user_metadata else None
|
| 857 |
+
|
| 858 |
+
# Convert LM parameters
|
| 859 |
+
top_k_value = None if not lm_top_k or lm_top_k == 0 else int(lm_top_k)
|
| 860 |
+
top_p_value = None if not lm_top_p or lm_top_p >= 1.0 else lm_top_p
|
| 861 |
+
|
| 862 |
+
# Call format_sample API
|
| 863 |
+
result = format_sample(
|
| 864 |
+
llm_handler=llm_handler,
|
| 865 |
+
caption=caption,
|
| 866 |
+
lyrics=lyrics,
|
| 867 |
+
user_metadata=user_metadata_to_pass,
|
| 868 |
+
temperature=lm_temperature,
|
| 869 |
+
top_k=top_k_value,
|
| 870 |
+
top_p=top_p_value,
|
| 871 |
+
use_constrained_decoding=True,
|
| 872 |
+
constrained_decoding_debug=constrained_decoding_debug,
|
| 873 |
+
)
|
| 874 |
+
|
| 875 |
+
# Handle error
|
| 876 |
+
if not result.success:
|
| 877 |
+
gr.Warning(result.status_message or t("messages.format_failed"))
|
| 878 |
+
return (
|
| 879 |
+
gr.update(), # captions - no change
|
| 880 |
+
gr.update(), # lyrics - no change
|
| 881 |
+
gr.update(), # bpm - no change
|
| 882 |
+
gr.update(), # audio_duration - no change
|
| 883 |
+
gr.update(), # key_scale - no change
|
| 884 |
+
gr.update(), # vocal_language - no change
|
| 885 |
+
gr.update(), # time_signature - no change
|
| 886 |
+
gr.update(), # is_format_caption_state - no change
|
| 887 |
+
result.status_message or t("messages.format_failed"), # status_output
|
| 888 |
+
)
|
| 889 |
+
|
| 890 |
+
# Success - populate fields
|
| 891 |
+
gr.Info(t("messages.format_success"))
|
| 892 |
+
|
| 893 |
+
return (
|
| 894 |
+
result.caption, # captions
|
| 895 |
+
result.lyrics, # lyrics
|
| 896 |
+
result.bpm, # bpm
|
| 897 |
+
result.duration if result.duration and result.duration > 0 else -1, # audio_duration
|
| 898 |
+
result.keyscale, # key_scale
|
| 899 |
+
result.language, # vocal_language
|
| 900 |
+
result.timesignature, # time_signature
|
| 901 |
+
True, # is_format_caption_state - True (LM-formatted)
|
| 902 |
+
result.status_message, # status_output
|
| 903 |
+
)
|
| 904 |
+
|
acestep/gradio_ui/events/results_handlers.py
CHANGED
|
@@ -452,7 +452,7 @@ def generate_with_progress(
|
|
| 452 |
reference_audio, audio_duration, batch_size_input, src_audio,
|
| 453 |
text2music_audio_code_string, repainting_start, repainting_end,
|
| 454 |
instruction_display_gen, audio_cover_strength, task_type,
|
| 455 |
-
use_adg, cfg_interval_start, cfg_interval_end, shift, audio_format, lm_temperature,
|
| 456 |
think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
|
| 457 |
use_cot_metas, use_cot_caption, use_cot_language, is_format_caption,
|
| 458 |
constrained_decoding_debug,
|
|
@@ -465,6 +465,14 @@ def generate_with_progress(
|
|
| 465 |
):
|
| 466 |
"""Generate audio with progress tracking"""
|
| 467 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
# step 1: prepare inputs
|
| 469 |
# generate_music, GenerationParams, GenerationConfig
|
| 470 |
gen_params = GenerationParams(
|
|
@@ -487,6 +495,7 @@ def generate_with_progress(
|
|
| 487 |
cfg_interval_start=cfg_interval_start,
|
| 488 |
cfg_interval_end=cfg_interval_end,
|
| 489 |
shift=shift,
|
|
|
|
| 490 |
repainting_start=repainting_start,
|
| 491 |
repainting_end=repainting_end,
|
| 492 |
audio_cover_strength=audio_cover_strength,
|
|
@@ -496,7 +505,7 @@ def generate_with_progress(
|
|
| 496 |
lm_top_k=lm_top_k,
|
| 497 |
lm_top_p=lm_top_p,
|
| 498 |
lm_negative_prompt=lm_negative_prompt,
|
| 499 |
-
use_cot_metas=
|
| 500 |
use_cot_caption=use_cot_caption,
|
| 501 |
use_cot_language=use_cot_language,
|
| 502 |
use_constrained_decoding=True,
|
|
@@ -587,7 +596,7 @@ def generate_with_progress(
|
|
| 587 |
# Clear lrc_display with empty string - this triggers .change() to clear subtitles
|
| 588 |
clear_lrcs = [gr.update(value="", visible=True) for _ in range(8)]
|
| 589 |
clear_accordions = [gr.skip() for _ in range(8)] # Don't change accordion visibility
|
| 590 |
-
dump_audio = [None for _ in range(8)]
|
| 591 |
yield (
|
| 592 |
# Audio outputs - just skip, value will be updated in loop
|
| 593 |
# Subtitles will be cleared via lrc_display.change()
|
|
@@ -1302,7 +1311,7 @@ def capture_current_params(
|
|
| 1302 |
reference_audio, audio_duration, batch_size_input, src_audio,
|
| 1303 |
text2music_audio_code_string, repainting_start, repainting_end,
|
| 1304 |
instruction_display_gen, audio_cover_strength, task_type,
|
| 1305 |
-
use_adg, cfg_interval_start, cfg_interval_end, shift, audio_format, lm_temperature,
|
| 1306 |
think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
|
| 1307 |
use_cot_metas, use_cot_caption, use_cot_language,
|
| 1308 |
constrained_decoding_debug, allow_lm_batch, auto_score, auto_lrc, score_scale, lm_batch_chunk_size,
|
|
@@ -1339,6 +1348,7 @@ def capture_current_params(
|
|
| 1339 |
"cfg_interval_start": cfg_interval_start,
|
| 1340 |
"cfg_interval_end": cfg_interval_end,
|
| 1341 |
"shift": shift,
|
|
|
|
| 1342 |
"audio_format": audio_format,
|
| 1343 |
"lm_temperature": lm_temperature,
|
| 1344 |
"think_checkbox": think_checkbox,
|
|
@@ -1367,7 +1377,7 @@ def generate_with_batch_management(
|
|
| 1367 |
reference_audio, audio_duration, batch_size_input, src_audio,
|
| 1368 |
text2music_audio_code_string, repainting_start, repainting_end,
|
| 1369 |
instruction_display_gen, audio_cover_strength, task_type,
|
| 1370 |
-
use_adg, cfg_interval_start, cfg_interval_end, shift, audio_format, lm_temperature,
|
| 1371 |
think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
|
| 1372 |
use_cot_metas, use_cot_caption, use_cot_language, is_format_caption,
|
| 1373 |
constrained_decoding_debug,
|
|
@@ -1396,7 +1406,7 @@ def generate_with_batch_management(
|
|
| 1396 |
reference_audio, audio_duration, batch_size_input, src_audio,
|
| 1397 |
text2music_audio_code_string, repainting_start, repainting_end,
|
| 1398 |
instruction_display_gen, audio_cover_strength, task_type,
|
| 1399 |
-
use_adg, cfg_interval_start, cfg_interval_end, shift, audio_format, lm_temperature,
|
| 1400 |
think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
|
| 1401 |
use_cot_metas, use_cot_caption, use_cot_language, is_format_caption,
|
| 1402 |
constrained_decoding_debug,
|
|
@@ -1476,6 +1486,7 @@ def generate_with_batch_management(
|
|
| 1476 |
"cfg_interval_start": cfg_interval_start,
|
| 1477 |
"cfg_interval_end": cfg_interval_end,
|
| 1478 |
"shift": shift,
|
|
|
|
| 1479 |
"audio_format": audio_format,
|
| 1480 |
"lm_temperature": lm_temperature,
|
| 1481 |
"think_checkbox": think_checkbox,
|
|
@@ -1661,6 +1672,7 @@ def generate_next_batch_background(
|
|
| 1661 |
params.setdefault("cfg_interval_start", 0.0)
|
| 1662 |
params.setdefault("cfg_interval_end", 1.0)
|
| 1663 |
params.setdefault("shift", 1.0)
|
|
|
|
| 1664 |
params.setdefault("audio_format", "mp3")
|
| 1665 |
params.setdefault("lm_temperature", 0.85)
|
| 1666 |
params.setdefault("think_checkbox", True)
|
|
@@ -1682,6 +1694,8 @@ def generate_next_batch_background(
|
|
| 1682 |
|
| 1683 |
# Call generate_with_progress with the saved parameters
|
| 1684 |
# Note: generate_with_progress is a generator, need to iterate through it
|
|
|
|
|
|
|
| 1685 |
generator = generate_with_progress(
|
| 1686 |
dit_handler,
|
| 1687 |
llm_handler,
|
|
@@ -1709,6 +1723,7 @@ def generate_next_batch_background(
|
|
| 1709 |
cfg_interval_start=params.get("cfg_interval_start"),
|
| 1710 |
cfg_interval_end=params.get("cfg_interval_end"),
|
| 1711 |
shift=params.get("shift"),
|
|
|
|
| 1712 |
audio_format=params.get("audio_format"),
|
| 1713 |
lm_temperature=params.get("lm_temperature"),
|
| 1714 |
think_checkbox=params.get("think_checkbox"),
|
|
@@ -1719,7 +1734,7 @@ def generate_next_batch_background(
|
|
| 1719 |
use_cot_metas=params.get("use_cot_metas"),
|
| 1720 |
use_cot_caption=params.get("use_cot_caption"),
|
| 1721 |
use_cot_language=params.get("use_cot_language"),
|
| 1722 |
-
is_format_caption=is_format_caption,
|
| 1723 |
constrained_decoding_debug=params.get("constrained_decoding_debug"),
|
| 1724 |
allow_lm_batch=params.get("allow_lm_batch"),
|
| 1725 |
auto_score=params.get("auto_score"),
|
|
|
|
| 452 |
reference_audio, audio_duration, batch_size_input, src_audio,
|
| 453 |
text2music_audio_code_string, repainting_start, repainting_end,
|
| 454 |
instruction_display_gen, audio_cover_strength, task_type,
|
| 455 |
+
use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method, audio_format, lm_temperature,
|
| 456 |
think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
|
| 457 |
use_cot_metas, use_cot_caption, use_cot_language, is_format_caption,
|
| 458 |
constrained_decoding_debug,
|
|
|
|
| 465 |
):
|
| 466 |
"""Generate audio with progress tracking"""
|
| 467 |
|
| 468 |
+
# Skip Phase 1 metas COT if sample is already formatted (from LLM/file/random)
|
| 469 |
+
# This avoids redundant LLM calls since metas (bpm, keyscale, etc.) are already generated
|
| 470 |
+
actual_use_cot_metas = use_cot_metas
|
| 471 |
+
if is_format_caption and use_cot_metas:
|
| 472 |
+
actual_use_cot_metas = False
|
| 473 |
+
logger.info("[generate_with_progress] Skipping Phase 1 metas COT: sample is already formatted (is_format_caption=True)")
|
| 474 |
+
gr.Info(t("messages.skipping_metas_cot"))
|
| 475 |
+
|
| 476 |
# step 1: prepare inputs
|
| 477 |
# generate_music, GenerationParams, GenerationConfig
|
| 478 |
gen_params = GenerationParams(
|
|
|
|
| 495 |
cfg_interval_start=cfg_interval_start,
|
| 496 |
cfg_interval_end=cfg_interval_end,
|
| 497 |
shift=shift,
|
| 498 |
+
infer_method=infer_method,
|
| 499 |
repainting_start=repainting_start,
|
| 500 |
repainting_end=repainting_end,
|
| 501 |
audio_cover_strength=audio_cover_strength,
|
|
|
|
| 505 |
lm_top_k=lm_top_k,
|
| 506 |
lm_top_p=lm_top_p,
|
| 507 |
lm_negative_prompt=lm_negative_prompt,
|
| 508 |
+
use_cot_metas=actual_use_cot_metas,
|
| 509 |
use_cot_caption=use_cot_caption,
|
| 510 |
use_cot_language=use_cot_language,
|
| 511 |
use_constrained_decoding=True,
|
|
|
|
| 596 |
# Clear lrc_display with empty string - this triggers .change() to clear subtitles
|
| 597 |
clear_lrcs = [gr.update(value="", visible=True) for _ in range(8)]
|
| 598 |
clear_accordions = [gr.skip() for _ in range(8)] # Don't change accordion visibility
|
| 599 |
+
dump_audio = [gr.update(value=None, subtitles=None) for _ in range(8)]
|
| 600 |
yield (
|
| 601 |
# Audio outputs - just skip, value will be updated in loop
|
| 602 |
# Subtitles will be cleared via lrc_display.change()
|
|
|
|
| 1311 |
reference_audio, audio_duration, batch_size_input, src_audio,
|
| 1312 |
text2music_audio_code_string, repainting_start, repainting_end,
|
| 1313 |
instruction_display_gen, audio_cover_strength, task_type,
|
| 1314 |
+
use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method, audio_format, lm_temperature,
|
| 1315 |
think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
|
| 1316 |
use_cot_metas, use_cot_caption, use_cot_language,
|
| 1317 |
constrained_decoding_debug, allow_lm_batch, auto_score, auto_lrc, score_scale, lm_batch_chunk_size,
|
|
|
|
| 1348 |
"cfg_interval_start": cfg_interval_start,
|
| 1349 |
"cfg_interval_end": cfg_interval_end,
|
| 1350 |
"shift": shift,
|
| 1351 |
+
"infer_method": infer_method,
|
| 1352 |
"audio_format": audio_format,
|
| 1353 |
"lm_temperature": lm_temperature,
|
| 1354 |
"think_checkbox": think_checkbox,
|
|
|
|
| 1377 |
reference_audio, audio_duration, batch_size_input, src_audio,
|
| 1378 |
text2music_audio_code_string, repainting_start, repainting_end,
|
| 1379 |
instruction_display_gen, audio_cover_strength, task_type,
|
| 1380 |
+
use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method, audio_format, lm_temperature,
|
| 1381 |
think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
|
| 1382 |
use_cot_metas, use_cot_caption, use_cot_language, is_format_caption,
|
| 1383 |
constrained_decoding_debug,
|
|
|
|
| 1406 |
reference_audio, audio_duration, batch_size_input, src_audio,
|
| 1407 |
text2music_audio_code_string, repainting_start, repainting_end,
|
| 1408 |
instruction_display_gen, audio_cover_strength, task_type,
|
| 1409 |
+
use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method, audio_format, lm_temperature,
|
| 1410 |
think_checkbox, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
|
| 1411 |
use_cot_metas, use_cot_caption, use_cot_language, is_format_caption,
|
| 1412 |
constrained_decoding_debug,
|
|
|
|
| 1486 |
"cfg_interval_start": cfg_interval_start,
|
| 1487 |
"cfg_interval_end": cfg_interval_end,
|
| 1488 |
"shift": shift,
|
| 1489 |
+
"infer_method": infer_method,
|
| 1490 |
"audio_format": audio_format,
|
| 1491 |
"lm_temperature": lm_temperature,
|
| 1492 |
"think_checkbox": think_checkbox,
|
|
|
|
| 1672 |
params.setdefault("cfg_interval_start", 0.0)
|
| 1673 |
params.setdefault("cfg_interval_end", 1.0)
|
| 1674 |
params.setdefault("shift", 1.0)
|
| 1675 |
+
params.setdefault("infer_method", "ode")
|
| 1676 |
params.setdefault("audio_format", "mp3")
|
| 1677 |
params.setdefault("lm_temperature", 0.85)
|
| 1678 |
params.setdefault("think_checkbox", True)
|
|
|
|
| 1694 |
|
| 1695 |
# Call generate_with_progress with the saved parameters
|
| 1696 |
# Note: generate_with_progress is a generator, need to iterate through it
|
| 1697 |
+
# For AutoGen background batches, always skip metas COT since we want to
|
| 1698 |
+
# generate NEW audio codes with new seeds, not regenerate the same metas
|
| 1699 |
generator = generate_with_progress(
|
| 1700 |
dit_handler,
|
| 1701 |
llm_handler,
|
|
|
|
| 1723 |
cfg_interval_start=params.get("cfg_interval_start"),
|
| 1724 |
cfg_interval_end=params.get("cfg_interval_end"),
|
| 1725 |
shift=params.get("shift"),
|
| 1726 |
+
infer_method=params.get("infer_method"),
|
| 1727 |
audio_format=params.get("audio_format"),
|
| 1728 |
lm_temperature=params.get("lm_temperature"),
|
| 1729 |
think_checkbox=params.get("think_checkbox"),
|
|
|
|
| 1734 |
use_cot_metas=params.get("use_cot_metas"),
|
| 1735 |
use_cot_caption=params.get("use_cot_caption"),
|
| 1736 |
use_cot_language=params.get("use_cot_language"),
|
| 1737 |
+
is_format_caption=is_format_caption, # Pass through - will skip metas COT if True
|
| 1738 |
constrained_decoding_debug=params.get("constrained_decoding_debug"),
|
| 1739 |
allow_lm_batch=params.get("allow_lm_batch"),
|
| 1740 |
auto_score=params.get("auto_score"),
|
acestep/gradio_ui/i18n/en.json
CHANGED
|
@@ -84,7 +84,7 @@
|
|
| 84 |
"mode_simple": "Simple",
|
| 85 |
"mode_custom": "Custom",
|
| 86 |
"simple_query_label": "Song Description",
|
| 87 |
-
"simple_query_placeholder": "Describe the music you want to create, e.g., 'a soft Bengali love song for a quiet evening'",
|
| 88 |
"simple_query_info": "Enter a natural language description of the music you want to generate",
|
| 89 |
"simple_vocal_language_label": "Vocal Language (optional)",
|
| 90 |
"simple_vocal_language_info": "Select preferred language(s) for lyrics. Use 'unknown' for any language.",
|
|
@@ -98,6 +98,7 @@
|
|
| 98 |
"lyrics_placeholder": "[Verse 1]\\nUnder the starry night\\nI feel so alive...",
|
| 99 |
"lyrics_info": "Song lyrics with structure",
|
| 100 |
"instrumental_label": "Instrumental",
|
|
|
|
| 101 |
"optional_params": "⚙️ Optional Parameters",
|
| 102 |
"vocal_language_label": "Vocal Language (optional)",
|
| 103 |
"vocal_language_info": "use `unknown` for inst",
|
|
@@ -127,6 +128,8 @@
|
|
| 127 |
"use_adg_info": "Enable Angle Domain Guidance",
|
| 128 |
"shift_label": "Shift",
|
| 129 |
"shift_info": "Timestep shift factor for base models (range 1.0~5.0, default 3.0). Not effective for turbo models.",
|
|
|
|
|
|
|
| 130 |
"cfg_interval_start": "CFG Interval Start",
|
| 131 |
"cfg_interval_end": "CFG Interval End",
|
| 132 |
"lm_params_title": "🤖 LM Generation Parameters",
|
|
@@ -227,6 +230,9 @@
|
|
| 227 |
"sample_created": "✅ Sample created! Review the caption and lyrics, then click Generate Music.",
|
| 228 |
"simple_examples_not_found": "⚠️ Simple mode examples directory not found.",
|
| 229 |
"simple_examples_empty": "⚠️ No example files found in simple mode examples.",
|
| 230 |
-
"simple_example_loaded": "🎲 Loaded random example from {filename}"
|
|
|
|
|
|
|
|
|
|
| 231 |
}
|
| 232 |
}
|
|
|
|
| 84 |
"mode_simple": "Simple",
|
| 85 |
"mode_custom": "Custom",
|
| 86 |
"simple_query_label": "Song Description",
|
| 87 |
+
"simple_query_placeholder": "Describe the music you want to create, e.g., 'a soft Bengali love song for a quiet evening'. Leave empty for a random sample.",
|
| 88 |
"simple_query_info": "Enter a natural language description of the music you want to generate",
|
| 89 |
"simple_vocal_language_label": "Vocal Language (optional)",
|
| 90 |
"simple_vocal_language_info": "Select preferred language(s) for lyrics. Use 'unknown' for any language.",
|
|
|
|
| 98 |
"lyrics_placeholder": "[Verse 1]\\nUnder the starry night\\nI feel so alive...",
|
| 99 |
"lyrics_info": "Song lyrics with structure",
|
| 100 |
"instrumental_label": "Instrumental",
|
| 101 |
+
"format_btn": "Format",
|
| 102 |
"optional_params": "⚙️ Optional Parameters",
|
| 103 |
"vocal_language_label": "Vocal Language (optional)",
|
| 104 |
"vocal_language_info": "use `unknown` for inst",
|
|
|
|
| 128 |
"use_adg_info": "Enable Angle Domain Guidance",
|
| 129 |
"shift_label": "Shift",
|
| 130 |
"shift_info": "Timestep shift factor for base models (range 1.0~5.0, default 3.0). Not effective for turbo models.",
|
| 131 |
+
"infer_method_label": "Inference Method",
|
| 132 |
+
"infer_method_info": "Diffusion inference method. ODE (Euler) is faster, SDE (stochastic) may produce different results.",
|
| 133 |
"cfg_interval_start": "CFG Interval Start",
|
| 134 |
"cfg_interval_end": "CFG Interval End",
|
| 135 |
"lm_params_title": "🤖 LM Generation Parameters",
|
|
|
|
| 230 |
"sample_created": "✅ Sample created! Review the caption and lyrics, then click Generate Music.",
|
| 231 |
"simple_examples_not_found": "⚠️ Simple mode examples directory not found.",
|
| 232 |
"simple_examples_empty": "⚠️ No example files found in simple mode examples.",
|
| 233 |
+
"simple_example_loaded": "🎲 Loaded random example from {filename}",
|
| 234 |
+
"format_success": "✅ Caption and lyrics formatted successfully",
|
| 235 |
+
"format_failed": "❌ Format failed: {error}",
|
| 236 |
+
"skipping_metas_cot": "⚡ Skipping Phase 1 metas COT (sample already formatted)"
|
| 237 |
}
|
| 238 |
}
|
acestep/gradio_ui/i18n/ja.json
CHANGED
|
@@ -84,7 +84,7 @@
|
|
| 84 |
"mode_simple": "シンプル",
|
| 85 |
"mode_custom": "カスタム",
|
| 86 |
"simple_query_label": "曲の説明",
|
| 87 |
-
"simple_query_placeholder": "作成したい音楽を説明してください。例:'静かな夜のための優しいベンガルのラブソング'",
|
| 88 |
"simple_query_info": "生成したい音楽の自然言語の説明を入力",
|
| 89 |
"simple_vocal_language_label": "ボーカル言語(オプション)",
|
| 90 |
"simple_vocal_language_info": "歌詞の希望言語を選択。任意の言語の場合は'unknown'を使用。",
|
|
@@ -98,6 +98,7 @@
|
|
| 98 |
"lyrics_placeholder": "[バース1]\\n星空の下で\\nとても生きていると感じる...",
|
| 99 |
"lyrics_info": "構造を持つ曲の歌詞",
|
| 100 |
"instrumental_label": "インストゥルメンタル",
|
|
|
|
| 101 |
"optional_params": "⚙️ オプションパラメータ",
|
| 102 |
"vocal_language_label": "ボーカル言語(オプション)",
|
| 103 |
"vocal_language_info": "インストには`unknown`を使用",
|
|
@@ -127,6 +128,8 @@
|
|
| 127 |
"use_adg_info": "角度ドメインガイダンスを有効化",
|
| 128 |
"shift_label": "シフト",
|
| 129 |
"shift_info": "baseモデル用タイムステップシフト係数 (範囲 1.0~5.0、デフォルト 3.0)。turboモデルには無効。",
|
|
|
|
|
|
|
| 130 |
"cfg_interval_start": "CFG 間隔開始",
|
| 131 |
"cfg_interval_end": "CFG 間隔終了",
|
| 132 |
"lm_params_title": "🤖 LM 生成パラメータ",
|
|
@@ -227,6 +230,9 @@
|
|
| 227 |
"sample_created": "✅ サンプルが作成されました!キャプションと歌詞を確認して、音楽を生成をクリックしてください。",
|
| 228 |
"simple_examples_not_found": "⚠️ シンプルモードサンプルディレクトリが見つかりません。",
|
| 229 |
"simple_examples_empty": "⚠️ シンプルモードサンプルにファイルがありません。",
|
| 230 |
-
"simple_example_loaded": "🎲 {filename} からランダムサンプルを読み込みました"
|
|
|
|
|
|
|
|
|
|
| 231 |
}
|
| 232 |
}
|
|
|
|
| 84 |
"mode_simple": "シンプル",
|
| 85 |
"mode_custom": "カスタム",
|
| 86 |
"simple_query_label": "曲の説明",
|
| 87 |
+
"simple_query_placeholder": "作成したい音楽を説明してください。例:'静かな夜のための優しいベンガルのラブソング'。空欄の場合はランダムなサンプルが生成されます。",
|
| 88 |
"simple_query_info": "生成したい音楽の自然言語の説明を入力",
|
| 89 |
"simple_vocal_language_label": "ボーカル言語(オプション)",
|
| 90 |
"simple_vocal_language_info": "歌詞の希望言語を選択。任意の言語の場合は'unknown'を使用。",
|
|
|
|
| 98 |
"lyrics_placeholder": "[バース1]\\n星空の下で\\nとても生きていると感じる...",
|
| 99 |
"lyrics_info": "構造を持つ曲の歌詞",
|
| 100 |
"instrumental_label": "インストゥルメンタル",
|
| 101 |
+
"format_btn": "フォーマット",
|
| 102 |
"optional_params": "⚙️ オプションパラメータ",
|
| 103 |
"vocal_language_label": "ボーカル言語(オプション)",
|
| 104 |
"vocal_language_info": "インストには`unknown`を使用",
|
|
|
|
| 128 |
"use_adg_info": "角度ドメインガイダンスを有効化",
|
| 129 |
"shift_label": "シフト",
|
| 130 |
"shift_info": "baseモデル用タイムステップシフト係数 (範囲 1.0~5.0、デフォルト 3.0)。turboモデルには無効。",
|
| 131 |
+
"infer_method_label": "推論方法",
|
| 132 |
+
"infer_method_info": "拡散推論方法。ODE (オイラー) は高速、SDE (確率的) は異なる結果を生成する可能性があります。",
|
| 133 |
"cfg_interval_start": "CFG 間隔開始",
|
| 134 |
"cfg_interval_end": "CFG 間隔終了",
|
| 135 |
"lm_params_title": "🤖 LM 生成パラメータ",
|
|
|
|
| 230 |
"sample_created": "✅ サンプルが作成されました!キャプションと歌詞を確認して、音楽を生成をクリックしてください。",
|
| 231 |
"simple_examples_not_found": "⚠️ シンプルモードサンプルディレクトリが見つかりません。",
|
| 232 |
"simple_examples_empty": "⚠️ シンプルモードサンプルにファイルがありません。",
|
| 233 |
+
"simple_example_loaded": "🎲 {filename} からランダムサンプルを読み込みました",
|
| 234 |
+
"format_success": "✅ キャプションと歌詞のフォーマットに成功しました",
|
| 235 |
+
"format_failed": "❌ フォーマットに失敗しました: {error}",
|
| 236 |
+
"skipping_metas_cot": "⚡ Phase 1 メタデータ COT をスキップ(サンプルは既にフォーマット済み)"
|
| 237 |
}
|
| 238 |
}
|
acestep/gradio_ui/i18n/zh.json
CHANGED
|
@@ -84,7 +84,7 @@
|
|
| 84 |
"mode_simple": "简单",
|
| 85 |
"mode_custom": "自定义",
|
| 86 |
"simple_query_label": "歌曲描述",
|
| 87 |
-
"simple_query_placeholder": "描述你想创作的音乐,例如:'给我生成一首暗黑的戏剧古风,歌词要华丽'",
|
| 88 |
"simple_query_info": "输入你想生成的音乐的自然语言描述",
|
| 89 |
"simple_vocal_language_label": "人声语言(可选)",
|
| 90 |
"simple_vocal_language_info": "选择歌词的首选语言。使用 'unknown' 表示任意语言。",
|
|
@@ -98,6 +98,7 @@
|
|
| 98 |
"lyrics_placeholder": "[第一段]\\n在星空下\\n我感到如此活跃...",
|
| 99 |
"lyrics_info": "带有结构的歌曲歌词",
|
| 100 |
"instrumental_label": "纯音乐",
|
|
|
|
| 101 |
"optional_params": "⚙️ 可选参数",
|
| 102 |
"vocal_language_label": "人声语言(可选)",
|
| 103 |
"vocal_language_info": "纯音乐使用 `unknown`",
|
|
@@ -127,6 +128,8 @@
|
|
| 127 |
"use_adg_info": "启用角域引导",
|
| 128 |
"shift_label": "Shift",
|
| 129 |
"shift_info": "时间步偏移因子,仅对 base 模型生效 (范围 1.0~5.0,默认 3.0)。对 turbo 模型无效。",
|
|
|
|
|
|
|
| 130 |
"cfg_interval_start": "CFG 间隔开始",
|
| 131 |
"cfg_interval_end": "CFG 间隔结束",
|
| 132 |
"lm_params_title": "🤖 LM 生成参数",
|
|
@@ -227,6 +230,9 @@
|
|
| 227 |
"sample_created": "✅ 样本已创建!检查描述和歌词,然后点击生成音乐。",
|
| 228 |
"simple_examples_not_found": "⚠️ 未找到简单模式示例目录。",
|
| 229 |
"simple_examples_empty": "⚠️ 简单模式示例中没有示例文件。",
|
| 230 |
-
"simple_example_loaded": "🎲 已从 {filename} 加载随机示例"
|
|
|
|
|
|
|
|
|
|
| 231 |
}
|
| 232 |
}
|
|
|
|
| 84 |
"mode_simple": "简单",
|
| 85 |
"mode_custom": "自定义",
|
| 86 |
"simple_query_label": "歌曲描述",
|
| 87 |
+
"simple_query_placeholder": "描述你想创作的音乐,例如:'给我生成一首暗黑的戏剧古风,歌词要华丽'。留空则随机生成样本。",
|
| 88 |
"simple_query_info": "输入你想生成的音乐的自然语言描述",
|
| 89 |
"simple_vocal_language_label": "人声语言(可选)",
|
| 90 |
"simple_vocal_language_info": "选择歌词的首选语言。使用 'unknown' 表示任意语言。",
|
|
|
|
| 98 |
"lyrics_placeholder": "[第一段]\\n在星空下\\n我感到如此活跃...",
|
| 99 |
"lyrics_info": "带有结构的歌曲歌词",
|
| 100 |
"instrumental_label": "纯音乐",
|
| 101 |
+
"format_btn": "格式化",
|
| 102 |
"optional_params": "⚙️ 可选参数",
|
| 103 |
"vocal_language_label": "人声语言(可选)",
|
| 104 |
"vocal_language_info": "纯音乐使用 `unknown`",
|
|
|
|
| 128 |
"use_adg_info": "启用角域引导",
|
| 129 |
"shift_label": "Shift",
|
| 130 |
"shift_info": "时间步偏移因子,仅对 base 模型生效 (范围 1.0~5.0,默认 3.0)。对 turbo 模型无效。",
|
| 131 |
+
"infer_method_label": "推理方法",
|
| 132 |
+
"infer_method_info": "扩散推理方法。ODE (欧拉) 更快,SDE (随机) 可能产生不同结果。",
|
| 133 |
"cfg_interval_start": "CFG 间隔开始",
|
| 134 |
"cfg_interval_end": "CFG 间隔结束",
|
| 135 |
"lm_params_title": "🤖 LM 生成参数",
|
|
|
|
| 230 |
"sample_created": "✅ 样本已创建!检查描述和歌词,然后点击生成音乐。",
|
| 231 |
"simple_examples_not_found": "⚠️ 未找到简单模式示例目录。",
|
| 232 |
"simple_examples_empty": "⚠️ 简单模式示例中没有示例文件。",
|
| 233 |
+
"simple_example_loaded": "🎲 已从 {filename} 加载随机示例",
|
| 234 |
+
"format_success": "✅ 描述和歌词格式化成功",
|
| 235 |
+
"format_failed": "❌ 格式化失败: {error}",
|
| 236 |
+
"skipping_metas_cot": "⚡ 跳过 Phase 1 元数据 COT(样本已格式化)"
|
| 237 |
}
|
| 238 |
}
|
acestep/gradio_ui/interfaces/generation.py
CHANGED
|
@@ -314,15 +314,15 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
|
|
| 314 |
placeholder=t("generation.caption_placeholder"),
|
| 315 |
lines=3,
|
| 316 |
info=t("generation.caption_info"),
|
| 317 |
-
scale=
|
| 318 |
)
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
# Lyrics - wrapped in accordion that can be collapsed in Simple mode
|
| 327 |
with gr.Accordion(t("generation.lyrics_title"), open=False) as lyrics_accordion:
|
| 328 |
lyrics = gr.Textbox(
|
|
@@ -331,22 +331,40 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
|
|
| 331 |
lines=8,
|
| 332 |
info=t("generation.lyrics_info")
|
| 333 |
)
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
|
|
|
|
|
|
|
|
|
| 343 |
vocal_language = gr.Dropdown(
|
| 344 |
choices=VALID_LANGUAGES,
|
| 345 |
value="unknown",
|
| 346 |
label=t("generation.vocal_language_label"),
|
|
|
|
|
|
|
| 347 |
allow_custom_value=True,
|
| 348 |
-
|
| 349 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
bpm = gr.Number(
|
| 351 |
label=t("generation.bpm_label"),
|
| 352 |
value=None,
|
|
@@ -437,6 +455,12 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
|
|
| 437 |
info=t("generation.shift_info"),
|
| 438 |
visible=False
|
| 439 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 440 |
|
| 441 |
with gr.Row():
|
| 442 |
cfg_interval_start = gr.Slider(
|
|
@@ -673,12 +697,14 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
|
|
| 673 |
"cfg_interval_start": cfg_interval_start,
|
| 674 |
"cfg_interval_end": cfg_interval_end,
|
| 675 |
"shift": shift,
|
|
|
|
| 676 |
"audio_format": audio_format,
|
| 677 |
"output_alignment_preference": output_alignment_preference,
|
| 678 |
"think_checkbox": think_checkbox,
|
| 679 |
"autogen_checkbox": autogen_checkbox,
|
| 680 |
"generate_btn": generate_btn,
|
| 681 |
"instrumental_checkbox": instrumental_checkbox,
|
|
|
|
| 682 |
"constrained_decoding_debug": constrained_decoding_debug,
|
| 683 |
"score_scale": score_scale,
|
| 684 |
"allow_lm_batch": allow_lm_batch,
|
|
|
|
| 314 |
placeholder=t("generation.caption_placeholder"),
|
| 315 |
lines=3,
|
| 316 |
info=t("generation.caption_info"),
|
| 317 |
+
scale=12,
|
| 318 |
)
|
| 319 |
+
with gr.Column(scale=1, min_width=100):
|
| 320 |
+
sample_btn = gr.Button(
|
| 321 |
+
"🎲",
|
| 322 |
+
variant="secondary",
|
| 323 |
+
size="sm",
|
| 324 |
+
scale=2,
|
| 325 |
+
)
|
| 326 |
# Lyrics - wrapped in accordion that can be collapsed in Simple mode
|
| 327 |
with gr.Accordion(t("generation.lyrics_title"), open=False) as lyrics_accordion:
|
| 328 |
lyrics = gr.Textbox(
|
|
|
|
| 331 |
lines=8,
|
| 332 |
info=t("generation.lyrics_info")
|
| 333 |
)
|
| 334 |
+
|
| 335 |
+
with gr.Row(variant="compact", equal_height=True):
|
| 336 |
+
instrumental_checkbox = gr.Checkbox(
|
| 337 |
+
label=t("generation.instrumental_label"),
|
| 338 |
+
value=False,
|
| 339 |
+
scale=1,
|
| 340 |
+
min_width=120,
|
| 341 |
+
container=True,
|
| 342 |
+
)
|
| 343 |
+
|
| 344 |
+
# 中间:语言选择 (Dropdown)
|
| 345 |
+
# 移除 gr.HTML hack,直接使用 label 参数,Gradio 会自动处理对齐
|
| 346 |
vocal_language = gr.Dropdown(
|
| 347 |
choices=VALID_LANGUAGES,
|
| 348 |
value="unknown",
|
| 349 |
label=t("generation.vocal_language_label"),
|
| 350 |
+
show_label=False,
|
| 351 |
+
container=True,
|
| 352 |
allow_custom_value=True,
|
| 353 |
+
scale=3,
|
| 354 |
)
|
| 355 |
+
|
| 356 |
+
# 右侧:格式化按钮 (Button)
|
| 357 |
+
# 放在同一行最右侧,操作更顺手
|
| 358 |
+
format_btn = gr.Button(
|
| 359 |
+
t("generation.format_btn"),
|
| 360 |
+
variant="secondary",
|
| 361 |
+
scale=1,
|
| 362 |
+
min_width=80,
|
| 363 |
+
)
|
| 364 |
+
|
| 365 |
+
# Optional Parameters
|
| 366 |
+
with gr.Accordion(t("generation.optional_params"), open=False) as optional_params_accordion:
|
| 367 |
+
with gr.Row():
|
| 368 |
bpm = gr.Number(
|
| 369 |
label=t("generation.bpm_label"),
|
| 370 |
value=None,
|
|
|
|
| 455 |
info=t("generation.shift_info"),
|
| 456 |
visible=False
|
| 457 |
)
|
| 458 |
+
infer_method = gr.Dropdown(
|
| 459 |
+
choices=["ode", "sde"],
|
| 460 |
+
value="ode",
|
| 461 |
+
label=t("generation.infer_method_label"),
|
| 462 |
+
info=t("generation.infer_method_info"),
|
| 463 |
+
)
|
| 464 |
|
| 465 |
with gr.Row():
|
| 466 |
cfg_interval_start = gr.Slider(
|
|
|
|
| 697 |
"cfg_interval_start": cfg_interval_start,
|
| 698 |
"cfg_interval_end": cfg_interval_end,
|
| 699 |
"shift": shift,
|
| 700 |
+
"infer_method": infer_method,
|
| 701 |
"audio_format": audio_format,
|
| 702 |
"output_alignment_preference": output_alignment_preference,
|
| 703 |
"think_checkbox": think_checkbox,
|
| 704 |
"autogen_checkbox": autogen_checkbox,
|
| 705 |
"generate_btn": generate_btn,
|
| 706 |
"instrumental_checkbox": instrumental_checkbox,
|
| 707 |
+
"format_btn": format_btn,
|
| 708 |
"constrained_decoding_debug": constrained_decoding_debug,
|
| 709 |
"score_scale": score_scale,
|
| 710 |
"allow_lm_batch": allow_lm_batch,
|
acestep/handler.py
CHANGED
|
@@ -2079,6 +2079,7 @@ class AceStepHandler:
|
|
| 2079 |
cfg_interval_start: float = 0.0,
|
| 2080 |
cfg_interval_end: float = 1.0,
|
| 2081 |
shift: float = 1.0,
|
|
|
|
| 2082 |
use_tiled_decode: bool = True,
|
| 2083 |
progress=None
|
| 2084 |
) -> Dict[str, Any]:
|
|
@@ -2227,6 +2228,7 @@ class AceStepHandler:
|
|
| 2227 |
cfg_interval_start=cfg_interval_start, # Pass CFG interval start
|
| 2228 |
cfg_interval_end=cfg_interval_end, # Pass CFG interval end
|
| 2229 |
shift=shift, # Pass shift parameter
|
|
|
|
| 2230 |
audio_code_hints=audio_code_hints_batch, # Pass audio code hints as list
|
| 2231 |
return_intermediate=should_return_intermediate
|
| 2232 |
)
|
|
|
|
| 2079 |
cfg_interval_start: float = 0.0,
|
| 2080 |
cfg_interval_end: float = 1.0,
|
| 2081 |
shift: float = 1.0,
|
| 2082 |
+
infer_method: str = "ode",
|
| 2083 |
use_tiled_decode: bool = True,
|
| 2084 |
progress=None
|
| 2085 |
) -> Dict[str, Any]:
|
|
|
|
| 2228 |
cfg_interval_start=cfg_interval_start, # Pass CFG interval start
|
| 2229 |
cfg_interval_end=cfg_interval_end, # Pass CFG interval end
|
| 2230 |
shift=shift, # Pass shift parameter
|
| 2231 |
+
infer_method=infer_method, # Pass infer method (ode or sde)
|
| 2232 |
audio_code_hints=audio_code_hints_batch, # Pass audio code hints as list
|
| 2233 |
return_intermediate=should_return_intermediate
|
| 2234 |
)
|
acestep/inference.py
CHANGED
|
@@ -96,6 +96,7 @@ class GenerationParams:
|
|
| 96 |
cfg_interval_start: float = 0.0
|
| 97 |
cfg_interval_end: float = 1.0
|
| 98 |
shift: float = 1.0
|
|
|
|
| 99 |
|
| 100 |
repainting_start: float = 0.0
|
| 101 |
repainting_end: float = -1
|
|
@@ -532,6 +533,7 @@ def generate_music(
|
|
| 532 |
cfg_interval_start=params.cfg_interval_start,
|
| 533 |
cfg_interval_end=params.cfg_interval_end,
|
| 534 |
shift=params.shift,
|
|
|
|
| 535 |
progress=progress,
|
| 536 |
)
|
| 537 |
|
|
@@ -671,8 +673,6 @@ def understand_music(
|
|
| 671 |
llm_handler,
|
| 672 |
audio_codes: str,
|
| 673 |
temperature: float = 0.85,
|
| 674 |
-
cfg_scale: float = 1.0,
|
| 675 |
-
negative_prompt: str = "NO USER INPUT",
|
| 676 |
top_k: Optional[int] = None,
|
| 677 |
top_p: Optional[float] = None,
|
| 678 |
repetition_penalty: float = 1.0,
|
|
@@ -687,13 +687,13 @@ def understand_music(
|
|
| 687 |
If audio_codes is empty or "NO USER INPUT", the LM will generate a sample example
|
| 688 |
instead of analyzing existing codes.
|
| 689 |
|
|
|
|
|
|
|
| 690 |
Args:
|
| 691 |
llm_handler: Initialized LLM handler (LLMHandler instance)
|
| 692 |
audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
|
| 693 |
Use empty string or "NO USER INPUT" to generate a sample example.
|
| 694 |
temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
|
| 695 |
-
cfg_scale: Classifier-Free Guidance scale (1.0 = no CFG, >1.0 = use CFG)
|
| 696 |
-
negative_prompt: Negative prompt for CFG guidance
|
| 697 |
top_k: Top-K sampling (None or 0 = disabled)
|
| 698 |
top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
|
| 699 |
repetition_penalty: Repetition penalty (1.0 = no penalty)
|
|
@@ -727,8 +727,6 @@ def understand_music(
|
|
| 727 |
metadata, status = llm_handler.understand_audio_from_codes(
|
| 728 |
audio_codes=audio_codes,
|
| 729 |
temperature=temperature,
|
| 730 |
-
cfg_scale=cfg_scale,
|
| 731 |
-
negative_prompt=negative_prompt,
|
| 732 |
top_k=top_k,
|
| 733 |
top_p=top_p,
|
| 734 |
repetition_penalty=repetition_penalty,
|
|
@@ -847,7 +845,7 @@ def create_sample(
|
|
| 847 |
llm_handler,
|
| 848 |
query: str,
|
| 849 |
instrumental: bool = False,
|
| 850 |
-
vocal_language: Optional[
|
| 851 |
temperature: float = 0.85,
|
| 852 |
top_k: Optional[int] = None,
|
| 853 |
top_p: Optional[float] = None,
|
|
@@ -869,9 +867,9 @@ def create_sample(
|
|
| 869 |
llm_handler: Initialized LLM handler (LLMHandler instance)
|
| 870 |
query: User's natural language music description (e.g., "a soft Bengali love song")
|
| 871 |
instrumental: Whether to generate instrumental music (no vocals)
|
| 872 |
-
vocal_language:
|
| 873 |
-
If provided, the model will be constrained to generate lyrics in
|
| 874 |
-
If None or
|
| 875 |
temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
|
| 876 |
top_k: Top-K sampling (None or 0 = disabled)
|
| 877 |
top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
|
|
@@ -883,7 +881,7 @@ def create_sample(
|
|
| 883 |
CreateSampleResult with generated sample fields and status
|
| 884 |
|
| 885 |
Example:
|
| 886 |
-
>>> result = create_sample(llm_handler, "a soft Bengali love song for a quiet evening", vocal_language=
|
| 887 |
>>> if result.success:
|
| 888 |
... print(f"Caption: {result.caption}")
|
| 889 |
... print(f"Lyrics: {result.lyrics}")
|
|
@@ -897,14 +895,6 @@ def create_sample(
|
|
| 897 |
error="LLM not initialized",
|
| 898 |
)
|
| 899 |
|
| 900 |
-
# Validate query
|
| 901 |
-
if not query or not query.strip():
|
| 902 |
-
return CreateSampleResult(
|
| 903 |
-
status_message="No query provided. Please enter a music description.",
|
| 904 |
-
success=False,
|
| 905 |
-
error="Empty query",
|
| 906 |
-
)
|
| 907 |
-
|
| 908 |
try:
|
| 909 |
# Call LLM to create sample
|
| 910 |
metadata, status = llm_handler.create_sample_from_query(
|
|
@@ -982,3 +972,175 @@ def create_sample(
|
|
| 982 |
success=False,
|
| 983 |
error=str(e),
|
| 984 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
cfg_interval_start: float = 0.0
|
| 97 |
cfg_interval_end: float = 1.0
|
| 98 |
shift: float = 1.0
|
| 99 |
+
infer_method: str = "ode" # "ode" or "sde" - diffusion inference method
|
| 100 |
|
| 101 |
repainting_start: float = 0.0
|
| 102 |
repainting_end: float = -1
|
|
|
|
| 533 |
cfg_interval_start=params.cfg_interval_start,
|
| 534 |
cfg_interval_end=params.cfg_interval_end,
|
| 535 |
shift=params.shift,
|
| 536 |
+
infer_method=params.infer_method,
|
| 537 |
progress=progress,
|
| 538 |
)
|
| 539 |
|
|
|
|
| 673 |
llm_handler,
|
| 674 |
audio_codes: str,
|
| 675 |
temperature: float = 0.85,
|
|
|
|
|
|
|
| 676 |
top_k: Optional[int] = None,
|
| 677 |
top_p: Optional[float] = None,
|
| 678 |
repetition_penalty: float = 1.0,
|
|
|
|
| 687 |
If audio_codes is empty or "NO USER INPUT", the LM will generate a sample example
|
| 688 |
instead of analyzing existing codes.
|
| 689 |
|
| 690 |
+
Note: cfg_scale and negative_prompt are not supported in understand mode.
|
| 691 |
+
|
| 692 |
Args:
|
| 693 |
llm_handler: Initialized LLM handler (LLMHandler instance)
|
| 694 |
audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
|
| 695 |
Use empty string or "NO USER INPUT" to generate a sample example.
|
| 696 |
temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
|
|
|
|
|
|
|
| 697 |
top_k: Top-K sampling (None or 0 = disabled)
|
| 698 |
top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
|
| 699 |
repetition_penalty: Repetition penalty (1.0 = no penalty)
|
|
|
|
| 727 |
metadata, status = llm_handler.understand_audio_from_codes(
|
| 728 |
audio_codes=audio_codes,
|
| 729 |
temperature=temperature,
|
|
|
|
|
|
|
| 730 |
top_k=top_k,
|
| 731 |
top_p=top_p,
|
| 732 |
repetition_penalty=repetition_penalty,
|
|
|
|
| 845 |
llm_handler,
|
| 846 |
query: str,
|
| 847 |
instrumental: bool = False,
|
| 848 |
+
vocal_language: Optional[str] = None,
|
| 849 |
temperature: float = 0.85,
|
| 850 |
top_k: Optional[int] = None,
|
| 851 |
top_p: Optional[float] = None,
|
|
|
|
| 867 |
llm_handler: Initialized LLM handler (LLMHandler instance)
|
| 868 |
query: User's natural language music description (e.g., "a soft Bengali love song")
|
| 869 |
instrumental: Whether to generate instrumental music (no vocals)
|
| 870 |
+
vocal_language: Allowed vocal language for constrained decoding (e.g., "en", "zh").
|
| 871 |
+
If provided, the model will be constrained to generate lyrics in this language.
|
| 872 |
+
If None or "unknown", no language constraint is applied.
|
| 873 |
temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
|
| 874 |
top_k: Top-K sampling (None or 0 = disabled)
|
| 875 |
top_p: Top-P (nucleus) sampling (None or 1.0 = disabled)
|
|
|
|
| 881 |
CreateSampleResult with generated sample fields and status
|
| 882 |
|
| 883 |
Example:
|
| 884 |
+
>>> result = create_sample(llm_handler, "a soft Bengali love song for a quiet evening", vocal_language="bn")
|
| 885 |
>>> if result.success:
|
| 886 |
... print(f"Caption: {result.caption}")
|
| 887 |
... print(f"Lyrics: {result.lyrics}")
|
|
|
|
| 895 |
error="LLM not initialized",
|
| 896 |
)
|
| 897 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 898 |
try:
|
| 899 |
# Call LLM to create sample
|
| 900 |
metadata, status = llm_handler.create_sample_from_query(
|
|
|
|
| 972 |
success=False,
|
| 973 |
error=str(e),
|
| 974 |
)
|
| 975 |
+
|
| 976 |
+
|
| 977 |
+
@dataclass
class FormatSampleResult:
    """Result of formatting user-provided caption and lyrics.

    Returned by ``format_sample`` (the "Format" feature): the user supplies a
    caption and lyrics, and the LLM turns them into structured music metadata
    plus an enhanced description.

    Attributes:
        caption: Enhanced/formatted music description.
        lyrics: Formatted lyrics (may be the input lyrics unchanged).
        bpm: Beats per minute, or None when no usable value was produced.
        duration: Duration in seconds, or None when no usable value was produced.
        keyscale: Musical key (e.g., "C Major"); empty when unavailable.
        language: Vocal language code (e.g., "en", "zh"); empty when unavailable.
        timesignature: Time signature (e.g., "4"); empty when unavailable.
        status_message: Status message from the formatting run.
        success: Whether formatting completed successfully.
        error: Error message if formatting failed, otherwise None.
    """
    # Metadata fields
    caption: str = ""
    lyrics: str = ""
    bpm: Optional[int] = None
    duration: Optional[float] = None
    keyscale: str = ""
    language: str = ""
    timesignature: str = ""

    # Status
    status_message: str = ""
    success: bool = True
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert result to dictionary for JSON serialization."""
        return asdict(self)


def _to_finite_float(value: Any) -> Optional[float]:
    """Return ``value`` as a finite float, or None for placeholders/garbage.

    None, 'N/A' and '' are treated as "not provided". Values that cannot be
    parsed, and NaN/infinity, are rejected so they never reach JSON output.
    """
    import math  # local import: the file-level import block is not visible here

    if value is None or value == 'N/A' or value == '':
        return None
    try:
        number = float(value)
    except (ValueError, TypeError):
        return None
    return number if math.isfinite(number) else None


def _blank_if_na(value: Any) -> Any:
    """Map the 'N/A' placeholder to an empty string; pass anything else through."""
    return '' if value == 'N/A' else value


def format_sample(
    llm_handler,
    caption: str,
    lyrics: str,
    user_metadata: Optional[Dict[str, Any]] = None,
    temperature: float = 0.85,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    repetition_penalty: float = 1.0,
    use_constrained_decoding: bool = True,
    constrained_decoding_debug: bool = False,
) -> FormatSampleResult:
    """Format user-provided caption and lyrics using the 5Hz Language Model.

    Delegates to ``llm_handler.format_sample_from_input`` and converts the
    returned metadata dict into a ``FormatSampleResult``. This function never
    raises: any exception is logged and reported through the result's
    ``success`` / ``error`` fields.

    Note: cfg_scale and negative_prompt are not supported in format mode.

    Args:
        llm_handler: Initialized LLM handler (LLMHandler instance). A missing
            (None) or uninitialized handler yields a failure result.
        caption: User's caption/description (e.g., "Latin pop, reggaeton").
        lyrics: User's lyrics with structure tags.
        user_metadata: Optional dict with user-provided metadata to constrain
            decoding. Supported keys: bpm, duration, keyscale, timesignature,
            language.
        temperature: Sampling temperature for generation (0.0-2.0). Higher = more creative.
        top_k: Top-K sampling (None or 0 = disabled).
        top_p: Top-P (nucleus) sampling (None or 1.0 = disabled).
        repetition_penalty: Repetition penalty (1.0 = no penalty).
        use_constrained_decoding: Whether to use FSM-based constrained decoding for metadata.
        constrained_decoding_debug: Whether to enable debug logging for constrained decoding.

    Returns:
        FormatSampleResult with formatted metadata fields and status.

    Example:
        >>> result = format_sample(llm_handler, "Latin pop, reggaeton", "[Verse 1]\\nHola mundo...")
        >>> if result.success:
        ...     print(f"Caption: {result.caption}")
        ...     print(f"BPM: {result.bpm}")
        ...     print(f"Lyrics: {result.lyrics}")
    """
    # getattr keeps a None / attribute-less handler on the "not initialized"
    # path instead of raising AttributeError.
    if not getattr(llm_handler, "llm_initialized", False):
        return FormatSampleResult(
            status_message="5Hz LM not initialized. Please initialize it first.",
            success=False,
            error="LLM not initialized",
        )

    try:
        metadata, status = llm_handler.format_sample_from_input(
            caption=caption,
            lyrics=lyrics,
            user_metadata=user_metadata,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            use_constrained_decoding=use_constrained_decoding,
            constrained_decoding_debug=constrained_decoding_debug,
        )

        # The handler signals failure with an empty metadata dict.
        if not metadata:
            return FormatSampleResult(
                status_message=status or "Failed to format input",
                success=False,
                error=status or "Empty metadata returned",
            )

        # Going through float first lets numeric strings such as "120.0"
        # survive; int() then truncates exactly like int() on a float.
        bpm_number = _to_finite_float(metadata.get('bpm'))
        bpm = int(bpm_number) if bpm_number is not None else None
        duration = _to_finite_float(metadata.get('duration'))

        return FormatSampleResult(
            caption=metadata.get('caption', ''),
            lyrics=metadata.get('lyrics', lyrics),  # Fall back to input lyrics
            bpm=bpm,
            duration=duration,
            keyscale=_blank_if_na(metadata.get('keyscale', '')),
            language=_blank_if_na(
                metadata.get('language', metadata.get('vocal_language', ''))
            ),
            timesignature=_blank_if_na(metadata.get('timesignature', '')),
            status_message=status,
            success=True,
            error=None,
        )

    except Exception as e:
        # Top-level boundary: callers rely on a result object, not an exception.
        logger.exception("Format sample failed")
        return FormatSampleResult(
            status_message=f"Error: {str(e)}",
            success=False,
            error=str(e),
        )
|
acestep/llm_inference.py
CHANGED
|
@@ -19,7 +19,7 @@ from transformers.generation.logits_process import (
|
|
| 19 |
RepetitionPenaltyLogitsProcessor,
|
| 20 |
)
|
| 21 |
from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor
|
| 22 |
-
from acestep.constants import DEFAULT_LM_INSTRUCTION, DEFAULT_LM_UNDERSTAND_INSTRUCTION, DEFAULT_LM_INSPIRED_INSTRUCTION
|
| 23 |
|
| 24 |
|
| 25 |
class LLMHandler:
|
|
@@ -1296,8 +1296,6 @@ class LLMHandler:
|
|
| 1296 |
self,
|
| 1297 |
audio_codes: str,
|
| 1298 |
temperature: float = 0.3,
|
| 1299 |
-
cfg_scale: float = 1.0,
|
| 1300 |
-
negative_prompt: str = "NO USER INPUT",
|
| 1301 |
top_k: Optional[int] = None,
|
| 1302 |
top_p: Optional[float] = None,
|
| 1303 |
repetition_penalty: float = 1.0,
|
|
@@ -1306,16 +1304,16 @@ class LLMHandler:
|
|
| 1306 |
) -> Tuple[Dict[str, Any], str]:
|
| 1307 |
"""
|
| 1308 |
Understand audio codes and generate metadata + lyrics.
|
| 1309 |
-
|
| 1310 |
This is the reverse of the normal generation flow:
|
| 1311 |
- Input: Audio codes
|
| 1312 |
- Output: Metadata (bpm, caption, duration, etc.) + Lyrics
|
| 1313 |
-
|
|
|
|
|
|
|
| 1314 |
Args:
|
| 1315 |
audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
|
| 1316 |
temperature: Sampling temperature for generation
|
| 1317 |
-
cfg_scale: Classifier-Free Guidance scale (1.0 = no CFG, >1.0 = use CFG)
|
| 1318 |
-
negative_prompt: Negative prompt for CFG
|
| 1319 |
top_k: Top-K sampling (None = disabled)
|
| 1320 |
top_p: Top-P (nucleus) sampling (None = disabled)
|
| 1321 |
repetition_penalty: Repetition penalty (1.0 = no penalty)
|
|
@@ -1352,12 +1350,11 @@ class LLMHandler:
|
|
| 1352 |
print(f"formatted_prompt: {formatted_prompt}")
|
| 1353 |
# Generate using constrained decoding (understand phase)
|
| 1354 |
# We want to generate metadata first (CoT), then lyrics (natural text)
|
|
|
|
| 1355 |
output_text, status = self.generate_from_formatted_prompt(
|
| 1356 |
formatted_prompt=formatted_prompt,
|
| 1357 |
cfg={
|
| 1358 |
"temperature": temperature,
|
| 1359 |
-
"cfg_scale": cfg_scale,
|
| 1360 |
-
"negative_prompt": negative_prompt,
|
| 1361 |
"top_k": top_k,
|
| 1362 |
"top_p": top_p,
|
| 1363 |
"repetition_penalty": repetition_penalty,
|
|
@@ -1491,7 +1488,7 @@ class LLMHandler:
|
|
| 1491 |
self,
|
| 1492 |
query: str,
|
| 1493 |
instrumental: bool = False,
|
| 1494 |
-
vocal_language: Optional[
|
| 1495 |
temperature: float = 0.85,
|
| 1496 |
top_k: Optional[int] = None,
|
| 1497 |
top_p: Optional[float] = None,
|
|
@@ -1509,8 +1506,8 @@ class LLMHandler:
|
|
| 1509 |
Args:
|
| 1510 |
query: User's natural language music description
|
| 1511 |
instrumental: Whether to generate instrumental music (no vocals)
|
| 1512 |
-
vocal_language:
|
| 1513 |
-
If provided and not
|
| 1514 |
temperature: Sampling temperature for generation (0.0-2.0)
|
| 1515 |
top_k: Top-K sampling (None = disabled)
|
| 1516 |
top_p: Top-P (nucleus) sampling (None = disabled)
|
|
@@ -1532,7 +1529,7 @@ class LLMHandler:
|
|
| 1532 |
|
| 1533 |
Example:
|
| 1534 |
query = "a soft Bengali love song for a quiet evening"
|
| 1535 |
-
metadata, status = handler.create_sample_from_query(query, instrumental=False, vocal_language=
|
| 1536 |
print(metadata['caption']) # "A gentle romantic acoustic pop ballad..."
|
| 1537 |
print(metadata['lyrics']) # "[Intro: ...]\\n..."
|
| 1538 |
"""
|
|
@@ -1540,7 +1537,7 @@ class LLMHandler:
|
|
| 1540 |
return {}, "❌ 5Hz LM not initialized. Please initialize it first."
|
| 1541 |
|
| 1542 |
if not query or not query.strip():
|
| 1543 |
-
|
| 1544 |
|
| 1545 |
logger.info(f"Creating sample from query: {query[:100]}... (instrumental={instrumental}, vocal_language={vocal_language})")
|
| 1546 |
|
|
@@ -1554,14 +1551,11 @@ class LLMHandler:
|
|
| 1554 |
# Build user_metadata if vocal_language is specified and is not "unknown"
|
| 1555 |
user_metadata = None
|
| 1556 |
skip_language = False
|
| 1557 |
-
if vocal_language and
|
| 1558 |
-
#
|
| 1559 |
-
|
| 1560 |
-
|
| 1561 |
-
|
| 1562 |
-
user_metadata = {"language": valid_languages[0]}
|
| 1563 |
-
skip_language = True # Skip language generation since we're injecting it
|
| 1564 |
-
logger.info(f"Using user-specified language: {valid_languages[0]}")
|
| 1565 |
|
| 1566 |
# Generate using constrained decoding (inspiration phase)
|
| 1567 |
# Similar to understand mode - generate metadata first (CoT), then lyrics
|
|
@@ -1576,7 +1570,7 @@ class LLMHandler:
|
|
| 1576 |
"target_duration": None, # No duration constraint
|
| 1577 |
"user_metadata": user_metadata, # Inject language if specified
|
| 1578 |
"skip_caption": False, # Generate caption
|
| 1579 |
-
"skip_language":
|
| 1580 |
"skip_genres": False, # Generate genres
|
| 1581 |
"generation_phase": "understand", # Use understand phase for metadata + free-form lyrics
|
| 1582 |
"caption": "",
|
|
@@ -1604,12 +1598,210 @@ class LLMHandler:
|
|
| 1604 |
# Echo back the instrumental flag
|
| 1605 |
metadata['instrumental'] = instrumental
|
| 1606 |
|
| 1607 |
-
logger.info(f"Sample created successfully. Generated {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1608 |
if constrained_decoding_debug:
|
| 1609 |
logger.debug(f"Generated metadata: {list(metadata.keys())}")
|
| 1610 |
logger.debug(f"Output text preview: {output_text[:300]}...")
|
| 1611 |
|
| 1612 |
-
status_msg = f"✅
|
| 1613 |
return metadata, status_msg
|
| 1614 |
|
| 1615 |
def generate_from_formatted_prompt(
|
|
|
|
| 19 |
RepetitionPenaltyLogitsProcessor,
|
| 20 |
)
|
| 21 |
from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor
|
| 22 |
+
from acestep.constants import DEFAULT_LM_INSTRUCTION, DEFAULT_LM_UNDERSTAND_INSTRUCTION, DEFAULT_LM_INSPIRED_INSTRUCTION, DEFAULT_LM_REWRITE_INSTRUCTION
|
| 23 |
|
| 24 |
|
| 25 |
class LLMHandler:
|
|
|
|
| 1296 |
self,
|
| 1297 |
audio_codes: str,
|
| 1298 |
temperature: float = 0.3,
|
|
|
|
|
|
|
| 1299 |
top_k: Optional[int] = None,
|
| 1300 |
top_p: Optional[float] = None,
|
| 1301 |
repetition_penalty: float = 1.0,
|
|
|
|
| 1304 |
) -> Tuple[Dict[str, Any], str]:
|
| 1305 |
"""
|
| 1306 |
Understand audio codes and generate metadata + lyrics.
|
| 1307 |
+
|
| 1308 |
This is the reverse of the normal generation flow:
|
| 1309 |
- Input: Audio codes
|
| 1310 |
- Output: Metadata (bpm, caption, duration, etc.) + Lyrics
|
| 1311 |
+
|
| 1312 |
+
Note: cfg_scale and negative_prompt are not supported in understand mode.
|
| 1313 |
+
|
| 1314 |
Args:
|
| 1315 |
audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
|
| 1316 |
temperature: Sampling temperature for generation
|
|
|
|
|
|
|
| 1317 |
top_k: Top-K sampling (None = disabled)
|
| 1318 |
top_p: Top-P (nucleus) sampling (None = disabled)
|
| 1319 |
repetition_penalty: Repetition penalty (1.0 = no penalty)
|
|
|
|
| 1350 |
print(f"formatted_prompt: {formatted_prompt}")
|
| 1351 |
# Generate using constrained decoding (understand phase)
|
| 1352 |
# We want to generate metadata first (CoT), then lyrics (natural text)
|
| 1353 |
+
# Note: cfg_scale and negative_prompt are not used in understand mode
|
| 1354 |
output_text, status = self.generate_from_formatted_prompt(
|
| 1355 |
formatted_prompt=formatted_prompt,
|
| 1356 |
cfg={
|
| 1357 |
"temperature": temperature,
|
|
|
|
|
|
|
| 1358 |
"top_k": top_k,
|
| 1359 |
"top_p": top_p,
|
| 1360 |
"repetition_penalty": repetition_penalty,
|
|
|
|
| 1488 |
self,
|
| 1489 |
query: str,
|
| 1490 |
instrumental: bool = False,
|
| 1491 |
+
vocal_language: Optional[str] = None,
|
| 1492 |
temperature: float = 0.85,
|
| 1493 |
top_k: Optional[int] = None,
|
| 1494 |
top_p: Optional[float] = None,
|
|
|
|
| 1506 |
Args:
|
| 1507 |
query: User's natural language music description
|
| 1508 |
instrumental: Whether to generate instrumental music (no vocals)
|
| 1509 |
+
vocal_language: Allowed vocal language for constrained decoding (e.g., "en", "zh").
|
| 1510 |
+
If provided and not "unknown", it will be used.
|
| 1511 |
temperature: Sampling temperature for generation (0.0-2.0)
|
| 1512 |
top_k: Top-K sampling (None = disabled)
|
| 1513 |
top_p: Top-P (nucleus) sampling (None = disabled)
|
|
|
|
| 1529 |
|
| 1530 |
Example:
|
| 1531 |
query = "a soft Bengali love song for a quiet evening"
|
| 1532 |
+
metadata, status = handler.create_sample_from_query(query, instrumental=False, vocal_language="bn")
|
| 1533 |
print(metadata['caption']) # "A gentle romantic acoustic pop ballad..."
|
| 1534 |
print(metadata['lyrics']) # "[Intro: ...]\\n..."
|
| 1535 |
"""
|
|
|
|
| 1537 |
return {}, "❌ 5Hz LM not initialized. Please initialize it first."
|
| 1538 |
|
| 1539 |
if not query or not query.strip():
|
| 1540 |
+
query = "NO USER INPUT"
|
| 1541 |
|
| 1542 |
logger.info(f"Creating sample from query: {query[:100]}... (instrumental={instrumental}, vocal_language={vocal_language})")
|
| 1543 |
|
|
|
|
| 1551 |
# Build user_metadata if vocal_language is specified and is not "unknown"
|
| 1552 |
user_metadata = None
|
| 1553 |
skip_language = False
|
| 1554 |
+
if vocal_language and vocal_language.strip() and vocal_language.strip().lower() != "unknown":
|
| 1555 |
+
# Use the specified language for constrained decoding
|
| 1556 |
+
user_metadata = {"language": vocal_language.strip()}
|
| 1557 |
+
# skip_language = True # Skip language generation since we're injecting it
|
| 1558 |
+
logger.info(f"Using user-specified language: {vocal_language.strip()}")
|
|
|
|
|
|
|
|
|
|
| 1559 |
|
| 1560 |
# Generate using constrained decoding (inspiration phase)
|
| 1561 |
# Similar to understand mode - generate metadata first (CoT), then lyrics
|
|
|
|
| 1570 |
"target_duration": None, # No duration constraint
|
| 1571 |
"user_metadata": user_metadata, # Inject language if specified
|
| 1572 |
"skip_caption": False, # Generate caption
|
| 1573 |
+
"skip_language": False,
|
| 1574 |
"skip_genres": False, # Generate genres
|
| 1575 |
"generation_phase": "understand", # Use understand phase for metadata + free-form lyrics
|
| 1576 |
"caption": "",
|
|
|
|
| 1598 |
# Echo back the instrumental flag
|
| 1599 |
metadata['instrumental'] = instrumental
|
| 1600 |
|
| 1601 |
+
logger.info(f"Sample created successfully. Generated {metadata} fields")
|
| 1602 |
+
if constrained_decoding_debug:
|
| 1603 |
+
logger.debug(f"Generated metadata: {list(metadata.keys())}")
|
| 1604 |
+
logger.debug(f"Output text preview: {output_text[:300]}...")
|
| 1605 |
+
|
| 1606 |
+
status_msg = f"✅ Sample created successfully\nGenerated fields: {metadata}"
|
| 1607 |
+
return metadata, status_msg
|
| 1608 |
+
|
| 1609 |
+
def build_formatted_prompt_for_format(
    self,
    caption: str,
    lyrics: str,
    is_negative_prompt: bool = False,
    negative_prompt: str = "NO USER INPUT"
) -> str:
    """
    Build the chat-formatted prompt used by format/rewrite mode.

    The prompt is a two-message chat (system + user) rendered to text through
    the tokenizer's chat template with the generation prompt appended. The
    system message carries DEFAULT_LM_REWRITE_INSTRUCTION; the user message
    carries either the caption/lyrics pair or the negative prompt.

    Args:
        caption: User's caption/description of the music
        lyrics: User's lyrics
        is_negative_prompt: If True, builds the unconditional prompt for CFG;
            caption and lyrics are then ignored
        negative_prompt: Negative prompt for CFG (used when is_negative_prompt=True);
            an empty or whitespace-only value becomes an empty user message

    Returns:
        Formatted prompt string

    Raises:
        ValueError: If the LLM tokenizer has not been initialized.

    Example:
        caption = "Latin pop, reggaeton, flamenco-pop"
        lyrics = "[Verse 1]\\nTengo un nudo..."
        prompt = handler.build_formatted_prompt_for_format(caption, lyrics)
    """
    tokenizer = self.llm_tokenizer
    if tokenizer is None:
        raise ValueError("LLM tokenizer is not initialized. Call initialize() first.")

    # Pick the user turn: caption + lyrics normally, negative prompt for CFG.
    if not is_negative_prompt:
        user_content = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}"
    elif negative_prompt and negative_prompt.strip():
        user_content = negative_prompt
    else:
        user_content = ""

    system_message = {
        "role": "system",
        "content": f"# Instruction\n{DEFAULT_LM_REWRITE_INSTRUCTION}\n\n",
    }
    user_message = {"role": "user", "content": user_content}

    return tokenizer.apply_chat_template(
        [system_message, user_message],
        tokenize=False,
        add_generation_prompt=True,
    )
|
| 1660 |
+
|
| 1661 |
+
def format_sample_from_input(
    self,
    caption: str,
    lyrics: str,
    user_metadata: Optional[Dict[str, Any]] = None,
    temperature: float = 0.85,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    repetition_penalty: float = 1.0,
    use_constrained_decoding: bool = True,
    constrained_decoding_debug: bool = False,
) -> Tuple[Dict[str, Any], str]:
    """
    Format user-provided caption and lyrics into structured music metadata.

    This is the "Format" feature. It builds the format prompt, runs generation
    in the "understand" phase without stopping at the end of reasoning, parses
    the metadata from the output, and attaches the lyrics extracted from the
    output (falling back to the input lyrics when none are extracted).

    Note: cfg_scale and negative_prompt are not supported in format mode.

    Args:
        caption: User's caption/description (e.g., "Latin pop, reggaeton").
            Empty/blank input is replaced with "NO USER INPUT".
        lyrics: User's lyrics with structure tags. Empty/blank input is
            replaced with "[Instrumental]".
        user_metadata: Optional dict with user-provided metadata to constrain decoding.
            Supported keys: bpm, duration, keyscale, timesignature, language.
            bpm/duration are used only when they convert to a positive int;
            the other keys are used only when truthy.
        temperature: Sampling temperature for generation (0.0-2.0)
        top_k: Top-K sampling (None = disabled)
        top_p: Top-P (nucleus) sampling (None = disabled)
        repetition_penalty: Repetition penalty (1.0 = no penalty)
        use_constrained_decoding: Whether to use FSM-based constrained decoding
        constrained_decoding_debug: Whether to enable debug logging

    Returns:
        Tuple of (metadata_dict, status_message). metadata_dict is empty when
        the LM is not initialized or generation produced no output; otherwise
        it is the parsed metadata plus a 'lyrics' entry.

    Example:
        caption = "Latin pop, reggaeton, flamenco-pop"
        lyrics = "[Verse 1]\\nTengo un nudo en la garganta..."
        metadata, status = handler.format_sample_from_input(caption, lyrics)
        print(metadata['caption'])  # "A dramatic and powerful Latin pop track..."
        print(metadata['bpm'])  # 100
    """
    if not getattr(self, "llm_initialized", False):
        return {}, "❌ 5Hz LM not initialized. Please initialize it first."

    # Substitute placeholders for missing inputs so the prompt is never empty.
    if not (caption and caption.strip()):
        caption = "NO USER INPUT"
    if not (lyrics and lyrics.strip()):
        lyrics = "[Instrumental]"

    logger.info(f"Formatting sample from input: caption={caption[:50]}..., lyrics length={len(lyrics)}")

    prompt_text = self.build_formatted_prompt_for_format(
        caption=caption,
        lyrics=lyrics,
    )
    logger.debug(f"Formatted prompt for format: {prompt_text}")

    # Collect the user-supplied constraints; key order is bpm, duration,
    # keyscale, timesignature, language.
    constraints = None
    if user_metadata:
        collected = {}
        for numeric_key in ('bpm', 'duration'):
            raw_value = user_metadata.get(numeric_key)
            if raw_value is None:
                continue
            try:
                as_int = int(raw_value)
            except (ValueError, TypeError):
                continue
            if as_int > 0:
                collected[numeric_key] = as_int
        for text_key in ('keyscale', 'timesignature', 'language'):
            text_value = user_metadata.get(text_key)
            if text_value:
                collected[text_key] = text_value

        # Only use the constraints if at least one field survived validation.
        if collected:
            constraints = collected
            logger.info(f"Using user-provided metadata constraints: {constraints}")

    # Language generation is skipped only when the user pinned a language.
    skip_language = False
    if constraints:
        skip_language = constraints.get('language') is not None

    # Generate using constrained decoding (format phase): metadata first (CoT),
    # then formatted lyrics. cfg_scale and negative_prompt are not used here.
    generation_cfg = {
        "temperature": temperature,
        "top_k": top_k,
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "target_duration": None,  # No duration constraint for generation length
        "user_metadata": constraints,  # Inject user-provided metadata
        "skip_caption": False,  # Generate caption
        "skip_language": skip_language,
        "skip_genres": False,  # Generate genres
        "generation_phase": "understand",  # Metadata + free-form lyrics
        "caption": "",
        "lyrics": "",
    }
    output_text, status = self.generate_from_formatted_prompt(
        formatted_prompt=prompt_text,
        cfg=generation_cfg,
        use_constrained_decoding=use_constrained_decoding,
        constrained_decoding_debug=constrained_decoding_debug,
        stop_at_reasoning=False,  # Continue after </think> to get formatted lyrics
    )

    if not output_text:
        return {}, status

    # Parse metadata, then take the lyrics section; keep the input lyrics when
    # nothing was extracted.
    metadata, _ = self.parse_lm_output(output_text)
    extracted_lyrics = self._extract_lyrics_from_output(output_text)
    metadata['lyrics'] = extracted_lyrics or lyrics

    logger.info(f"Format completed successfully. Generated {len(metadata)} fields")
    if constrained_decoding_debug:
        logger.debug(f"Generated metadata: {list(metadata.keys())}")
        logger.debug(f"Output text preview: {output_text[:300]}...")

    status_msg = f"✅ Format completed successfully\nGenerated fields: {', '.join(metadata.keys())}"
    return metadata, status_msg
|
| 1806 |
|
| 1807 |
def generate_from_formatted_prompt(
|
examples/simple_mode/example_01.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"description": "a soft Bengali love song for a quiet evening",
|
| 3 |
"instrumental": false,
|
| 4 |
-
"vocal_language":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"description": "a soft Bengali love song for a quiet evening",
|
| 3 |
"instrumental": false,
|
| 4 |
+
"vocal_language": "bn"
|
| 5 |
}
|
examples/simple_mode/example_02.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"description": "an upbeat summer pop song with catchy hooks",
|
| 3 |
"instrumental": false,
|
| 4 |
-
"vocal_language":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"description": "an upbeat summer pop song with catchy hooks",
|
| 3 |
"instrumental": false,
|
| 4 |
+
"vocal_language": "en"
|
| 5 |
}
|
examples/simple_mode/example_03.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"description": "epic orchestral cinematic music for a movie trailer",
|
| 3 |
"instrumental": true,
|
| 4 |
-
"vocal_language":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"description": "epic orchestral cinematic music for a movie trailer",
|
| 3 |
"instrumental": true,
|
| 4 |
+
"vocal_language": "unknown"
|
| 5 |
}
|
examples/simple_mode/example_04.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"description": "一首深情的中文抒情歌曲,适合夜晚独自聆听",
|
| 3 |
"instrumental": false,
|
| 4 |
-
"vocal_language":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"description": "一首深情的中文抒情歌曲,适合夜晚独自聆听",
|
| 3 |
"instrumental": false,
|
| 4 |
+
"vocal_language": "zh"
|
| 5 |
}
|
examples/simple_mode/example_05.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"description": "Japanese city pop with nostalgic 80s vibes",
|
| 3 |
"instrumental": false,
|
| 4 |
-
"vocal_language":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"description": "Japanese city pop with nostalgic 80s vibes",
|
| 3 |
"instrumental": false,
|
| 4 |
+
"vocal_language": "ja"
|
| 5 |
}
|
examples/simple_mode/example_06.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"description": "lo-fi hip hop beats for studying and relaxing",
|
| 3 |
"instrumental": true,
|
| 4 |
-
"vocal_language":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"description": "lo-fi hip hop beats for studying and relaxing",
|
| 3 |
"instrumental": true,
|
| 4 |
+
"vocal_language": "unknown"
|
| 5 |
}
|
examples/simple_mode/example_07.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"description": "energetic K-pop dance track with powerful vocals",
|
| 3 |
"instrumental": false,
|
| 4 |
-
"vocal_language":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"description": "energetic K-pop dance track with powerful vocals",
|
| 3 |
"instrumental": false,
|
| 4 |
+
"vocal_language": "ko"
|
| 5 |
}
|
examples/simple_mode/example_08.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"description": "romantic Spanish guitar ballad with heartfelt lyrics",
|
| 3 |
"instrumental": false,
|
| 4 |
-
"vocal_language":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"description": "romantic Spanish guitar ballad with heartfelt lyrics",
|
| 3 |
"instrumental": false,
|
| 4 |
+
"vocal_language": "es"
|
| 5 |
}
|
examples/simple_mode/example_09.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"description": "中国风电子舞曲,融合古典乐器与现代节拍",
|
| 3 |
"instrumental": false,
|
| 4 |
-
"vocal_language":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"description": "中国风电子舞曲,融合古典乐器与现代节拍",
|
| 3 |
"instrumental": false,
|
| 4 |
+
"vocal_language": "zh"
|
| 5 |
}
|
examples/simple_mode/example_10.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"description": "peaceful piano melody for meditation and relaxation",
|
| 3 |
"instrumental": true,
|
| 4 |
-
"vocal_language":
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"description": "peaceful piano melody for meditation and relaxation",
|
| 3 |
"instrumental": true,
|
| 4 |
+
"vocal_language": "unknown"
|
| 5 |
}
|