|
|
""" |
|
|
Constants for ACE-Step |
|
|
Centralized constants used across the codebase |
|
|
""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Language tags accepted by the project: two-letter codes, the three-letter
# 'yue', and the literal 'unknown'. Written as space-separated rows and split
# into a list, so the order below is the order of the resulting list.
VALID_LANGUAGES = (
    "ar az bg bn ca cs da de el en "
    "es fa fi fr he hi hr ht hu id "
    "is it ja ko la lt ms ne nl no "
    "pa pl pt ro ru sa sk sr sv sw "
    "ta te th tl tr uk ur vi yue zh "
    "unknown"
).split()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Building blocks for key/scale labels.
KEYSCALE_NOTES = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
# '' means no accidental; sharps and flats are accepted in both ASCII
# ('#', 'b') and Unicode ('♯', '♭') spellings.
KEYSCALE_ACCIDENTALS = ['', '#', 'b', '♯', '♭']
KEYSCALE_MODES = ['major', 'minor']

# Every "<note><accidental> <mode>" combination of the three lists above,
# e.g. "C major", "F# minor", "B♭ major" (7 * 5 * 2 = 70 entries).
# A set comprehension is used instead of nested module-level loops so the
# iteration variables stay local to the comprehension and are not left behind
# as public module attributes (which `from <module> import *` would export).
VALID_KEYSCALES = {
    f"{note}{accidental} {mode}"
    for note in KEYSCALE_NOTES
    for accidental in KEYSCALE_ACCIDENTALS
    for mode in KEYSCALE_MODES
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Lower and upper bound for tempo (BPM).
# NOTE(review): whether the bounds are inclusive is decided by the validator
# that consumes them — confirm there.
BPM_MIN, BPM_MAX = 30, 300

# Lower and upper bound for duration.
# NOTE(review): unit is presumably seconds — TODO confirm against the caller.
DURATION_MIN, DURATION_MAX = 10, 600

# Accepted time-signature values.
# NOTE(review): presumably the number of beats per bar — confirm with the caller.
VALID_TIME_SIGNATURES = [2, 3, 4, 6]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Task types listed for the "turbo" variant.
TASK_TYPES_TURBO = ["text2music", "repaint", "cover"]

# Task types listed for the "base" variant: the turbo tasks followed by three
# more. Concatenation creates a new list, so the turbo list is left untouched.
TASK_TYPES_BASE = TASK_TYPES_TURBO + ["extract", "lego", "complete"]

# Full task-type list. Its contents currently match the base list; it is
# copied so that the two names refer to independent list objects.
TASK_TYPES = list(TASK_TYPES_BASE)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Default instruction strings. Each one ends with a colon; the text is part of
# the prompts and must not be reworded casually.
DEFAULT_DIT_INSTRUCTION = (
    "Fill the audio semantic mask based on the given conditions:"
)
DEFAULT_LM_INSTRUCTION = (
    "Generate audio semantic tokens based on the given conditions:"
)
DEFAULT_LM_UNDERSTAND_INSTRUCTION = (
    "Understand the given musical conditions "
    "and describe the audio semantics accordingly:"
)
# The next two differ only in their first word ("Expand" vs "Format").
DEFAULT_LM_INSPIRED_INSTRUCTION = (
    "Expand the user's input into "
    "a more detailed and specific musical description:"
)
DEFAULT_LM_REWRITE_INSTRUCTION = (
    "Format the user's input into "
    "a more detailed and specific musical description:"
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Instruction text per task type. Entries containing a {TRACK_NAME} or
# {TRACK_CLASSES} placeholder have a "<task>_default" sibling without it.
# NOTE(review): placeholders are presumably filled with str.format by the
# caller, which falls back to the "_default" entry when no track is given —
# confirm at the call site.
TASK_INSTRUCTIONS = dict(
    text2music="Fill the audio semantic mask based on the given conditions:",
    repaint="Repaint the mask area based on the given conditions:",
    cover="Generate audio semantic tokens based on the given conditions:",
    extract="Extract the {TRACK_NAME} track from the audio:",
    extract_default="Extract the track from the audio:",
    lego="Generate the {TRACK_NAME} track based on the audio context:",
    lego_default="Generate the track based on the audio context:",
    complete="Complete the input track with {TRACK_CLASSES}:",
    complete_default="Complete the input track:",
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Track (stem) names known to the project. The order of this list is kept
# exactly as originally declared.
TRACK_NAMES = [
    "woodwinds",
    "brass",
    "fx",
    "synth",
    "strings",
    "percussion",
    "keyboard",
    "guitar",
    "bass",
    "drums",
    "backing_vocals",
    "vocals",
]
|
|
|
|
|
# Prompt template with three positional `{}` slots, one under each of the
# "# Instruction", "# Caption" and "# Metas" headings. The last slot is
# immediately followed by the literal "<|endoftext|>" marker.
# NOTE(review): presumably filled via str.format(instruction, caption, metas)
# — confirm at the call site. The literal's exact bytes (including blank
# lines) are part of the prompt, so do not reformat it.
SFT_GEN_PROMPT = """# Instruction |
|
|
{} |
|
|
|
|
|
# Caption |
|
|
{} |
|
|
|
|
|
# Metas |
|
|
{}<|endoftext|> |
|
|
""" |
|
|
|