"""Constants for ACE-Step.

Centralized constants used across the codebase.
"""
# ==============================================================================
# Language Constants
# ==============================================================================
# Language codes accepted for vocal generation and language detection.
# Entries are short lowercase codes (two letters, plus the three-letter
# "yue"). The final entry, "unknown", is the fallback used when the language
# cannot be determined automatically.
VALID_LANGUAGES = [
    "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "en",
    "es", "fa", "fi", "fr", "he", "hi", "hr", "ht", "hu", "id",
    "is", "it", "ja", "ko", "la", "lt", "ms", "ne", "nl", "no",
    "pa", "pl", "pt", "ro", "ru", "sa", "sk", "sr", "sv", "sw",
    "ta", "te", "th", "tl", "tr", "uk", "ur", "vi", "yue", "zh",
    "unknown",
]
# ==============================================================================
# Keyscale Constants
# ==============================================================================
# Natural note letters in standard Western notation.
KEYSCALE_NOTES = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
# Accidental suffixes: '' (natural), ASCII '#' / 'b', Unicode '♯' / '♭'.
KEYSCALE_ACCIDENTALS = ['', '#', 'b', '♯', '♭']
# Scale modes.
KEYSCALE_MODES = ['major', 'minor']
# Every "<note><accidental> <mode>" string, e.g. "C major", "F# minor",
# "B♭ major": 7 notes x 5 accidentals x 2 modes = 70 combinations.
# Built with a set comprehension rather than nested `for` statements so the
# iteration variables stay local to the comprehension and are not left behind
# as module-level names (where `from ... import *` would export them).
VALID_KEYSCALES = {
    f"{note}{acc} {mode}"
    for note in KEYSCALE_NOTES
    for acc in KEYSCALE_ACCIDENTALS
    for mode in KEYSCALE_MODES
}
# ==============================================================================
# Metadata Range Constants
# ==============================================================================
# Tempo bounds, in beats per minute.
#   30  - very slow material (slow ballads, ambient music)
#   300 - very fast material (fast electronic dance music, extreme metal)
BPM_MIN: int = 30
BPM_MAX: int = 300
# Duration bounds, in seconds (a trade-off between quality and compute cost).
#   10  - short loops and excerpts
#   600 - full songs and extended compositions (10 minutes)
DURATION_MIN: int = 10
DURATION_MAX: int = 600
# Accepted time-signature values and the meter each one stands for:
#   2 -> 2/4 (marches, polka)
#   3 -> 3/4 (waltzes, ballads)
#   4 -> 4/4 (most pop, rock, hip-hop)
#   6 -> 6/8 (compound time, folk dances)
VALID_TIME_SIGNATURES = [2, 3, 4, 6]
# ==============================================================================
# Task Type Constants
# ==============================================================================
# Tasks available on turbo models (the subset optimized for speed):
#   text2music - generate from text descriptions
#   repaint    - selective audio editing / regeneration
#   cover      - style transfer using reference audio
TASK_TYPES_TURBO = ["text2music", "repaint", "cover"]
# Tasks available on base models (full feature set): the turbo tasks plus
# three that need more computational resources:
#   extract  - separate individual tracks/stems from audio
#   lego     - multi-track generation (add layers)
#   complete - automatic completion of partial audio
TASK_TYPES_BASE = TASK_TYPES_TURBO + ["extract", "lego", "complete"]
# Every supported generation task across model variants. Copied rather than
# aliased so each constant remains its own independent list object.
TASK_TYPES = list(TASK_TYPES_BASE)
# ==============================================================================
# Instruction Constants
# ==============================================================================
# Default instruction strings.
DEFAULT_DIT_INSTRUCTION = (
    "Fill the audio semantic mask based on the given conditions:"
)
DEFAULT_LM_INSTRUCTION = (
    "Generate audio semantic tokens based on the given conditions:"
)
DEFAULT_LM_UNDERSTAND_INSTRUCTION = (
    "Understand the given musical conditions "
    "and describe the audio semantics accordingly:"
)
DEFAULT_LM_INSPIRED_INSTRUCTION = (
    "Expand the user's input into a more detailed "
    "and specific musical description:"
)
DEFAULT_LM_REWRITE_INSTRUCTION = (
    "Format the user's input into a more detailed "
    "and specific musical description:"
)
# Instruction text for each task type.
# Some entries are templates with a named placeholder ({TRACK_NAME} or
# {TRACK_CLASSES}); fill them with str.format(), e.g.
#   TASK_INSTRUCTIONS["extract"].format(TRACK_NAME="drums")
# The matching "<task>_default" entry is the placeholder-free wording.
TASK_INSTRUCTIONS = {
    "text2music": "Fill the audio semantic mask based on the given conditions:",
    "repaint": "Repaint the mask area based on the given conditions:",
    "cover": "Generate audio semantic tokens based on the given conditions:",
    "extract": "Extract the {TRACK_NAME} track from the audio:",
    "extract_default": "Extract the track from the audio:",
    "lego": "Generate the {TRACK_NAME} track based on the audio context:",
    "lego_default": "Generate the track based on the audio context:",
    "complete": "Complete the input track with {TRACK_CLASSES}:",
    "complete_default": "Complete the input track:",
}
# ==============================================================================
# Track/Instrument Constants
# ==============================================================================
# Track names supported for multi-track generation and extraction.
# Instrument families represented:
#   wind       - woodwinds, brass
#   electronic - fx (effects), synth (synthesizer)
#   string     - strings, guitar, bass
#   rhythm     - percussion, drums, keyboard
#   vocal      - backing_vocals, vocals (lead vocals)
# NOTE: the list order below does not follow the family grouping; it is kept
# exactly as defined.
TRACK_NAMES = [
    "woodwinds",
    "brass",
    "fx",
    "synth",
    "strings",
    "percussion",
    "keyboard",
    "guitar",
    "bass",
    "drums",
    "backing_vocals",
    "vocals",
]
# Prompt template for SFT (Supervised Fine-Tuning) models.
# The three positional `{}` slots sit under the "# Instruction", "# Caption"
# and "# Metas" headings, so they are filled in that order, e.g.
#   SFT_GEN_PROMPT.format(instruction, caption, metas)
# The text ends with the "<|endoftext|>" marker followed by a newline.
# NOTE(review): every line break inside the literal is part of the prompt
# text - do not reflow or re-indent it.
SFT_GEN_PROMPT = """# Instruction
{}
# Caption
{}
# Metas
{}<|endoftext|>
"""
# ==============================================================================
# GPU Memory Configuration Constants
# ==============================================================================
# Upper memory bound, in GB, of each GPU tier:
#   tier1: up to 4     tier2: 4-6      tier3: 6-8
#   tier4: 8-12        tier5: 12-16    tier6: 16-24
# GPUs beyond the last threshold have no entry here ("unlimited").
# NOTE(review): whether a GPU sitting exactly on a threshold belongs to the
# lower or the upper tier is decided by the code that reads this table -
# confirm there.
GPU_TIER_THRESHOLDS = {
    "tier1": 4,
    "tier2": 6,
    "tier3": 8,
    "tier4": 12,
    "tier5": 16,
    "tier6": 24,
}
# Memory requirement, in GB, of each LM model, keyed by its size label.
LM_MODEL_MEMORY_GB = {
    "0.6B": 3.0,
    "1.7B": 8.0,
    "4B": 12.0,
}
# Size label -> LM model name (same keys as LM_MODEL_MEMORY_GB).
LM_MODEL_NAMES = {
    "0.6B": "acestep-5Hz-lm-0.6B",
    "1.7B": "acestep-5Hz-lm-1.7B",
    "4B": "acestep-5Hz-lm-4B",
}
# ==============================================================================
# Debug Constants
# ==============================================================================
# Tensor debug mode; one of "OFF", "ON" or "VERBOSE".
TENSOR_DEBUG_MODE: str = "OFF"
# Placeholder debug switches for the other main areas, all "OFF" by default.
# Rename or repurpose them as features start using them.
DEBUG_API_SERVER: str = "OFF"
DEBUG_INFERENCE: str = "OFF"
DEBUG_TRAINING: str = "OFF"
DEBUG_DATASET: str = "OFF"
DEBUG_AUDIO: str = "OFF"
DEBUG_LLM: str = "OFF"
DEBUG_UI: str = "OFF"
DEBUG_MODEL_LOADING: str = "OFF"
DEBUG_GPU: str = "OFF"
|