# NOTE: removed non-code page-banner text ("Spaces: Running on Zero") that was
# captured by extraction and is not part of this module.
"""
Constants for ACE-Step

Centralized constants used across the codebase
"""

# ==============================================================================
# Language Constants
# ==============================================================================
# Supported languages for vocal generation and language detection.
# Covers major world languages with good TTS support in the underlying model;
# 'unknown' is used when language cannot be determined automatically.
# Stored as a whitespace-separated string and split once at import time.
VALID_LANGUAGES = (
    "ar az bg bn ca cs da de el en "
    "es fa fi fr he hi hr ht hu id "
    "is it ja ko la lt ms ne nl no "
    "pa pl pt ro ru sa sk sr sv sw "
    "ta te th tl tr uk ur vi yue zh "
    "unknown"
).split()
# ==============================================================================
# Keyscale Constants
# ==============================================================================
# Musical note names using standard Western notation
KEYSCALE_NOTES = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
# Supported accidentals: natural (empty), ASCII sharp/flat, Unicode sharp/flat
KEYSCALE_ACCIDENTALS = ['', '#', 'b', '♯', '♭']
# Major and minor scale modes
KEYSCALE_MODES = ['major', 'minor']
# All valid keyscales: 7 notes × 5 accidentals × 2 modes = 70 combinations.
# Examples: "C major", "F# minor", "B♭ major"
# Built with a set comprehension so no throwaway loop variables leak into the
# module namespace (the previous nested for-loops left note/acc/mode behind).
VALID_KEYSCALES = {
    f"{note}{acc} {mode}"
    for note in KEYSCALE_NOTES
    for acc in KEYSCALE_ACCIDENTALS
    for mode in KEYSCALE_MODES
}
# ==============================================================================
# Metadata Range Constants
# ==============================================================================
# BPM (Beats Per Minute) bounds — covers most musical styles:
#   30 BPM: very slow ballads, ambient music
#   300 BPM: fast electronic dance music, extreme metal
BPM_MIN, BPM_MAX = 30, 300

# Duration bounds in seconds — balances quality vs. computational cost:
#   10 s: short loops, musical excerpts
#   600 s: full songs, extended compositions (10 minutes)
DURATION_MIN, DURATION_MAX = 10, 600

# Valid time signatures (numerator only) — common musical meter patterns:
#   2 -> 2/4 (marches, polka), 3 -> 3/4 (waltzes, ballads),
#   4 -> 4/4 (most pop, rock, hip-hop), 6 -> 6/8 (compound time, folk dances)
VALID_TIME_SIGNATURES = [2, 3, 4, 6]
# ==============================================================================
# Task Type Constants
# ==============================================================================
# All supported generation tasks across different model variants:
# - text2music: generate from text descriptions
# - repaint: selective audio editing/regeneration
# - cover: style transfer using reference audio
# - extract: separate individual tracks/stems from audio
# - lego: multi-track generation (add layers)
# - complete: automatic completion of partial audio
TASK_TYPES = ["text2music", "repaint", "cover", "extract", "lego", "complete"]
# Turbo models support only the first three tasks (optimized subset for speed).
# Derived from TASK_TYPES (rather than re-typed) so the lists cannot drift.
TASK_TYPES_TURBO = TASK_TYPES[:3]
# Base models support the full feature set; the extra tasks (extract/lego/
# complete) require more computational resources. Copied so mutating one list
# cannot affect the other.
TASK_TYPES_BASE = list(TASK_TYPES)
# ==============================================================================
# Instruction Constants
# ==============================================================================
# Default instructions for the DiT and LM generation paths.
DEFAULT_DIT_INSTRUCTION = "Fill the audio semantic mask based on the given conditions:"
DEFAULT_LM_INSTRUCTION = "Generate audio semantic tokens based on the given conditions:"
DEFAULT_LM_UNDERSTAND_INSTRUCTION = "Understand the given musical conditions and describe the audio semantics accordingly:"
DEFAULT_LM_INSPIRED_INSTRUCTION = "Expand the user's input into a more detailed and specific musical description:"
DEFAULT_LM_REWRITE_INSTRUCTION = "Format the user's input into a more detailed and specific musical description:"
# Instruction templates for each task type.
# "text2music" and "cover" reference the default constants above instead of
# repeating the strings, so the two copies cannot drift apart.
# Note: some instructions use placeholders like {TRACK_NAME} or {TRACK_CLASSES};
# format them with .format() or f-strings when used. The "*_default" variants
# apply when no placeholder value is available.
TASK_INSTRUCTIONS = {
    "text2music": DEFAULT_DIT_INSTRUCTION,
    "repaint": "Repaint the mask area based on the given conditions:",
    "cover": DEFAULT_LM_INSTRUCTION,
    "extract": "Extract the {TRACK_NAME} track from the audio:",
    "extract_default": "Extract the track from the audio:",
    "lego": "Generate the {TRACK_NAME} track based on the audio context:",
    "lego_default": "Generate the track based on the audio context:",
    "complete": "Complete the input track with {TRACK_CLASSES}:",
    "complete_default": "Complete the input track:",
}
# ==============================================================================
# Track/Instrument Constants
# ==============================================================================
# Supported instrumental track types for multi-track generation and extraction,
# ordered by instrument family: winds (woodwinds, brass), electronic (fx,
# synth), strings (strings, guitar, bass), rhythm section (percussion,
# keyboard, drums), and vocals (backing_vocals, vocals).
TRACK_NAMES = (
    "woodwinds brass fx synth strings percussion "
    "keyboard guitar bass drums backing_vocals vocals"
).split()

# Template for SFT (Supervised Fine-Tuning) model prompts. The three
# positional {} slots take, in order: instruction, caption, metadata.
SFT_GEN_PROMPT = """# Instruction
{}
# Caption
{}
# Metas
{}<|endoftext|>
"""
# ==============================================================================
# GPU Memory Configuration Constants
# ==============================================================================
# GPU tier upper bounds in GB: tier1 <= 4GB, tier2 <= 6GB, ... tier6 <= 24GB.
# Anything >= 24GB falls through to "unlimited" (no entry here).
GPU_TIER_THRESHOLDS = {
    f"tier{tier}": gb
    for tier, gb in enumerate((4, 6, 8, 12, 16, 24), start=1)
}
# Approximate memory footprint (in GB) required to run each LM model size.
LM_MODEL_MEMORY_GB = dict(zip(("0.6B", "1.7B", "4B"), (3.0, 8.0, 12.0)))
# Checkpoint name for each LM model size (shared "acestep-5Hz-lm-" prefix).
LM_MODEL_NAMES = {size: f"acestep-5Hz-lm-{size}" for size in ("0.6B", "1.7B", "4B")}
# ==============================================================================
# Debug Constants
# ==============================================================================
# Tensor debug mode; valid values: "OFF" | "ON" | "VERBOSE".
TENSOR_DEBUG_MODE = "OFF"
# Placeholder debug switches for other main functionality, all defaulting to
# "OFF" (chained assignment — strings are immutable, so sharing is harmless).
# Update names/usage as features adopt them.
DEBUG_API_SERVER = DEBUG_INFERENCE = DEBUG_TRAINING = "OFF"
DEBUG_DATASET = DEBUG_AUDIO = DEBUG_LLM = "OFF"
DEBUG_UI = DEBUG_MODEL_LOADING = DEBUG_GPU = "OFF"