Upload 8 files
- backend/__init__.py +8 -0
- backend/audio_mixer.py +130 -0
- backend/config.py +150 -0
- backend/denoise.py +77 -0
- backend/i18n.py +113 -0
- backend/metadata_generator.py +236 -0
- backend/multi_voice_engine.py +213 -0
- backend/score_parser.py +251 -0
backend/__init__.py
ADDED
@@ -0,0 +1,8 @@
"""
SolfegeScoreSinger Backend Modules
"""

from .config import get_model, get_default_voice_path
from .i18n import I18n

__all__ = ['get_model', 'get_default_voice_path', 'I18n']
backend/audio_mixer.py
ADDED
@@ -0,0 +1,130 @@
"""
Audio Mixer Module
Mixes multiple voice tracks into a single output
"""

import numpy as np
from typing import List, Optional


def mix_voices(
    voice_audios: List[np.ndarray],
    method: str = "sum",
    normalize: bool = True
) -> np.ndarray:
    """
    Mix multiple voice audio tracks.

    Args:
        voice_audios: List of audio arrays (one per voice)
        method: Mixing method ("sum", "average", "weighted")
        normalize: Whether to normalize output

    Returns:
        Mixed audio array
    """
    if not voice_audios:
        return np.zeros(44100)  # 1 second silence

    if len(voice_audios) == 1:
        audio = voice_audios[0]
        if normalize:
            audio = normalize_audio(audio)
        return audio

    # Find maximum length
    max_length = max(len(audio) for audio in voice_audios)

    # Pad shorter audios with silence
    padded_audios = []
    for audio in voice_audios:
        if len(audio) < max_length:
            padding = np.zeros(max_length - len(audio))
            padded_audio = np.concatenate([audio, padding])
        else:
            padded_audio = audio
        padded_audios.append(padded_audio)

    # Mix
    if method == "sum":
        mixed = np.sum(padded_audios, axis=0)
    elif method == "average":
        mixed = np.mean(padded_audios, axis=0)
    elif method == "weighted":
        # Weight by inverse of energy (quieter voices get higher weight)
        energies = [np.sum(audio ** 2) for audio in padded_audios]
        weights = [1.0 / (e + 1e-10) for e in energies]
        total_weight = sum(weights)
        weights = [w / total_weight for w in weights]

        mixed = np.zeros(max_length)
        for audio, weight in zip(padded_audios, weights):
            mixed += audio * weight
    else:
        mixed = np.sum(padded_audios, axis=0)

    # Normalize
    if normalize:
        mixed = normalize_audio(mixed)

    return mixed


def normalize_audio(audio: np.ndarray, target_db: float = -3.0) -> np.ndarray:
    """
    Normalize audio to target dB level.

    Args:
        audio: Audio array
        target_db: Target dB level (default -3.0 dB)

    Returns:
        Normalized audio
    """
    # Calculate current RMS
    rms = np.sqrt(np.mean(audio ** 2))

    if rms < 1e-10:
        return audio  # Avoid division by zero

    # Calculate target RMS
    target_rms = 10 ** (target_db / 20) * 0.1

    # Apply gain
    gain = target_rms / rms
    normalized = audio * gain

    # Clip to prevent overflow
    normalized = np.clip(normalized, -1.0, 1.0)

    return normalized


def apply_fade(audio: np.ndarray, fade_in: float = 0.01, fade_out: float = 0.01, sample_rate: int = 44100) -> np.ndarray:
    """
    Apply fade in/out to audio.

    Args:
        audio: Audio array
        fade_in: Fade in duration (seconds)
        fade_out: Fade out duration (seconds)
        sample_rate: Sample rate

    Returns:
        Audio with fades applied
    """
    audio = audio.copy()

    # Fade in
    fade_in_samples = int(fade_in * sample_rate)
    if fade_in_samples > 0 and fade_in_samples < len(audio):
        fade_in_curve = np.linspace(0, 1, fade_in_samples)
        audio[:fade_in_samples] *= fade_in_curve

    # Fade out
    fade_out_samples = int(fade_out * sample_rate)
    if fade_out_samples > 0 and fade_out_samples < len(audio):
        fade_out_curve = np.linspace(1, 0, fade_out_samples)
        audio[-fade_out_samples:] *= fade_out_curve

    return audio
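For reference, a minimal usage sketch for this module (the test tones here are illustrative, not part of the diff; the shorter track is zero-padded to the longest one by mix_voices, as implemented above):

# Sketch: mix two tones of different lengths, then fade the result.
import numpy as np
from backend.audio_mixer import mix_voices, apply_fade

sr = 44100
t1 = np.arange(sr) / sr                       # 1.0 s
t2 = np.arange(sr // 2) / sr                  # 0.5 s
a = 0.5 * np.sin(2 * np.pi * 440.0 * t1)      # A4
b = 0.5 * np.sin(2 * np.pi * 523.25 * t2)     # C5, padded to 1.0 s internally

mixed = mix_voices([a, b], method="average", normalize=True)
mixed = apply_fade(mixed, fade_in=0.01, fade_out=0.05, sample_rate=sr)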
backend/config.py
ADDED
@@ -0,0 +1,150 @@
"""
Configuration and Model Management
Implements lazy loading to save memory on CPU environment
"""

import os
import torch

# ============================================================================
# Environment Optimization (CPU)
# ============================================================================

os.environ["OMP_NUM_THREADS"] = "4"
os.environ["TORCH_NUM_THREADS"] = "4"
os.environ["MKL_NUM_THREADS"] = "4"


# ============================================================================
# Global Model Instance (Lazy Loading)
# ============================================================================

_model = None


def get_model():
    """
    Lazy load SoulX-Singer model.
    Avoids loading on startup to save memory.

    Returns:
        SoulX-Singer model instance
    """
    global _model

    if _model is None:
        print("Loading SoulX-Singer model on CPU...")

        # Import model from soulxsinger directory
        import sys
        base_path = os.path.dirname(__file__)
        soulx_path = os.path.join(base_path, '..', 'soulxsinger')
        cli_path = os.path.join(base_path, '..', 'cli')

        # Add paths to sys.path
        if os.path.exists(soulx_path):
            sys.path.insert(0, os.path.dirname(soulx_path))
        if os.path.exists(cli_path):
            sys.path.insert(0, os.path.dirname(cli_path))

        from cli.inference import SoulX_Singer

        # Check for model weights - auto-download if missing
        model_weights_path = os.path.join(base_path, '..', 'pretrained_models', 'SoulX-Singer', 'model.pt')

        if not os.path.exists(model_weights_path):
            print("⚠️ Model weights not found!")
            print("🔄 Attempting automatic download from HuggingFace Hub...")

            try:
                # Install huggingface-hub if not already installed
                import subprocess
                subprocess.check_call(['pip', 'install', '-q', 'huggingface-hub'])

                # Download model weights
                from huggingface_hub import snapshot_download
                model_dir = os.path.join(base_path, '..', 'pretrained_models', 'SoulX-Singer')
                os.makedirs(model_dir, exist_ok=True)

                print("⬇️ Downloading SoulX-Singer model (~1.5GB)...")
                snapshot_download(
                    repo_id='Soul-AILab/SoulX-Singer',
                    local_dir=model_dir,
                    local_dir_use_symlinks=False,
                    ignore_patterns=['*.md', '*.txt', 'LICENSE', 'config/**', 'utils/**', 'scripts/**']
                )
                print("✅ Model downloaded successfully!")

            except Exception as e:
                print(f"❌ Auto-download failed: {e}")
                print("Please manually download model.pt from:")
                print("https://huggingface.co/Soul-AILab/SoulX-Singer")
                print("And place it at: pretrained_models/SoulX-Singer/model.pt")
                raise FileNotFoundError("Model weights not found and auto-download failed. See instructions above.")

        # Load with INT8 quantization for CPU optimization
        _model = SoulX_Singer(
            config_path=os.path.join(soulx_path, "config", "soulxsinger.yaml"),
            checkpoint_path=model_weights_path,
            device='cpu',
            dtype=torch.int8  # INT8 quantization
        )

        print("✅ Model loaded successfully!")

    return _model


def clear_model():
    """
    Clear model from memory.
    Call this when generation is complete to free resources.
    """
    global _model

    if _model is not None:
        del _model
        _model = None

        import gc
        gc.collect()

        print("✅ Model memory cleared")


def get_default_voice_path() -> str:
    """
    Get path to default voice samples (child voice).

    Returns:
        Path to DefaultVoice_Child directory
    """
    return os.path.join(os.path.dirname(__file__), '..', 'DefaultVoice_Child')


def get_cpu_warning() -> str:
    """
    Get CPU environment warning message.

    Returns:
        Warning text about CPU generation time
    """
    return "CPU Environment: Generation may take 5-10 min per second of audio"


# ============================================================================
# Model Inference Settings
# ============================================================================

INFERENCE_CONFIG = {
    'n_steps': 12,           # Reduced steps for CPU (default 32)
    'cfg': 3.0,              # CFG scale
    'control': 'score',      # Score-controlled mode
    'use_fp16': False,       # FP16 not supported on CPU
    'segment_duration': 8.0  # Max segment duration (seconds)
}


def get_inference_config():
    """Get default inference configuration for CPU"""
    return INFERENCE_CONFIG.copy()
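The lazy-loading contract above implies a simple caller lifecycle; a sketch, assuming the package is importable as backend from the app root:

# Sketch: load on first use, release when done.
from backend.config import get_model, clear_model, get_inference_config

cfg = get_inference_config()   # a copy of INFERENCE_CONFIG; safe to tweak per request
model = get_model()            # first call loads (and may download) the weights
# ... run synthesis with `model` and `cfg` ...
clear_model()                  # free memory once generation is finished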
backend/denoise.py
ADDED
@@ -0,0 +1,77 @@
"""
Denoise Module
Provides optional audio denoising using noisereduce
"""

import numpy as np
from typing import Dict, Optional


def denoise_audio(
    audio: np.ndarray,
    sample_rate: int = 44100,
    prop_decrease: float = 0.5,
    stationary: bool = True
) -> np.ndarray:
    """
    Apply noise reduction to audio.

    Args:
        audio: Audio array
        sample_rate: Sample rate
        prop_decrease: Proportion of noise to decrease (0.0-1.0)
        stationary: Whether noise is stationary

    Returns:
        Denoised audio
    """
    try:
        import noisereduce as nr

        denoised = nr.reduce_noise(
            y=audio,
            sr=sample_rate,
            prop_decrease=prop_decrease,
            stationary=stationary
        )

        return denoised

    except ImportError:
        print("Warning: noisereduce not installed, returning original audio")
        return audio
    except Exception as e:
        print(f"Error in denoise_audio: {e}")
        return audio


def detect_noise_profile(
    audio: np.ndarray,
    sample_rate: int = 44100,
    noise_duration: float = 0.5
) -> Optional[Dict]:
    """
    Detect noise profile from the beginning of the audio.

    Args:
        audio: Audio array
        sample_rate: Sample rate
        noise_duration: Duration to analyze for noise (seconds)

    Returns:
        Noise profile dict with 'rms' and 'segment' keys, or None if the
        audio is shorter than the analysis window
    """
    noise_samples = int(noise_duration * sample_rate)

    if len(audio) < noise_samples:
        return None

    noise_segment = audio[:noise_samples]

    # Calculate noise statistics
    noise_rms = np.sqrt(np.mean(noise_segment ** 2))

    return {
        'rms': noise_rms,
        'segment': noise_segment
    }
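Since noisereduce is a soft dependency, denoise_audio degrades gracefully when it is missing; a usage sketch with a hypothetical input file:

# Sketch: conservative denoising pass on a recorded sample.
import soundfile as sf
from backend.denoise import denoise_audio

audio, sr = sf.read("recording.wav")   # hypothetical user recording
clean = denoise_audio(audio, sample_rate=sr, prop_decrease=0.5)
sf.write("recording_denoised.wav", clean, sr)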
backend/i18n.py
ADDED
@@ -0,0 +1,113 @@
"""
Internationalization (i18n) Module
Supports English, Chinese, and Japanese
"""

import json
import os
from typing import Dict, Optional


class I18n:
    """Multi-language support class"""

    def __init__(self, default_lang: str = 'en'):
        """
        Initialize i18n module.

        Args:
            default_lang: Default language code ('en', 'zh', 'ja')
        """
        self.current_lang = default_lang
        self.translations = self._load_translations()

    def _load_translations(self) -> Dict:
        """Load translation files from locales directory"""
        translations = {}
        locales_dir = os.path.join(os.path.dirname(__file__), '..', 'locales')

        if not os.path.exists(locales_dir):
            print(f"Warning: Locales directory not found: {locales_dir}")
            return self._get_default_translations()

        for lang_file in os.listdir(locales_dir):
            if lang_file.endswith('.json'):
                lang_code = lang_file.replace('.json', '')
                file_path = os.path.join(locales_dir, lang_file)

                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        translations[lang_code] = json.load(f)
                except Exception as e:
                    print(f"Error loading {lang_file}: {e}")

        # Fallback to default if no translations loaded
        if not translations:
            translations = self._get_default_translations()

        return translations

    def _get_default_translations(self) -> Dict:
        """Get default English translations (fallback)"""
        return {
            'en': {
                'title': '🎵 SolfegeScoreSinger - AI Singing Synthesis',
                'record_tab': 'Record Samples',
                'upload_tab': 'Upload Score',
                'config_tab': 'Configuration',
                'generate_tab': 'Generate & Download',
                'syllables': ['Do', 'Re', 'Mi', 'Fa', 'Sol', 'La', 'Ti'],
                'record_instruction': 'Record 7 solfege syllables to clone your voice',
                'upload_score': 'Upload Score (MIDI/MusicXML)',
                'voice_mode': 'Voice Mode',
                'my_recording': 'My Recording',
                'child_voice': 'Child Voice (Built-in)',
                'solfege_mode': 'Solfege Mode',
                'movable_do': 'Movable Do (首调)',
                'fixed_do': 'Fixed Do (固定调)',
                'denoise': 'Enable Denoising',
                'denoise_note': 'Default: No denoising (fidelity priority)',
                'generate': 'Generate Audio',
                'download': 'Download Audio',
                'cpu_warning': 'CPU Environment: Generation may take 5-10 min per second of audio',
                'validating': 'Validating inputs...',
                'parsing_score': 'Parsing score...',
                'preparing_samples': 'Preparing voice samples...',
                'generating_metadata': 'Generating metadata...',
                'loading_model': 'Loading AI model...',
                'mixing_voices': 'Mixing voices...',
                'saving_audio': 'Saving audio file...'
            }
        }

    def set_language(self, lang: str):
        """Set current language"""
        if lang in self.translations:
            self.current_lang = lang
        else:
            print(f"Warning: Language '{lang}' not found, using default")

    def t(self, key: str) -> str:
        """
        Translate a key to current language.

        Args:
            key: Translation key (supports nested keys with '.')

        Returns:
            Translated text
        """
        keys = key.split('.')
        value = self.translations.get(self.current_lang, {})

        for k in keys:
            if isinstance(value, dict):
                value = value.get(k, key)
            else:
                return key

        return value if isinstance(value, str) else key

    def get_all_texts(self) -> Dict:
        """Get all texts for current language"""
        return self.translations.get(self.current_lang, self._get_default_translations()['en'])
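A short usage sketch, using keys from the fallback table above (per-language JSON files are assumed to live in locales/ next to the backend package):

# Sketch: look up UI strings and switch languages.
from backend.i18n import I18n

i18n = I18n(default_lang='en')
print(i18n.t('generate'))      # "Generate Audio" (from the fallback table)
i18n.set_language('zh')        # warns and keeps 'en' if zh.json was not loaded
labels = i18n.get_all_texts()  # full dict for building the UI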
backend/metadata_generator.py
ADDED
@@ -0,0 +1,236 @@
"""
Metadata Generator Module
Generates SoulX-Singer metadata from score and voice samples
"""

import os
import numpy as np
import soundfile as sf
from typing import Dict, List, Optional
from .score_parser import SOLFEGE_SYLLABLES


def prepare_voice_samples(
    voice_mode: str,
    user_samples: Optional[Dict[str, str]],
    enable_denoise: bool
) -> Dict[str, np.ndarray]:
    """
    Prepare voice samples for synthesis.

    Args:
        voice_mode: "My Recording" or "Child Voice"
        user_samples: Dict mapping syllable to audio file path
        enable_denoise: Whether to apply denoising

    Returns:
        Dict mapping syllable to audio array
    """
    from .config import get_default_voice_path

    samples = {}

    if voice_mode in ("Child Voice (Built-in)", "童声音色 (内置)", "子供の声 (内蔵)"):
        # Load default voice
        default_path = get_default_voice_path()

        for syllable in SOLFEGE_SYLLABLES:
            # Capitalize first letter for filename
            filename = syllable.capitalize() + '.wav'
            file_path = os.path.join(default_path, filename)

            if os.path.exists(file_path):
                audio, sr = sf.read(file_path)
                samples[syllable] = audio
            else:
                print(f"Warning: Default voice file not found: {file_path}")

    elif user_samples:
        # Load user recorded samples
        for syllable in SOLFEGE_SYLLABLES:
            file_path = user_samples.get(syllable)

            if file_path and os.path.exists(file_path):
                audio, sr = sf.read(file_path)

                # Apply denoising if enabled
                if enable_denoise:
                    audio = apply_denoise(audio, sr)

                samples[syllable] = audio

    return samples


def apply_denoise(audio: np.ndarray, sample_rate: int) -> np.ndarray:
    """
    Apply conservative denoising using noisereduce.

    Args:
        audio: Audio array
        sample_rate: Sample rate

    Returns:
        Denoised audio
    """
    try:
        import noisereduce as nr
        return nr.reduce_noise(y=audio, sr=sample_rate, prop_decrease=0.5)
    except ImportError:
        print("Warning: noisereduce not installed, skipping denoising")
        return audio


def generate_metadata_for_voices(
    voices: List[Dict],
    voice_samples: Dict[str, np.ndarray]
) -> List[Dict]:
    """
    Generate SoulX-Singer metadata for each voice.

    Args:
        voices: List of voice data from score parser
        voice_samples: Dict of syllable -> audio array

    Returns:
        List of metadata dicts for SoulX-Singer
    """
    metadata_list = []

    for voice in voices:
        notes = voice['notes']

        # Create prompt audio by concatenating solfege samples
        prompt_audio = create_prompt_audio(notes, voice_samples)

        # Create target metadata
        target_metadata = create_target_metadata(notes)

        metadata = {
            'voice_id': voice['id'],
            'instrument': voice['instrument'],
            'prompt_audio': prompt_audio,
            'target': target_metadata
        }

        metadata_list.append(metadata)

    return metadata_list


def create_prompt_audio(notes: List[Dict], voice_samples: Dict[str, np.ndarray]) -> np.ndarray:
    """
    Create prompt audio by concatenating voice samples.

    Strategy:
    - Use the first few notes' solfege to create a representative prompt
    - Aim for ~3-5 seconds of prompt audio

    Args:
        notes: List of notes for this voice
        voice_samples: Dict of syllable -> audio array

    Returns:
        Concatenated prompt audio
    """
    # Get unique solfeges from first few notes
    solfeges = []
    for note in notes[:10]:
        solfege = note['solfege']
        if solfege not in solfeges and solfege in voice_samples:
            solfeges.append(solfege)

    # Use at least 3 different syllables
    if len(solfeges) < 3:
        for syllable in SOLFEGE_SYLLABLES:
            if syllable not in solfeges and syllable in voice_samples:
                solfeges.append(syllable)
                if len(solfeges) >= 3:
                    break

    # Concatenate samples with small gaps
    prompt_segments = []
    for syllable in solfeges[:5]:
        if syllable in voice_samples:
            sample = voice_samples[syllable]
            prompt_segments.append(sample)

            # Add small gap (50ms silence)
            gap = np.zeros(int(44100 * 0.05))
            prompt_segments.append(gap)

    if prompt_segments:
        return np.concatenate(prompt_segments)
    else:
        # Fallback: use first available sample
        for sample in voice_samples.values():
            return sample

        return np.zeros(44100)  # 1 second silence


def create_target_metadata(notes: List[Dict]) -> Dict:
    """
    Create target metadata for SoulX-Singer.

    Args:
        notes: List of notes

    Returns:
        Target metadata dict
    """
    # Convert notes to SoulX format
    phonemes = []
    note_pitches = []
    note_durations = []
    note_types = []

    for note in notes:
        solfege = note['solfege']
        midi_num = note['midi']
        duration = note['duration']

        # Phoneme (simplified - just use solfege name)
        phoneme = solfege_to_phoneme(solfege)
        phonemes.append(phoneme)

        # Pitch
        note_pitches.append(midi_num)

        # Duration (in frames; assume 256 samples per frame)
        note_durations.append(int(duration * 44100 / 256))

        # Note type (1 = regular)
        note_types.append(1)

    return {
        'phoneme': phonemes,
        'note_pitch': note_pitches,
        'note_duration': note_durations,
        'note_type': note_types,
        'duration': sum(note['duration'] for note in notes)
    }


def solfege_to_phoneme(solfege: str) -> str:
    """
    Convert solfege syllable to phoneme.

    Args:
        solfege: Solfege syllable (do, re, mi, fa, sol, la, ti)

    Returns:
        Phoneme string
    """
    # ARPAbet phonemes (simplified)
    SOLFEGE_TO_PHONEME = {
        'do': 'd ow',
        're': 'r ey',
        'mi': 'm iy',
        'fa': 'f aa',
        'sol': 's ow l',
        'la': 'l aa',
        'ti': 't iy'
    }

    return SOLFEGE_TO_PHONEME.get(solfege, 'd ow')
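To illustrate the metadata shape, a sketch building a target for a two-note phrase (the note dicts use the keys emitted by score_parser.parse_score_with_solfege; the frame counts follow the int(duration * 44100 / 256) conversion above):

# Sketch: target metadata for C4 (0.5 s) followed by G4 (1.0 s).
from backend.metadata_generator import create_target_metadata

notes = [
    {'solfege': 'do', 'midi': 60, 'duration': 0.5},
    {'solfege': 'sol', 'midi': 67, 'duration': 1.0},
]
meta = create_target_metadata(notes)
assert meta['phoneme'] == ['d ow', 's ow l']
assert meta['note_duration'] == [86, 172]   # int(0.5*44100/256), int(1.0*44100/256)
assert meta['duration'] == 1.5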
backend/multi_voice_engine.py
ADDED
@@ -0,0 +1,213 @@
"""
Multi-Voice Engine Module
Handles SoulX-Singer model inference for multiple voices
Implements segment-based processing for long scores
"""

import numpy as np
import torch
from typing import Dict, List, Optional, Callable
import gc

from .config import get_inference_config


class MultiVoiceEngine:
    """
    Multi-voice synthesis engine using SoulX-Singer.

    Features:
    - Segment-based processing for long scores (≤8s per segment)
    - Memory management with garbage collection
    - Progress callback support
    """

    def __init__(self, model):
        """
        Initialize engine with SoulX-Singer model.

        Args:
            model: SoulX-Singer model instance
        """
        self.model = model
        self.config = get_inference_config()

    def generate_single_voice(
        self,
        metadata: Dict,
        on_progress: Optional[Callable[[float], None]] = None
    ) -> np.ndarray:
        """
        Generate audio for a single voice.

        Args:
            metadata: Voice metadata from metadata_generator
            on_progress: Progress callback function

        Returns:
            Generated audio array
        """
        target = metadata['target']
        prompt_audio = metadata['prompt_audio']

        # Check if segmentation is needed
        total_duration = target['duration']
        segment_duration = self.config['segment_duration']

        if total_duration <= segment_duration:
            # Single segment
            return self._generate_segment(prompt_audio, target, on_progress)
        else:
            # Multiple segments
            return self._generate_segments(prompt_audio, target, on_progress)

    def _generate_segment(
        self,
        prompt_audio: np.ndarray,
        target: Dict,
        on_progress: Optional[Callable[[float], None]] = None
    ) -> np.ndarray:
        """
        Generate a single segment (≤8 seconds).

        Args:
            prompt_audio: Prompt audio array
            target: Target metadata
            on_progress: Progress callback

        Returns:
            Generated audio for this segment
        """
        try:
            # Prepare model input
            infer_data = {
                'prompt': {
                    'waveform': torch.from_numpy(prompt_audio).float(),
                    'phoneme': self._phonemes_to_tensor(target['phoneme'][:len(prompt_audio) // 100]),
                    'note_pitch': torch.tensor(target['note_pitch'][:len(prompt_audio) // 100]),
                    'note_type': torch.tensor(target['note_type'][:len(prompt_audio) // 100])
                },
                'target': {
                    'phoneme': self._phonemes_to_tensor(target['phoneme']),
                    'note_pitch': torch.tensor(target['note_pitch']),
                    'note_type': torch.tensor(target['note_type'])
                }
            }

            # Run inference
            with torch.no_grad():
                output = self.model.infer(
                    infer_data,
                    auto_shift=False,
                    pitch_shift=0,
                    n_steps=self.config['n_steps'],
                    cfg=self.config['cfg'],
                    control=self.config['control'],
                    use_fp16=self.config['use_fp16']
                )

            # Clean up
            del infer_data
            gc.collect()

            if on_progress:
                on_progress(100.0)

            return output.cpu().numpy() if torch.is_tensor(output) else output

        except Exception as e:
            print(f"Error in _generate_segment: {e}")
            # Fallback: return silence
            duration = target.get('duration', 1.0)
            return np.zeros(int(44100 * duration))

    def _generate_segments(
        self,
        prompt_audio: np.ndarray,
        target: Dict,
        on_progress: Optional[Callable[[float], None]] = None
    ) -> np.ndarray:
        """
        Generate multiple segments and concatenate.

        Args:
            prompt_audio: Prompt audio
            target: Target metadata
            on_progress: Progress callback

        Returns:
            Concatenated generated audio
        """
        total_duration = target['duration']
        segment_duration = self.config['segment_duration']
        num_segments = int(np.ceil(total_duration / segment_duration))

        segments = []

        for i in range(num_segments):
            # Extract segment metadata
            start_time = i * segment_duration
            end_time = min((i + 1) * segment_duration, total_duration)

            segment_target = self._extract_segment(target, start_time, end_time)

            # Generate this segment
            segment_audio = self._generate_segment(prompt_audio, segment_target)
            segments.append(segment_audio)

            # Update progress
            if on_progress:
                progress = (i + 1) / num_segments * 100
                on_progress(progress)

            # Memory cleanup
            gc.collect()

        # Concatenate segments
        return np.concatenate(segments)

    def _extract_segment(
        self,
        target: Dict,
        start_time: float,
        end_time: float
    ) -> Dict:
        """
        Extract a time segment from target metadata.

        Args:
            target: Full target metadata
            start_time: Segment start time (seconds)
            end_time: Segment end time (seconds)

        Returns:
            Segment metadata
        """
        # Simplified: just return full target for now
        # TODO: Implement proper time-based extraction
        return {
            'phoneme': target['phoneme'],
            'note_pitch': target['note_pitch'],
            'note_type': target['note_type'],
            'duration': end_time - start_time
        }

    def _phonemes_to_tensor(self, phonemes: List[str]) -> torch.Tensor:
        """
        Convert phoneme list to tensor.

        Args:
            phonemes: List of phoneme strings

        Returns:
            Phoneme tensor
        """
        # Simplified: convert to indices
        # TODO: Use proper phoneme vocabulary
        phoneme_to_idx = {
            'd ow': 0, 'r ey': 1, 'm iy': 2, 'f aa': 3,
            's ow l': 4, 'l aa': 5, 't iy': 6
        }

        indices = [phoneme_to_idx.get(p, 0) for p in phonemes]
        return torch.tensor(indices, dtype=torch.long)
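A sketch of how the engine is expected to be driven, tying the modules together (voices and voice_samples come from the score-parsing and sample-preparation steps above; actual inference depends on the SoulX-Singer model being available and is CPU-heavy):

# Sketch: generate audio for the first voice with a progress printout.
from backend.config import get_model
from backend.metadata_generator import generate_metadata_for_voices
from backend.multi_voice_engine import MultiVoiceEngine

engine = MultiVoiceEngine(get_model())
metadata_list = generate_metadata_for_voices(voices, voice_samples)
audio = engine.generate_single_voice(
    metadata_list[0],
    on_progress=lambda p: print(f"voice 0: {p:.0f}%")
)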
backend/score_parser.py
ADDED
@@ -0,0 +1,251 @@
"""
Score Parser Module
Supports MIDI and MusicXML formats
Implements key detection and solfege mapping
"""

import os
from typing import Dict, List, Optional, Tuple
import tempfile

# Solfege syllables (CORRECTED: 'sol' not 'so')
SOLFEGE_SYLLABLES = ['do', 're', 'mi', 'fa', 'sol', 'la', 'ti']

# Reference pitches (C4 octave)
REFERENCE_PITCHES = {
    'do': 261.63, 're': 293.66, 'mi': 329.63, 'fa': 349.23,
    'sol': 392.00, 'la': 440.00, 'ti': 493.88
}


def quick_parse_score(file_path: str) -> Dict:
    """
    Quick-parse a score to get basic info (duration, voice count).
    Used for time estimation.

    Args:
        file_path: Path to MIDI or MusicXML file

    Returns:
        {
            'duration': float (seconds),
            'voice_count': int,
            'key': str
        }
    """
    try:
        # Try music21 first
        from music21 import converter

        score = converter.parse(file_path)
        duration = score.duration.quarterLength / 2  # Rough estimate (120 BPM)
        voice_count = len(score.parts) if hasattr(score, 'parts') else 1

        # Key detection
        key_analysis = score.analyze('key')
        key_name = f"{key_analysis.tonic.name} {key_analysis.mode}"

        return {
            'duration': max(duration, 10),  # Minimum 10s
            'voice_count': max(voice_count, 1),
            'key': key_name
        }

    except Exception as e:
        print(f"Error in quick_parse_score: {e}")
        # Fallback
        return {
            'duration': 30,
            'voice_count': 1,
            'key': 'C major'
        }


def parse_score_with_solfege(file_path: str, mode: str = "movable") -> Dict:
    """
    Parse score and generate solfege mapping.

    Args:
        file_path: Path to MIDI or MusicXML file
        mode: "movable" or "fixed"

    Returns:
        {
            'key': str,
            'duration': float,
            'voices': List[Dict],
            'solfege_table': List[List]  # For Gradio Dataframe
        }
    """
    from music21 import converter, key, pitch, note

    try:
        score = converter.parse(file_path)

        # Detect key
        key_analysis = score.analyze('key')
        key_name = f"{key_analysis.tonic.name} {key_analysis.mode}"
        key_fifths = key_analysis.sharps

        # Extract voices
        voices = []
        solfege_table = []

        for part_idx, part in enumerate(score.parts):
            voice_notes = []

            for element in part.flatten().notes:
                if isinstance(element, note.Note):
                    # Get MIDI number
                    midi_num = element.pitch.midi

                    # Map to solfege
                    if mode == "movable":
                        solfege = midi_to_solfege_movable(midi_num, key_fifths)
                    else:
                        solfege = midi_to_solfege_fixed(midi_num)

                    # Get measure and beat
                    measure = element.measureNumber or 1
                    beat = element.beat or 1

                    # Duration in seconds (assume 120 BPM)
                    duration = element.duration.quarterLength * 0.5

                    voice_notes.append({
                        'midi': midi_num,
                        'solfege': solfege,
                        'start': element.offset,
                        'duration': duration,
                        'measure': measure,
                        'beat': beat
                    })

                    # Add to correction table (first 20 notes)
                    if len(solfege_table) < 20:
                        solfege_table.append([
                            len(solfege_table) + 1,
                            measure,
                            f"{beat:.1f}",
                            solfege,
                            ""  # User correction
                        ])

            voices.append({
                'id': part_idx,
                'instrument': part.partName or f"Voice {part_idx + 1}",
                'notes': voice_notes
            })

        # Total duration
        total_duration = score.duration.quarterLength * 0.5

        return {
            'key': key_name,
            'duration': total_duration,
            'voices': voices,
            'solfege_table': solfege_table
        }

    except Exception as e:
        print(f"Error parsing score: {e}")
        raise


def parse_score_with_correction(file_path: str, mode: str = "movable", corrections=None) -> Dict:
    """
    Parse score with optional user corrections.

    Args:
        file_path: Path to score file
        mode: "movable" or "fixed"
        corrections: Gradio Dataframe rows with corrections

    Returns:
        Same as parse_score_with_solfege
    """
    result = parse_score_with_solfege(file_path, mode)

    # Apply corrections if provided
    if corrections is not None and len(corrections) > 0:
        for row in corrections:
            if len(row) >= 5 and row[4]:  # Has a correction
                note_idx = int(row[0]) - 1
                corrected_solfege = row[4].lower()

                if corrected_solfege in SOLFEGE_SYLLABLES:
                    # Apply to first voice (simplified)
                    if result['voices'] and note_idx < len(result['voices'][0]['notes']):
                        result['voices'][0]['notes'][note_idx]['solfege'] = corrected_solfege

    return result


def midi_to_solfege_fixed(midi_num: int) -> str:
    """
    Convert MIDI note to solfege using Fixed Do.
    Based on pitch class, not letter name (simplified).

    Args:
        midi_num: MIDI note number (0-127)

    Returns:
        Solfege syllable
    """
    pitch_class = midi_num % 12

    # Map pitch class to solfege (Fixed Do)
    PITCH_CLASS_TO_SOLFEGE = {
        0: 'do',    # C
        1: 'do',    # C#/Db -> do
        2: 're',    # D
        3: 're',    # D#/Eb -> re
        4: 'mi',    # E
        5: 'fa',    # F
        6: 'fa',    # F#/Gb -> fa
        7: 'sol',   # G
        8: 'sol',   # G#/Ab -> sol
        9: 'la',    # A
        10: 'la',   # A#/Bb -> la
        11: 'ti'    # B
    }

    return PITCH_CLASS_TO_SOLFEGE.get(pitch_class, 'do')


def midi_to_solfege_movable(midi_num: int, key_fifths: int) -> str:
    """
    Convert MIDI note to solfege using Movable Do.
    Based on scale degree relative to the key.

    Args:
        midi_num: MIDI note number
        key_fifths: Key signature fifths (0=C, 1=G, -1=F, etc.)

    Returns:
        Solfege syllable
    """
    # Calculate tonic pitch class from fifths
    tonic_pitch_class = ((key_fifths * 7) % 12 + 12) % 12

    # Calculate scale degree
    pitch_class = midi_num % 12
    scale_degree = (pitch_class - tonic_pitch_class + 12) % 12

    # Map scale degree to solfege (chromatic)
    SCALE_DEGREE_TO_SOLFEGE = {
        0: 'do',    # Tonic
        1: 'do',    # Minor 2nd
        2: 're',    # Major 2nd
        3: 're',    # Minor 3rd
        4: 'mi',    # Major 3rd
        5: 'fa',    # Perfect 4th
        6: 'fa',    # Tritone
        7: 'sol',   # Perfect 5th
        8: 'sol',   # Minor 6th
        9: 'la',    # Major 6th
        10: 'la',   # Minor 7th
        11: 'ti'    # Major 7th
    }

    return SCALE_DEGREE_TO_SOLFEGE.get(scale_degree, 'do')
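A quick sanity check of the two mappings (these are pure functions, so the check runs without music21): in G major (key_fifths=1) the tonic pitch class is (1 × 7) mod 12 = 7, so G4 maps to 'do' under movable do but to 'sol' under fixed do.

# Sketch: verify fixed-do vs movable-do behavior for G major.
from backend.score_parser import midi_to_solfege_fixed, midi_to_solfege_movable

assert midi_to_solfege_fixed(67) == 'sol'                   # G4 by pitch class
assert midi_to_solfege_movable(67, key_fifths=1) == 'do'    # G4 is the tonic of G major
assert midi_to_solfege_movable(69, key_fifths=1) == 're'    # A4, major 2nd above G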