JeffreyZhou798 committed
Commit ecadc11 · verified · 1 Parent(s): 38984a2

Upload 8 files

backend/__init__.py ADDED
@@ -0,0 +1,8 @@
"""
SolfegeScoreSinger Backend Modules
"""

from .config import get_model, get_default_voice_path
from .i18n import I18n

__all__ = ['get_model', 'get_default_voice_path', 'I18n']
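A minimal usage sketch (not from the diff itself) showing the package-level re-exports:

from backend import get_model, get_default_voice_path, I18n

i18n = I18n()                      # UI strings
voice_dir = get_default_voice_path()  # built-in child-voice samples
# model = get_model()              # heavy; deferred until generation starts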
backend/audio_mixer.py ADDED
@@ -0,0 +1,130 @@
"""
Audio Mixer Module
Mixes multiple voice tracks into a single output
"""

import numpy as np
from typing import List


def mix_voices(
    voice_audios: List[np.ndarray],
    method: str = "sum",
    normalize: bool = True
) -> np.ndarray:
    """
    Mix multiple voice audio tracks.

    Args:
        voice_audios: List of audio arrays (one per voice)
        method: Mixing method ("sum", "average", "weighted")
        normalize: Whether to normalize the output

    Returns:
        Mixed audio array
    """
    if not voice_audios:
        return np.zeros(44100)  # 1 second of silence at 44.1 kHz

    if len(voice_audios) == 1:
        audio = voice_audios[0]
        if normalize:
            audio = normalize_audio(audio)
        return audio

    # Find the maximum length
    max_length = max(len(audio) for audio in voice_audios)

    # Pad shorter audios with silence
    padded_audios = []
    for audio in voice_audios:
        if len(audio) < max_length:
            padding = np.zeros(max_length - len(audio))
            padded_audio = np.concatenate([audio, padding])
        else:
            padded_audio = audio
        padded_audios.append(padded_audio)

    # Mix
    if method == "sum":
        mixed = np.sum(padded_audios, axis=0)
    elif method == "average":
        mixed = np.mean(padded_audios, axis=0)
    elif method == "weighted":
        # Weight by inverse energy (quieter voices get higher weight)
        energies = [np.sum(audio ** 2) for audio in padded_audios]
        weights = [1.0 / (e + 1e-10) for e in energies]
        total_weight = sum(weights)
        weights = [w / total_weight for w in weights]

        mixed = np.zeros(max_length)
        for audio, weight in zip(padded_audios, weights):
            mixed += audio * weight
    else:
        mixed = np.sum(padded_audios, axis=0)

    # Normalize
    if normalize:
        mixed = normalize_audio(mixed)

    return mixed


def normalize_audio(audio: np.ndarray, target_db: float = -3.0) -> np.ndarray:
    """
    Normalize audio to a target RMS level.

    Args:
        audio: Audio array
        target_db: Target dB level (default -3.0 dB)

    Returns:
        Normalized audio
    """
    # Calculate current RMS
    rms = np.sqrt(np.mean(audio ** 2))

    if rms < 1e-10:
        return audio  # Avoid division by zero

    # Target RMS: dB-to-linear conversion, scaled by 0.1 for extra headroom
    target_rms = 10 ** (target_db / 20) * 0.1

    # Apply gain
    gain = target_rms / rms
    normalized = audio * gain

    # Clip to prevent overflow
    normalized = np.clip(normalized, -1.0, 1.0)

    return normalized


def apply_fade(audio: np.ndarray, fade_in: float = 0.01, fade_out: float = 0.01, sample_rate: int = 44100) -> np.ndarray:
    """
    Apply fade in/out to audio.

    Args:
        audio: Audio array (assumed to be float)
        fade_in: Fade-in duration (seconds)
        fade_out: Fade-out duration (seconds)
        sample_rate: Sample rate

    Returns:
        Audio with fades applied
    """
    audio = audio.copy()

    # Fade in
    fade_in_samples = int(fade_in * sample_rate)
    if 0 < fade_in_samples < len(audio):
        fade_in_curve = np.linspace(0, 1, fade_in_samples)
        audio[:fade_in_samples] *= fade_in_curve

    # Fade out
    fade_out_samples = int(fade_out * sample_rate)
    if 0 < fade_out_samples < len(audio):
        fade_out_curve = np.linspace(1, 0, fade_out_samples)
        audio[-fade_out_samples:] *= fade_out_curve

    return audio
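A minimal usage sketch for the mixer, assuming three mono float tracks at 44.1 kHz (the sine tones are placeholders for synthesized voices):

import numpy as np
from backend.audio_mixer import mix_voices, apply_fade

sr = 44100
t = np.linspace(0, 2.0, 2 * sr, endpoint=False)
# Three placeholder "voices": a C-major triad of sine tones
voices = [0.3 * np.sin(2 * np.pi * f * t) for f in (261.63, 329.63, 392.00)]

mixed = mix_voices(voices, method="average", normalize=True)
mixed = apply_fade(mixed, fade_in=0.02, fade_out=0.05, sample_rate=sr)
print(mixed.shape, float(np.max(np.abs(mixed))))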
backend/config.py ADDED
@@ -0,0 +1,150 @@
"""
Configuration and Model Management
Implements lazy loading to save memory in a CPU-only environment
"""

import os
import torch

# ============================================================================
# Environment Optimization (CPU)
# ============================================================================

os.environ["OMP_NUM_THREADS"] = "4"
os.environ["TORCH_NUM_THREADS"] = "4"
os.environ["MKL_NUM_THREADS"] = "4"


# ============================================================================
# Global Model Instance (Lazy Loading)
# ============================================================================

_model = None


def get_model():
    """
    Lazily load the SoulX-Singer model.
    Avoids loading on startup to save memory.

    Returns:
        SoulX-Singer model instance
    """
    global _model

    if _model is None:
        print("Loading SoulX-Singer model on CPU...")

        # Import the model from the soulxsinger directory
        import sys
        base_path = os.path.dirname(__file__)
        soulx_path = os.path.join(base_path, '..', 'soulxsinger')
        cli_path = os.path.join(base_path, '..', 'cli')

        # Add paths to sys.path
        if os.path.exists(soulx_path):
            sys.path.insert(0, os.path.dirname(soulx_path))
        if os.path.exists(cli_path):
            sys.path.insert(0, os.path.dirname(cli_path))

        from cli.inference import SoulX_Singer

        # Check for model weights - auto-download if missing
        model_weights_path = os.path.join(base_path, '..', 'pretrained_models', 'SoulX-Singer', 'model.pt')

        if not os.path.exists(model_weights_path):
            print("⚠️ Model weights not found!")
            print("🔄 Attempting automatic download from HuggingFace Hub...")

            try:
                # Install huggingface-hub if not already installed
                # (invoke pip via the current interpreter to avoid PATH issues)
                import subprocess
                subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'huggingface-hub'])

                # Download model weights
                from huggingface_hub import snapshot_download
                model_dir = os.path.join(base_path, '..', 'pretrained_models', 'SoulX-Singer')
                os.makedirs(model_dir, exist_ok=True)

                print("⬇️ Downloading SoulX-Singer model (~1.5GB)...")
                snapshot_download(
                    repo_id='Soul-AILab/SoulX-Singer',
                    local_dir=model_dir,
                    local_dir_use_symlinks=False,
                    ignore_patterns=['*.md', '*.txt', 'LICENSE', 'config/**', 'utils/**', 'scripts/**']
                )
                print("✅ Model downloaded successfully!")

            except Exception as e:
                print(f"❌ Auto-download failed: {e}")
                print("Please manually download model.pt from:")
                print("https://huggingface.co/Soul-AILab/SoulX-Singer")
                print("And place it at: pretrained_models/SoulX-Singer/model.pt")
                raise FileNotFoundError("Model weights not found and auto-download failed. See instructions above.")

        # Load with INT8 quantization for CPU optimization
        _model = SoulX_Singer(
            config_path=os.path.join(soulx_path, "config", "soulxsinger.yaml"),
            checkpoint_path=model_weights_path,
            device='cpu',
            dtype=torch.int8  # INT8 quantization
        )

        print("✅ Model loaded successfully!")

    return _model


def clear_model():
    """
    Clear the model from memory.
    Call this when generation is complete to free resources.
    """
    global _model

    if _model is not None:
        del _model
        _model = None

        import gc
        gc.collect()

        print("✅ Model memory cleared")


def get_default_voice_path() -> str:
    """
    Get the path to the default voice samples (child voice).

    Returns:
        Path to the DefaultVoice_Child directory
    """
    return os.path.join(os.path.dirname(__file__), '..', 'DefaultVoice_Child')


def get_cpu_warning() -> str:
    """
    Get the CPU environment warning message.

    Returns:
        Warning text about CPU generation time
    """
    return "CPU Environment: Generation may take 5-10 min per second of audio"


# ============================================================================
# Model Inference Settings
# ============================================================================

INFERENCE_CONFIG = {
    'n_steps': 12,            # Reduced steps for CPU (default 32)
    'cfg': 3.0,               # CFG scale
    'control': 'score',       # Score-controlled mode
    'use_fp16': False,        # FP16 not supported on CPU
    'segment_duration': 8.0   # Max segment duration (seconds)
}


def get_inference_config():
    """Get the default inference configuration for CPU"""
    return INFERENCE_CONFIG.copy()
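A short usage sketch for this module; the commented-out get_model() call is deliberate, since actually invoking it triggers the weight download described above:

from backend.config import get_inference_config, get_cpu_warning, clear_model

print(get_cpu_warning())

cfg = get_inference_config()   # returns a copy, so it is safe to mutate
cfg['n_steps'] = 16            # e.g. trade speed for quality on a faster CPU

# model = get_model()          # deferred until generation actually starts
clear_model()                  # no-op if the model was never loaded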
backend/denoise.py ADDED
@@ -0,0 +1,77 @@
"""
Denoise Module
Provides optional audio denoising using noisereduce
"""

import numpy as np
from typing import Dict, Optional


def denoise_audio(
    audio: np.ndarray,
    sample_rate: int = 44100,
    prop_decrease: float = 0.5,
    stationary: bool = True
) -> np.ndarray:
    """
    Apply noise reduction to audio.

    Args:
        audio: Audio array
        sample_rate: Sample rate
        prop_decrease: Proportion of noise to decrease (0.0-1.0)
        stationary: Whether the noise is stationary

    Returns:
        Denoised audio
    """
    try:
        import noisereduce as nr

        denoised = nr.reduce_noise(
            y=audio,
            sr=sample_rate,
            prop_decrease=prop_decrease,
            stationary=stationary
        )

        return denoised

    except ImportError:
        print("Warning: noisereduce not installed, returning original audio")
        return audio
    except Exception as e:
        print(f"Error in denoise_audio: {e}")
        return audio


def detect_noise_profile(
    audio: np.ndarray,
    sample_rate: int = 44100,
    noise_duration: float = 0.5
) -> Optional[Dict]:
    """
    Detect a noise profile from the beginning of the audio.

    Args:
        audio: Audio array
        sample_rate: Sample rate
        noise_duration: Duration to analyze for noise (seconds)

    Returns:
        Dict with 'rms' and 'segment' keys, or None if the audio is too short
    """
    noise_samples = int(noise_duration * sample_rate)

    if len(audio) < noise_samples:
        return None

    noise_segment = audio[:noise_samples]

    # Calculate noise statistics
    noise_rms = np.sqrt(np.mean(noise_segment ** 2))

    return {
        'rms': noise_rms,
        'segment': noise_segment
    }
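A quick usage sketch, assuming a synthetic noisy tone as input (denoise_audio degrades gracefully to a pass-through when noisereduce is missing):

import numpy as np
from backend.denoise import denoise_audio, detect_noise_profile

sr = 44100
t = np.linspace(0, 1.0, sr, endpoint=False)
noisy = 0.2 * np.sin(2 * np.pi * 440 * t) + 0.02 * np.random.randn(sr)

profile = detect_noise_profile(noisy, sample_rate=sr)
if profile is not None:
    print(f"RMS of the first 0.5 s: {profile['rms']:.4f}")

clean = denoise_audio(noisy, sample_rate=sr, prop_decrease=0.5)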
backend/i18n.py ADDED
@@ -0,0 +1,113 @@
"""
Internationalization (i18n) Module
Supports English, Chinese, and Japanese
"""

import json
import os
from typing import Dict


class I18n:
    """Multi-language support class"""

    def __init__(self, default_lang: str = 'en'):
        """
        Initialize the i18n module.

        Args:
            default_lang: Default language code ('en', 'zh', 'ja')
        """
        self.current_lang = default_lang
        self.translations = self._load_translations()

    def _load_translations(self) -> Dict:
        """Load translation files from the locales directory"""
        translations = {}
        locales_dir = os.path.join(os.path.dirname(__file__), '..', 'locales')

        if not os.path.exists(locales_dir):
            print(f"Warning: Locales directory not found: {locales_dir}")
            return self._get_default_translations()

        for lang_file in os.listdir(locales_dir):
            if lang_file.endswith('.json'):
                lang_code = lang_file.replace('.json', '')
                file_path = os.path.join(locales_dir, lang_file)

                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        translations[lang_code] = json.load(f)
                except Exception as e:
                    print(f"Error loading {lang_file}: {e}")

        # Fall back to the defaults if no translations loaded
        if not translations:
            translations = self._get_default_translations()

        return translations

    def _get_default_translations(self) -> Dict:
        """Get default English translations (fallback)"""
        return {
            'en': {
                'title': '🎵 SolfegeScoreSinger - AI Singing Synthesis',
                'record_tab': 'Record Samples',
                'upload_tab': 'Upload Score',
                'config_tab': 'Configuration',
                'generate_tab': 'Generate & Download',
                'syllables': ['Do', 'Re', 'Mi', 'Fa', 'Sol', 'La', 'Ti'],
                'record_instruction': 'Record 7 solfege syllables to clone your voice',
                'upload_score': 'Upload Score (MIDI/MusicXML)',
                'voice_mode': 'Voice Mode',
                'my_recording': 'My Recording',
                'child_voice': 'Child Voice (Built-in)',
                'solfege_mode': 'Solfege Mode',
                'movable_do': 'Movable Do (首调)',
                'fixed_do': 'Fixed Do (固定调)',
                'denoise': 'Enable Denoising',
                'denoise_note': 'Default: No denoising (fidelity priority)',
                'generate': 'Generate Audio',
                'download': 'Download Audio',
                'cpu_warning': 'CPU Environment: Generation may take 5-10 min per second of audio',
                'validating': 'Validating inputs...',
                'parsing_score': 'Parsing score...',
                'preparing_samples': 'Preparing voice samples...',
                'generating_metadata': 'Generating metadata...',
                'loading_model': 'Loading AI model...',
                'mixing_voices': 'Mixing voices...',
                'saving_audio': 'Saving audio file...'
            }
        }

    def set_language(self, lang: str):
        """Set the current language"""
        if lang in self.translations:
            self.current_lang = lang
        else:
            print(f"Warning: Language '{lang}' not found, using default")

    def t(self, key: str) -> str:
        """
        Translate a key into the current language.

        Args:
            key: Translation key (supports nested keys with '.')

        Returns:
            Translated text, or the key itself if no translation exists
        """
        keys = key.split('.')
        value = self.translations.get(self.current_lang, {})

        for k in keys:
            if isinstance(value, dict):
                value = value.get(k, key)
            else:
                return key

        return value if isinstance(value, str) else key

    def get_all_texts(self) -> Dict:
        """Get all texts for the current language"""
        return self.translations.get(self.current_lang, self._get_default_translations()['en'])
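A usage sketch; with no locales/ directory present, the class falls back to the built-in English strings shown above:

from backend.i18n import I18n

i18n = I18n(default_lang='en')
print(i18n.t('title'))        # '🎵 SolfegeScoreSinger - AI Singing Synthesis'
print(i18n.t('no.such.key'))  # missing keys fall back to the key itself

i18n.set_language('zh')       # takes effect only if locales/zh.json was loaded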
backend/metadata_generator.py ADDED
@@ -0,0 +1,236 @@
"""
Metadata Generator Module
Generates SoulX-Singer metadata from the score and voice samples
"""

import os
import numpy as np
import soundfile as sf
from typing import Dict, List, Optional
from .score_parser import SOLFEGE_SYLLABLES


def prepare_voice_samples(
    voice_mode: str,
    user_samples: Optional[Dict[str, str]],
    enable_denoise: bool
) -> Dict[str, np.ndarray]:
    """
    Prepare voice samples for synthesis.

    Args:
        voice_mode: "My Recording" or "Child Voice"
        user_samples: Dict mapping syllable to audio file path
        enable_denoise: Whether to apply denoising

    Returns:
        Dict mapping syllable to audio array
    """
    from .config import get_default_voice_path

    samples = {}

    # The UI passes a localized label, so match all three languages
    if voice_mode in ("Child Voice (Built-in)", "童声音色 (内置)", "子供の声 (内蔵)"):
        # Load the default voice
        default_path = get_default_voice_path()

        for syllable in SOLFEGE_SYLLABLES:
            # Capitalize the first letter for the filename
            filename = syllable.capitalize() + '.wav'
            file_path = os.path.join(default_path, filename)

            if os.path.exists(file_path):
                audio, sr = sf.read(file_path)
                samples[syllable] = audio
            else:
                print(f"Warning: Default voice file not found: {file_path}")

    elif user_samples:
        # Load user-recorded samples
        for syllable in SOLFEGE_SYLLABLES:
            file_path = user_samples.get(syllable)

            if file_path and os.path.exists(file_path):
                audio, sr = sf.read(file_path)

                # Apply denoising if enabled
                if enable_denoise:
                    audio = apply_denoise(audio, sr)

                samples[syllable] = audio

    return samples


def apply_denoise(audio: np.ndarray, sample_rate: int) -> np.ndarray:
    """
    Apply conservative denoising using noisereduce.

    Args:
        audio: Audio array
        sample_rate: Sample rate

    Returns:
        Denoised audio
    """
    try:
        import noisereduce as nr
        return nr.reduce_noise(y=audio, sr=sample_rate, prop_decrease=0.5)
    except ImportError:
        print("Warning: noisereduce not installed, skipping denoising")
        return audio


def generate_metadata_for_voices(
    voices: List[Dict],
    voice_samples: Dict[str, np.ndarray]
) -> List[Dict]:
    """
    Generate SoulX-Singer metadata for each voice.

    Args:
        voices: List of voice data from the score parser
        voice_samples: Dict of syllable -> audio array

    Returns:
        List of metadata dicts for SoulX-Singer
    """
    metadata_list = []

    for voice in voices:
        notes = voice['notes']

        # Create prompt audio by concatenating solfege samples
        prompt_audio = create_prompt_audio(notes, voice_samples)

        # Create target metadata
        target_metadata = create_target_metadata(notes)

        metadata = {
            'voice_id': voice['id'],
            'instrument': voice['instrument'],
            'prompt_audio': prompt_audio,
            'target': target_metadata
        }

        metadata_list.append(metadata)

    return metadata_list


def create_prompt_audio(notes: List[Dict], voice_samples: Dict[str, np.ndarray]) -> np.ndarray:
    """
    Create prompt audio by concatenating voice samples.

    Strategy:
    - Use the first few notes' solfege to create a representative prompt
    - Aim for ~3-5 seconds of prompt audio

    Args:
        notes: List of notes for this voice
        voice_samples: Dict of syllable -> audio array

    Returns:
        Concatenated prompt audio
    """
    # Collect unique solfeges from the first few notes
    solfeges = []
    for note in notes[:10]:
        solfege = note['solfege']
        if solfege not in solfeges and solfege in voice_samples:
            solfeges.append(solfege)

    # Use at least 3 different syllables
    if len(solfeges) < 3:
        for syllable in SOLFEGE_SYLLABLES:
            if syllable not in solfeges and syllable in voice_samples:
                solfeges.append(syllable)
            if len(solfeges) >= 3:
                break

    # Concatenate samples with small gaps
    prompt_segments = []
    for syllable in solfeges[:5]:
        if syllable in voice_samples:
            sample = voice_samples[syllable]
            prompt_segments.append(sample)

            # Add a small gap (50 ms of silence)
            gap = np.zeros(int(44100 * 0.05))
            prompt_segments.append(gap)

    if prompt_segments:
        return np.concatenate(prompt_segments)

    # Fallback: use the first available sample
    if voice_samples:
        return next(iter(voice_samples.values()))

    return np.zeros(44100)  # 1 second of silence


def create_target_metadata(notes: List[Dict]) -> Dict:
    """
    Create target metadata for SoulX-Singer.

    Args:
        notes: List of notes

    Returns:
        Target metadata dict
    """
    # Convert notes to the SoulX format
    phonemes = []
    note_pitches = []
    note_durations = []
    note_types = []

    for note in notes:
        solfege = note['solfege']
        midi_num = note['midi']
        duration = note['duration']

        # Phoneme (simplified - just use the solfege name)
        phoneme = solfege_to_phoneme(solfege)
        phonemes.append(phoneme)

        # Pitch
        note_pitches.append(midi_num)

        # Duration in frames (assume 256 samples per frame at 44.1 kHz)
        note_durations.append(int(duration * 44100 / 256))

        # Note type (1 = regular)
        note_types.append(1)

    return {
        'phoneme': phonemes,
        'note_pitch': note_pitches,
        'note_duration': note_durations,
        'note_type': note_types,
        'duration': sum(note['duration'] for note in notes)
    }


def solfege_to_phoneme(solfege: str) -> str:
    """
    Convert a solfege syllable to a phoneme string.

    Args:
        solfege: Solfege syllable (do, re, mi, fa, sol, la, ti)

    Returns:
        Phoneme string
    """
    # ARPAbet phonemes (simplified)
    SOLFEGE_TO_PHONEME = {
        'do': 'd ow',
        're': 'r ey',
        'mi': 'm iy',
        'fa': 'f aa',
        'sol': 's ow l',
        'la': 'l aa',
        'ti': 't iy'
    }

    return SOLFEGE_TO_PHONEME.get(solfege, 'd ow')
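A usage sketch with a hypothetical hand-built voice and silent placeholder samples standing in for real recordings; the expected outputs follow from the frame math above (int(0.5 * 44100 / 256) = 86):

import numpy as np
from backend.metadata_generator import generate_metadata_for_voices

# Hypothetical parsed voice: three notes in C major, 0.5 s each
voices = [{
    'id': 0,
    'instrument': 'Voice 1',
    'notes': [
        {'midi': 60, 'solfege': 'do', 'start': 0.0, 'duration': 0.5},
        {'midi': 62, 'solfege': 're', 'start': 0.5, 'duration': 0.5},
        {'midi': 64, 'solfege': 'mi', 'start': 1.0, 'duration': 0.5},
    ],
}]
samples = {s: np.zeros(22050) for s in ('do', 're', 'mi')}  # 0.5 s placeholders

meta = generate_metadata_for_voices(voices, samples)
print(meta[0]['target']['phoneme'])        # ['d ow', 'r ey', 'm iy']
print(meta[0]['target']['note_duration'])  # [86, 86, 86] frames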
backend/multi_voice_engine.py ADDED
@@ -0,0 +1,213 @@
"""
Multi-Voice Engine Module
Handles SoulX-Singer model inference for multiple voices
Implements segment-based processing for long scores
"""

import numpy as np
import torch
from typing import Dict, List, Optional, Callable
import gc

from .config import get_inference_config


class MultiVoiceEngine:
    """
    Multi-voice synthesis engine using SoulX-Singer.

    Features:
    - Segment-based processing for long scores (≤8s per segment)
    - Memory management with garbage collection
    - Progress callback support
    """

    def __init__(self, model):
        """
        Initialize the engine with a SoulX-Singer model.

        Args:
            model: SoulX-Singer model instance
        """
        self.model = model
        self.config = get_inference_config()

    def generate_single_voice(
        self,
        metadata: Dict,
        on_progress: Optional[Callable[[float], None]] = None
    ) -> np.ndarray:
        """
        Generate audio for a single voice.

        Args:
            metadata: Voice metadata from metadata_generator
            on_progress: Progress callback function

        Returns:
            Generated audio array
        """
        target = metadata['target']
        prompt_audio = metadata['prompt_audio']

        # Check whether segmentation is needed
        total_duration = target['duration']
        segment_duration = self.config['segment_duration']

        if total_duration <= segment_duration:
            # Single segment
            return self._generate_segment(prompt_audio, target, on_progress)
        else:
            # Multiple segments
            return self._generate_segments(prompt_audio, target, on_progress)

    def _generate_segment(
        self,
        prompt_audio: np.ndarray,
        target: Dict,
        on_progress: Optional[Callable[[float], None]] = None
    ) -> np.ndarray:
        """
        Generate a single segment (≤8 seconds).

        Args:
            prompt_audio: Prompt audio array
            target: Target metadata
            on_progress: Progress callback

        Returns:
            Generated audio for this segment
        """
        try:
            # Prepare model input; the prompt-side features are truncated with a
            # rough len(prompt_audio)//100 heuristic to match the prompt length
            infer_data = {
                'prompt': {
                    'waveform': torch.from_numpy(prompt_audio).float(),
                    'phoneme': self._phonemes_to_tensor(target['phoneme'][:len(prompt_audio)//100]),
                    'note_pitch': torch.tensor(target['note_pitch'][:len(prompt_audio)//100]),
                    'note_type': torch.tensor(target['note_type'][:len(prompt_audio)//100])
                },
                'target': {
                    'phoneme': self._phonemes_to_tensor(target['phoneme']),
                    'note_pitch': torch.tensor(target['note_pitch']),
                    'note_type': torch.tensor(target['note_type'])
                }
            }

            # Run inference
            with torch.no_grad():
                output = self.model.infer(
                    infer_data,
                    auto_shift=False,
                    pitch_shift=0,
                    n_steps=self.config['n_steps'],
                    cfg=self.config['cfg'],
                    control=self.config['control'],
                    use_fp16=self.config['use_fp16']
                )

            # Clean up
            del infer_data
            gc.collect()

            if on_progress:
                on_progress(100.0)

            return output.cpu().numpy() if torch.is_tensor(output) else output

        except Exception as e:
            print(f"Error in _generate_segment: {e}")
            # Fallback: return silence
            duration = target.get('duration', 1.0)
            return np.zeros(int(44100 * duration))

    def _generate_segments(
        self,
        prompt_audio: np.ndarray,
        target: Dict,
        on_progress: Optional[Callable[[float], None]] = None
    ) -> np.ndarray:
        """
        Generate multiple segments and concatenate them.

        Args:
            prompt_audio: Prompt audio
            target: Target metadata
            on_progress: Progress callback

        Returns:
            Concatenated generated audio
        """
        total_duration = target['duration']
        segment_duration = self.config['segment_duration']
        num_segments = int(np.ceil(total_duration / segment_duration))

        segments = []

        for i in range(num_segments):
            # Extract segment metadata
            start_time = i * segment_duration
            end_time = min((i + 1) * segment_duration, total_duration)

            segment_target = self._extract_segment(target, start_time, end_time)

            # Generate this segment
            segment_audio = self._generate_segment(prompt_audio, segment_target)
            segments.append(segment_audio)

            # Update progress
            if on_progress:
                progress = (i + 1) / num_segments * 100
                on_progress(progress)

            # Memory cleanup
            gc.collect()

        # Concatenate segments
        return np.concatenate(segments)

    def _extract_segment(
        self,
        target: Dict,
        start_time: float,
        end_time: float
    ) -> Dict:
        """
        Extract a time segment from target metadata.

        Args:
            target: Full target metadata
            start_time: Segment start time (seconds)
            end_time: Segment end time (seconds)

        Returns:
            Segment metadata
        """
        # Simplified: just return the full target for now
        # TODO: Implement proper time-based extraction
        return {
            'phoneme': target['phoneme'],
            'note_pitch': target['note_pitch'],
            'note_type': target['note_type'],
            'duration': end_time - start_time
        }

    def _phonemes_to_tensor(self, phonemes: List[str]) -> torch.Tensor:
        """
        Convert a phoneme list to a tensor.

        Args:
            phonemes: List of phoneme strings

        Returns:
            Phoneme tensor
        """
        # Simplified: convert to indices
        # TODO: Use a proper phoneme vocabulary
        phoneme_to_idx = {
            'd ow': 0, 'r ey': 1, 'm iy': 2, 'f aa': 3,
            's ow l': 4, 'l aa': 5, 't iy': 6
        }

        indices = [phoneme_to_idx.get(p, 0) for p in phonemes]
        return torch.tensor(indices, dtype=torch.long)
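A hedged orchestration sketch tying the modules together; render_all_voices is a hypothetical helper, and it assumes metadata_list comes from generate_metadata_for_voices and that the model weights are available:

from typing import Dict, List
import numpy as np
from backend.config import get_model, clear_model
from backend.multi_voice_engine import MultiVoiceEngine
from backend.audio_mixer import mix_voices

def render_all_voices(metadata_list: List[Dict]) -> np.ndarray:
    """Render every voice with one shared engine, then mix into a single track."""
    engine = MultiVoiceEngine(get_model())
    voice_audios = []
    for meta in metadata_list:
        audio = engine.generate_single_voice(
            meta,
            on_progress=lambda p, v=meta['voice_id']: print(f"voice {v}: {p:.0f}%")
        )
        voice_audios.append(audio)
    clear_model()  # free model memory once all voices are rendered
    return mix_voices(voice_audios, method="sum", normalize=True)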
backend/score_parser.py ADDED
@@ -0,0 +1,251 @@
"""
Score Parser Module
Supports MIDI and MusicXML formats
Implements key detection and solfege mapping
"""

import os
from typing import Dict

# Solfege syllables (CORRECTED: 'sol', not 'so')
SOLFEGE_SYLLABLES = ['do', 're', 'mi', 'fa', 'sol', 'la', 'ti']

# Reference pitches in Hz (C4 octave)
REFERENCE_PITCHES = {
    'do': 261.63, 're': 293.66, 'mi': 329.63, 'fa': 349.23,
    'sol': 392.00, 'la': 440.00, 'ti': 493.88
}


def quick_parse_score(file_path: str) -> Dict:
    """
    Quickly parse a score for basic info (duration, voice count).
    Used for time estimation.

    Args:
        file_path: Path to a MIDI or MusicXML file

    Returns:
        {
            'duration': float (seconds),
            'voice_count': int,
            'key': str
        }
    """
    try:
        # Try music21 first
        from music21 import converter

        score = converter.parse(file_path)
        # 0.5 s per quarter note: rough estimate at 120 BPM
        duration = score.duration.quarterLength / 2
        voice_count = len(score.parts) if hasattr(score, 'parts') else 1

        # Key detection
        key_analysis = score.analyze('key')
        key_name = f"{key_analysis.tonic.name} {key_analysis.mode}"

        return {
            'duration': max(duration, 10),  # Minimum 10 s
            'voice_count': max(voice_count, 1),
            'key': key_name
        }

    except Exception as e:
        print(f"Error in quick_parse_score: {e}")
        # Fallback
        return {
            'duration': 30,
            'voice_count': 1,
            'key': 'C major'
        }


def parse_score_with_solfege(file_path: str, mode: str = "movable") -> Dict:
    """
    Parse a score and generate a solfege mapping.

    Args:
        file_path: Path to a MIDI or MusicXML file
        mode: "movable" or "fixed"

    Returns:
        {
            'key': str,
            'duration': float,
            'voices': List[Dict],
            'solfege_table': List[List]  # For the Gradio Dataframe
        }
    """
    from music21 import converter, note

    try:
        score = converter.parse(file_path)

        # Detect key
        key_analysis = score.analyze('key')
        key_name = f"{key_analysis.tonic.name} {key_analysis.mode}"
        key_fifths = key_analysis.sharps

        # Extract voices
        voices = []
        solfege_table = []

        for part_idx, part in enumerate(score.parts):
            voice_notes = []

            for element in part.flatten().notes:
                if isinstance(element, note.Note):
                    # Get the MIDI number
                    midi_num = element.pitch.midi

                    # Map to solfege
                    if mode == "movable":
                        solfege = midi_to_solfege_movable(midi_num, key_fifths)
                    else:
                        solfege = midi_to_solfege_fixed(midi_num)

                    # Get measure and beat
                    measure = element.measureNumber or 1
                    beat = element.beat or 1

                    # Duration in seconds (assume 120 BPM)
                    duration = element.duration.quarterLength * 0.5

                    voice_notes.append({
                        'midi': midi_num,
                        'solfege': solfege,
                        'start': element.offset,
                        'duration': duration,
                        'measure': measure,
                        'beat': beat
                    })

                    # Add to the correction table (first 20 notes)
                    if len(solfege_table) < 20:
                        solfege_table.append([
                            len(solfege_table) + 1,
                            measure,
                            f"{beat:.1f}",
                            solfege,
                            ""  # User correction
                        ])

            voices.append({
                'id': part_idx,
                'instrument': part.partName or f"Voice {part_idx + 1}",
                'notes': voice_notes
            })

        # Total duration
        total_duration = score.duration.quarterLength * 0.5

        return {
            'key': key_name,
            'duration': total_duration,
            'voices': voices,
            'solfege_table': solfege_table
        }

    except Exception as e:
        print(f"Error parsing score: {e}")
        raise


def parse_score_with_correction(file_path: str, mode: str = "movable", corrections=None) -> Dict:
    """
    Parse a score with optional user corrections.

    Args:
        file_path: Path to the score file
        mode: "movable" or "fixed"
        corrections: Gradio Dataframe rows with corrections

    Returns:
        Same as parse_score_with_solfege
    """
    result = parse_score_with_solfege(file_path, mode)

    # Apply corrections if provided
    if corrections is not None and len(corrections) > 0:
        for row in corrections:
            if len(row) >= 5 and row[4]:  # Has a correction
                note_idx = int(row[0]) - 1
                corrected_solfege = row[4].lower()

                if corrected_solfege in SOLFEGE_SYLLABLES:
                    # Apply to the first voice (simplified)
                    if result['voices'] and note_idx < len(result['voices'][0]['notes']):
                        result['voices'][0]['notes'][note_idx]['solfege'] = corrected_solfege

    return result


def midi_to_solfege_fixed(midi_num: int) -> str:
    """
    Convert a MIDI note to solfege using Fixed Do.
    Based on pitch class, not letter name (simplified).

    Args:
        midi_num: MIDI note number (0-127)

    Returns:
        Solfege syllable
    """
    pitch_class = midi_num % 12

    # Map pitch class to solfege (Fixed Do)
    PITCH_CLASS_TO_SOLFEGE = {
        0: 'do',    # C
        1: 'do',    # C#/Db -> do
        2: 're',    # D
        3: 're',    # D#/Eb -> re
        4: 'mi',    # E
        5: 'fa',    # F
        6: 'fa',    # F#/Gb -> fa
        7: 'sol',   # G
        8: 'sol',   # G#/Ab -> sol
        9: 'la',    # A
        10: 'la',   # A#/Bb -> la
        11: 'ti'    # B
    }

    return PITCH_CLASS_TO_SOLFEGE.get(pitch_class, 'do')


def midi_to_solfege_movable(midi_num: int, key_fifths: int) -> str:
    """
    Convert a MIDI note to solfege using Movable Do.
    Based on the scale degree relative to the key.

    Args:
        midi_num: MIDI note number
        key_fifths: Key signature fifths (0=C, 1=G, -1=F, etc.)

    Returns:
        Solfege syllable
    """
    # Calculate the tonic pitch class by walking the circle of fifths
    tonic_pitch_class = ((key_fifths * 7) % 12 + 12) % 12

    # Calculate the scale degree (semitones above the tonic)
    pitch_class = midi_num % 12
    scale_degree = (pitch_class - tonic_pitch_class + 12) % 12

    # Map scale degree to solfege (chromatic notes fold onto a neighboring diatonic syllable)
    SCALE_DEGREE_TO_SOLFEGE = {
        0: 'do',    # Tonic
        1: 'do',    # Minor 2nd
        2: 're',    # Major 2nd
        3: 're',    # Minor 3rd
        4: 'mi',    # Major 3rd
        5: 'fa',    # Perfect 4th
        6: 'fa',    # Tritone
        7: 'sol',   # Perfect 5th
        8: 'sol',   # Minor 6th
        9: 'la',    # Major 6th
        10: 'la',   # Minor 7th
        11: 'ti'    # Major 7th
    }

    return SCALE_DEGREE_TO_SOLFEGE.get(scale_degree, 'do')
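A quick sanity check of the movable-do arithmetic. For D major, key_fifths = 2, so the tonic pitch class is (2 * 7) mod 12 = 2 (D); MIDI 66 (F#4, pitch class 6) sits 4 semitones above the tonic and maps to 'mi':

from backend.score_parser import midi_to_solfege_movable, midi_to_solfege_fixed

# D major (two sharps): F#4 is the 3rd scale degree
print(midi_to_solfege_movable(66, key_fifths=2))  # 'mi'
print(midi_to_solfege_fixed(66))                  # 'fa' (F#/Gb folds onto fa)

# The tonic always maps to 'do' under movable do
print(midi_to_solfege_movable(62, key_fifths=2))  # 'do' (D4)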