JeffreyZhou798 committed
Commit ecadc11 · verified · 1 Parent(s): 38984a2

Upload 8 files

backend/__init__.py ADDED
@@ -0,0 +1,8 @@
"""
SolfegeScoreSinger Backend Modules
"""

from .config import get_model, get_default_voice_path
from .i18n import I18n

__all__ = ['get_model', 'get_default_voice_path', 'I18n']
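A minimal usage sketch (not from the diff itself) showing the package-level re-exports:

from backend import get_model, get_default_voice_path, I18n

i18n = I18n()                      # UI strings
voice_dir = get_default_voice_path()  # built-in child-voice samples
# model = get_model()              # heavy; deferred until generation starts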
backend/audio_mixer.py ADDED
@@ -0,0 +1,130 @@
"""
Audio Mixer Module
Mixes multiple voice tracks into a single output
"""

import numpy as np
from typing import List


def mix_voices(
    voice_audios: List[np.ndarray],
    method: str = "sum",
    normalize: bool = True
) -> np.ndarray:
    """
    Mix multiple voice audio tracks.

    Args:
        voice_audios: List of audio arrays (one per voice)
        method: Mixing method ("sum", "average", "weighted")
        normalize: Whether to normalize the output

    Returns:
        Mixed audio array
    """
    if not voice_audios:
        return np.zeros(44100)  # 1 second of silence at 44.1 kHz

    if len(voice_audios) == 1:
        audio = voice_audios[0]
        if normalize:
            audio = normalize_audio(audio)
        return audio

    # Find the maximum length
    max_length = max(len(audio) for audio in voice_audios)

    # Pad shorter audios with silence
    padded_audios = []
    for audio in voice_audios:
        if len(audio) < max_length:
            padding = np.zeros(max_length - len(audio))
            padded_audio = np.concatenate([audio, padding])
        else:
            padded_audio = audio
        padded_audios.append(padded_audio)

    # Mix
    if method == "sum":
        mixed = np.sum(padded_audios, axis=0)
    elif method == "average":
        mixed = np.mean(padded_audios, axis=0)
    elif method == "weighted":
        # Weight by inverse energy (quieter voices get higher weight)
        energies = [np.sum(audio ** 2) for audio in padded_audios]
        weights = [1.0 / (e + 1e-10) for e in energies]
        total_weight = sum(weights)
        weights = [w / total_weight for w in weights]

        mixed = np.zeros(max_length)
        for audio, weight in zip(padded_audios, weights):
            mixed += audio * weight
    else:
        mixed = np.sum(padded_audios, axis=0)

    # Normalize
    if normalize:
        mixed = normalize_audio(mixed)

    return mixed


def normalize_audio(audio: np.ndarray, target_db: float = -3.0) -> np.ndarray:
    """
    Normalize audio to a target RMS level.

    Args:
        audio: Audio array
        target_db: Target dB level (default -3.0 dB)

    Returns:
        Normalized audio
    """
    # Calculate current RMS
    rms = np.sqrt(np.mean(audio ** 2))

    if rms < 1e-10:
        return audio  # Avoid division by zero

    # Target RMS: dB-to-linear conversion, scaled by 0.1 for extra headroom
    target_rms = 10 ** (target_db / 20) * 0.1

    # Apply gain
    gain = target_rms / rms
    normalized = audio * gain

    # Clip to prevent overflow
    normalized = np.clip(normalized, -1.0, 1.0)

    return normalized


def apply_fade(audio: np.ndarray, fade_in: float = 0.01, fade_out: float = 0.01, sample_rate: int = 44100) -> np.ndarray:
    """
    Apply fade in/out to audio.

    Args:
        audio: Audio array (assumed to be float)
        fade_in: Fade-in duration (seconds)
        fade_out: Fade-out duration (seconds)
        sample_rate: Sample rate

    Returns:
        Audio with fades applied
    """
    audio = audio.copy()

    # Fade in
    fade_in_samples = int(fade_in * sample_rate)
    if 0 < fade_in_samples < len(audio):
        fade_in_curve = np.linspace(0, 1, fade_in_samples)
        audio[:fade_in_samples] *= fade_in_curve

    # Fade out
    fade_out_samples = int(fade_out * sample_rate)
    if 0 < fade_out_samples < len(audio):
        fade_out_curve = np.linspace(1, 0, fade_out_samples)
        audio[-fade_out_samples:] *= fade_out_curve

    return audio
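A minimal usage sketch for the mixer, assuming three mono float tracks at 44.1 kHz (the sine tones are placeholders for synthesized voices):

import numpy as np
from backend.audio_mixer import mix_voices, apply_fade

sr = 44100
t = np.linspace(0, 2.0, 2 * sr, endpoint=False)
# Three placeholder "voices": a C-major triad of sine tones
voices = [0.3 * np.sin(2 * np.pi * f * t) for f in (261.63, 329.63, 392.00)]

mixed = mix_voices(voices, method="average", normalize=True)
mixed = apply_fade(mixed, fade_in=0.02, fade_out=0.05, sample_rate=sr)
print(mixed.shape, float(np.max(np.abs(mixed))))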
backend/config.py ADDED
@@ -0,0 +1,150 @@
"""
Configuration and Model Management
Implements lazy loading to save memory in a CPU-only environment
"""

import os
import torch

# ============================================================================
# Environment Optimization (CPU)
# ============================================================================

os.environ["OMP_NUM_THREADS"] = "4"
os.environ["TORCH_NUM_THREADS"] = "4"
os.environ["MKL_NUM_THREADS"] = "4"


# ============================================================================
# Global Model Instance (Lazy Loading)
# ============================================================================

_model = None


def get_model():
    """
    Lazily load the SoulX-Singer model.
    Avoids loading on startup to save memory.

    Returns:
        SoulX-Singer model instance
    """
    global _model

    if _model is None:
        print("Loading SoulX-Singer model on CPU...")

        # Import the model from the soulxsinger directory
        import sys
        base_path = os.path.dirname(__file__)
        soulx_path = os.path.join(base_path, '..', 'soulxsinger')
        cli_path = os.path.join(base_path, '..', 'cli')

        # Add paths to sys.path
        if os.path.exists(soulx_path):
            sys.path.insert(0, os.path.dirname(soulx_path))
        if os.path.exists(cli_path):
            sys.path.insert(0, os.path.dirname(cli_path))

        from cli.inference import SoulX_Singer

        # Check for model weights - auto-download if missing
        model_weights_path = os.path.join(base_path, '..', 'pretrained_models', 'SoulX-Singer', 'model.pt')

        if not os.path.exists(model_weights_path):
            print("⚠️ Model weights not found!")
            print("🔄 Attempting automatic download from HuggingFace Hub...")

            try:
                # Install huggingface-hub if not already installed
                # (invoke pip via the current interpreter to avoid PATH issues)
                import subprocess
                subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'huggingface-hub'])

                # Download model weights
                from huggingface_hub import snapshot_download
                model_dir = os.path.join(base_path, '..', 'pretrained_models', 'SoulX-Singer')
                os.makedirs(model_dir, exist_ok=True)

                print("⬇️ Downloading SoulX-Singer model (~1.5GB)...")
                snapshot_download(
                    repo_id='Soul-AILab/SoulX-Singer',
                    local_dir=model_dir,
                    local_dir_use_symlinks=False,
                    ignore_patterns=['*.md', '*.txt', 'LICENSE', 'config/**', 'utils/**', 'scripts/**']
                )
                print("✅ Model downloaded successfully!")

            except Exception as e:
                print(f"❌ Auto-download failed: {e}")
                print("Please manually download model.pt from:")
                print("https://huggingface.co/Soul-AILab/SoulX-Singer")
                print("And place it at: pretrained_models/SoulX-Singer/model.pt")
                raise FileNotFoundError("Model weights not found and auto-download failed. See instructions above.")

        # Load with INT8 quantization for CPU optimization
        _model = SoulX_Singer(
            config_path=os.path.join(soulx_path, "config", "soulxsinger.yaml"),
            checkpoint_path=model_weights_path,
            device='cpu',
            dtype=torch.int8  # INT8 quantization
        )

        print("✅ Model loaded successfully!")

    return _model


def clear_model():
    """
    Clear the model from memory.
    Call this when generation is complete to free resources.
    """
    global _model

    if _model is not None:
        del _model
        _model = None

        import gc
        gc.collect()

        print("✅ Model memory cleared")


def get_default_voice_path() -> str:
    """
    Get the path to the default voice samples (child voice).

    Returns:
        Path to the DefaultVoice_Child directory
    """
    return os.path.join(os.path.dirname(__file__), '..', 'DefaultVoice_Child')


def get_cpu_warning() -> str:
    """
    Get the CPU environment warning message.

    Returns:
        Warning text about CPU generation time
    """
    return "CPU Environment: Generation may take 5-10 min per second of audio"


# ============================================================================
# Model Inference Settings
# ============================================================================

INFERENCE_CONFIG = {
    'n_steps': 12,            # Reduced steps for CPU (default 32)
    'cfg': 3.0,               # CFG scale
    'control': 'score',       # Score-controlled mode
    'use_fp16': False,        # FP16 not supported on CPU
    'segment_duration': 8.0   # Max segment duration (seconds)
}


def get_inference_config():
    """Get the default inference configuration for CPU"""
    return INFERENCE_CONFIG.copy()
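A short usage sketch for this module; the commented-out get_model() call is deliberate, since actually invoking it triggers the weight download described above:

from backend.config import get_inference_config, get_cpu_warning, clear_model

print(get_cpu_warning())

cfg = get_inference_config()   # returns a copy, so it is safe to mutate
cfg['n_steps'] = 16            # e.g. trade speed for quality on a faster CPU

# model = get_model()          # deferred until generation actually starts
clear_model()                  # no-op if the model was never loaded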
backend/denoise.py ADDED
@@ -0,0 +1,77 @@
"""
Denoise Module
Provides optional audio denoising using noisereduce
"""

import numpy as np
from typing import Dict, Optional


def denoise_audio(
    audio: np.ndarray,
    sample_rate: int = 44100,
    prop_decrease: float = 0.5,
    stationary: bool = True
) -> np.ndarray:
    """
    Apply noise reduction to audio.

    Args:
        audio: Audio array
        sample_rate: Sample rate
        prop_decrease: Proportion of noise to decrease (0.0-1.0)
        stationary: Whether the noise is stationary

    Returns:
        Denoised audio
    """
    try:
        import noisereduce as nr

        denoised = nr.reduce_noise(
            y=audio,
            sr=sample_rate,
            prop_decrease=prop_decrease,
            stationary=stationary
        )

        return denoised

    except ImportError:
        print("Warning: noisereduce not installed, returning original audio")
        return audio
    except Exception as e:
        print(f"Error in denoise_audio: {e}")
        return audio


def detect_noise_profile(
    audio: np.ndarray,
    sample_rate: int = 44100,
    noise_duration: float = 0.5
) -> Optional[Dict]:
    """
    Detect a noise profile from the beginning of the audio.

    Args:
        audio: Audio array
        sample_rate: Sample rate
        noise_duration: Duration to analyze for noise (seconds)

    Returns:
        Dict with 'rms' and 'segment' keys, or None if the audio is too short
    """
    noise_samples = int(noise_duration * sample_rate)

    if len(audio) < noise_samples:
        return None

    noise_segment = audio[:noise_samples]

    # Calculate noise statistics
    noise_rms = np.sqrt(np.mean(noise_segment ** 2))

    return {
        'rms': noise_rms,
        'segment': noise_segment
    }
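A quick usage sketch, assuming a synthetic noisy tone as input (denoise_audio degrades gracefully to a pass-through when noisereduce is missing):

import numpy as np
from backend.denoise import denoise_audio, detect_noise_profile

sr = 44100
t = np.linspace(0, 1.0, sr, endpoint=False)
noisy = 0.2 * np.sin(2 * np.pi * 440 * t) + 0.02 * np.random.randn(sr)

profile = detect_noise_profile(noisy, sample_rate=sr)
if profile is not None:
    print(f"RMS of the first 0.5 s: {profile['rms']:.4f}")

clean = denoise_audio(noisy, sample_rate=sr, prop_decrease=0.5)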
backend/i18n.py ADDED
@@ -0,0 +1,113 @@
"""
Internationalization (i18n) Module
Supports English, Chinese, and Japanese
"""

import json
import os
from typing import Dict


class I18n:
    """Multi-language support class"""

    def __init__(self, default_lang: str = 'en'):
        """
        Initialize the i18n module.

        Args:
            default_lang: Default language code ('en', 'zh', 'ja')
        """
        self.current_lang = default_lang
        self.translations = self._load_translations()

    def _load_translations(self) -> Dict:
        """Load translation files from the locales directory"""
        translations = {}
        locales_dir = os.path.join(os.path.dirname(__file__), '..', 'locales')

        if not os.path.exists(locales_dir):
            print(f"Warning: Locales directory not found: {locales_dir}")
            return self._get_default_translations()

        for lang_file in os.listdir(locales_dir):
            if lang_file.endswith('.json'):
                lang_code = lang_file.replace('.json', '')
                file_path = os.path.join(locales_dir, lang_file)

                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        translations[lang_code] = json.load(f)
                except Exception as e:
                    print(f"Error loading {lang_file}: {e}")

        # Fall back to the defaults if no translations loaded
        if not translations:
            translations = self._get_default_translations()

        return translations

    def _get_default_translations(self) -> Dict:
        """Get default English translations (fallback)"""
        return {
            'en': {
                'title': '🎵 SolfegeScoreSinger - AI Singing Synthesis',
                'record_tab': 'Record Samples',
                'upload_tab': 'Upload Score',
                'config_tab': 'Configuration',
                'generate_tab': 'Generate & Download',
                'syllables': ['Do', 'Re', 'Mi', 'Fa', 'Sol', 'La', 'Ti'],
                'record_instruction': 'Record 7 solfege syllables to clone your voice',
                'upload_score': 'Upload Score (MIDI/MusicXML)',
                'voice_mode': 'Voice Mode',
                'my_recording': 'My Recording',
                'child_voice': 'Child Voice (Built-in)',
                'solfege_mode': 'Solfege Mode',
                'movable_do': 'Movable Do (首调)',
                'fixed_do': 'Fixed Do (固定调)',
                'denoise': 'Enable Denoising',
                'denoise_note': 'Default: No denoising (fidelity priority)',
                'generate': 'Generate Audio',
                'download': 'Download Audio',
                'cpu_warning': 'CPU Environment: Generation may take 5-10 min per second of audio',
                'validating': 'Validating inputs...',
                'parsing_score': 'Parsing score...',
                'preparing_samples': 'Preparing voice samples...',
                'generating_metadata': 'Generating metadata...',
                'loading_model': 'Loading AI model...',
                'mixing_voices': 'Mixing voices...',
                'saving_audio': 'Saving audio file...'
            }
        }

    def set_language(self, lang: str):
        """Set the current language"""
        if lang in self.translations:
            self.current_lang = lang
        else:
            print(f"Warning: Language '{lang}' not found, using default")

    def t(self, key: str) -> str:
        """
        Translate a key into the current language.

        Args:
            key: Translation key (supports nested keys with '.')

        Returns:
            Translated text, or the key itself if no translation exists
        """
        keys = key.split('.')
        value = self.translations.get(self.current_lang, {})

        for k in keys:
            if isinstance(value, dict):
                value = value.get(k, key)
            else:
                return key

        return value if isinstance(value, str) else key

    def get_all_texts(self) -> Dict:
        """Get all texts for the current language"""
        return self.translations.get(self.current_lang, self._get_default_translations()['en'])
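A usage sketch; with no locales/ directory present, the class falls back to the built-in English strings shown above:

from backend.i18n import I18n

i18n = I18n(default_lang='en')
print(i18n.t('title'))        # '🎵 SolfegeScoreSinger - AI Singing Synthesis'
print(i18n.t('no.such.key'))  # missing keys fall back to the key itself

i18n.set_language('zh')       # takes effect only if locales/zh.json was loaded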
backend/metadata_generator.py ADDED
@@ -0,0 +1,236 @@
"""
Metadata Generator Module
Generates SoulX-Singer metadata from the score and voice samples
"""

import os
import numpy as np
import soundfile as sf
from typing import Dict, List, Optional
from .score_parser import SOLFEGE_SYLLABLES


def prepare_voice_samples(
    voice_mode: str,
    user_samples: Optional[Dict[str, str]],
    enable_denoise: bool
) -> Dict[str, np.ndarray]:
    """
    Prepare voice samples for synthesis.

    Args:
        voice_mode: "My Recording" or "Child Voice"
        user_samples: Dict mapping syllable to audio file path
        enable_denoise: Whether to apply denoising

    Returns:
        Dict mapping syllable to audio array
    """
    from .config import get_default_voice_path

    samples = {}

    # The UI passes a localized label, so match all three languages
    if voice_mode in ("Child Voice (Built-in)", "童声音色 (内置)", "子供の声 (内蔵)"):
        # Load the default voice
        default_path = get_default_voice_path()

        for syllable in SOLFEGE_SYLLABLES:
            # Capitalize the first letter for the filename
            filename = syllable.capitalize() + '.wav'
            file_path = os.path.join(default_path, filename)

            if os.path.exists(file_path):
                audio, sr = sf.read(file_path)
                samples[syllable] = audio
            else:
                print(f"Warning: Default voice file not found: {file_path}")

    elif user_samples:
        # Load user-recorded samples
        for syllable in SOLFEGE_SYLLABLES:
            file_path = user_samples.get(syllable)

            if file_path and os.path.exists(file_path):
                audio, sr = sf.read(file_path)

                # Apply denoising if enabled
                if enable_denoise:
                    audio = apply_denoise(audio, sr)

                samples[syllable] = audio

    return samples


def apply_denoise(audio: np.ndarray, sample_rate: int) -> np.ndarray:
    """
    Apply conservative denoising using noisereduce.

    Args:
        audio: Audio array
        sample_rate: Sample rate

    Returns:
        Denoised audio
    """
    try:
        import noisereduce as nr
        return nr.reduce_noise(y=audio, sr=sample_rate, prop_decrease=0.5)
    except ImportError:
        print("Warning: noisereduce not installed, skipping denoising")
        return audio


def generate_metadata_for_voices(
    voices: List[Dict],
    voice_samples: Dict[str, np.ndarray]
) -> List[Dict]:
    """
    Generate SoulX-Singer metadata for each voice.

    Args:
        voices: List of voice data from the score parser
        voice_samples: Dict of syllable -> audio array

    Returns:
        List of metadata dicts for SoulX-Singer
    """
    metadata_list = []

    for voice in voices:
        notes = voice['notes']

        # Create prompt audio by concatenating solfege samples
        prompt_audio = create_prompt_audio(notes, voice_samples)

        # Create target metadata
        target_metadata = create_target_metadata(notes)

        metadata = {
            'voice_id': voice['id'],
            'instrument': voice['instrument'],
            'prompt_audio': prompt_audio,
            'target': target_metadata
        }

        metadata_list.append(metadata)

    return metadata_list


def create_prompt_audio(notes: List[Dict], voice_samples: Dict[str, np.ndarray]) -> np.ndarray:
    """
    Create prompt audio by concatenating voice samples.

    Strategy:
    - Use the first few notes' solfege to create a representative prompt
    - Aim for ~3-5 seconds of prompt audio

    Args:
        notes: List of notes for this voice
        voice_samples: Dict of syllable -> audio array

    Returns:
        Concatenated prompt audio
    """
    # Collect unique solfeges from the first few notes
    solfeges = []
    for note in notes[:10]:
        solfege = note['solfege']
        if solfege not in solfeges and solfege in voice_samples:
            solfeges.append(solfege)

    # Use at least 3 different syllables
    if len(solfeges) < 3:
        for syllable in SOLFEGE_SYLLABLES:
            if syllable not in solfeges and syllable in voice_samples:
                solfeges.append(syllable)
            if len(solfeges) >= 3:
                break

    # Concatenate samples with small gaps
    prompt_segments = []
    for syllable in solfeges[:5]:
        if syllable in voice_samples:
            sample = voice_samples[syllable]
            prompt_segments.append(sample)

            # Add a small gap (50 ms of silence)
            gap = np.zeros(int(44100 * 0.05))
            prompt_segments.append(gap)

    if prompt_segments:
        return np.concatenate(prompt_segments)

    # Fallback: use the first available sample
    if voice_samples:
        return next(iter(voice_samples.values()))

    return np.zeros(44100)  # 1 second of silence


def create_target_metadata(notes: List[Dict]) -> Dict:
    """
    Create target metadata for SoulX-Singer.

    Args:
        notes: List of notes

    Returns:
        Target metadata dict
    """
    # Convert notes to the SoulX format
    phonemes = []
    note_pitches = []
    note_durations = []
    note_types = []

    for note in notes:
        solfege = note['solfege']
        midi_num = note['midi']
        duration = note['duration']

        # Phoneme (simplified - just use the solfege name)
        phoneme = solfege_to_phoneme(solfege)
        phonemes.append(phoneme)

        # Pitch
        note_pitches.append(midi_num)

        # Duration in frames (assume 256 samples per frame at 44.1 kHz)
        note_durations.append(int(duration * 44100 / 256))

        # Note type (1 = regular)
        note_types.append(1)

    return {
        'phoneme': phonemes,
        'note_pitch': note_pitches,
        'note_duration': note_durations,
        'note_type': note_types,
        'duration': sum(note['duration'] for note in notes)
    }


def solfege_to_phoneme(solfege: str) -> str:
    """
    Convert a solfege syllable to a phoneme string.

    Args:
        solfege: Solfege syllable (do, re, mi, fa, sol, la, ti)

    Returns:
        Phoneme string
    """
    # ARPAbet phonemes (simplified)
    SOLFEGE_TO_PHONEME = {
        'do': 'd ow',
        're': 'r ey',
        'mi': 'm iy',
        'fa': 'f aa',
        'sol': 's ow l',
        'la': 'l aa',
        'ti': 't iy'
    }

    return SOLFEGE_TO_PHONEME.get(solfege, 'd ow')
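A usage sketch with a hypothetical hand-built voice and silent placeholder samples standing in for real recordings; the expected outputs follow from the frame math above (int(0.5 * 44100 / 256) = 86):

import numpy as np
from backend.metadata_generator import generate_metadata_for_voices

# Hypothetical parsed voice: three notes in C major, 0.5 s each
voices = [{
    'id': 0,
    'instrument': 'Voice 1',
    'notes': [
        {'midi': 60, 'solfege': 'do', 'start': 0.0, 'duration': 0.5},
        {'midi': 62, 'solfege': 're', 'start': 0.5, 'duration': 0.5},
        {'midi': 64, 'solfege': 'mi', 'start': 1.0, 'duration': 0.5},
    ],
}]
samples = {s: np.zeros(22050) for s in ('do', 're', 'mi')}  # 0.5 s placeholders

meta = generate_metadata_for_voices(voices, samples)
print(meta[0]['target']['phoneme'])        # ['d ow', 'r ey', 'm iy']
print(meta[0]['target']['note_duration'])  # [86, 86, 86] frames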
backend/multi_voice_engine.py ADDED
@@ -0,0 +1,213 @@
"""
Multi-Voice Engine Module
Handles SoulX-Singer model inference for multiple voices
Implements segment-based processing for long scores
"""

import numpy as np
import torch
from typing import Dict, List, Optional, Callable
import gc

from .config import get_inference_config


class MultiVoiceEngine:
    """
    Multi-voice synthesis engine using SoulX-Singer.

    Features:
    - Segment-based processing for long scores (≤8s per segment)
    - Memory management with garbage collection
    - Progress callback support
    """

    def __init__(self, model):
        """
        Initialize the engine with a SoulX-Singer model.

        Args:
            model: SoulX-Singer model instance
        """
        self.model = model
        self.config = get_inference_config()

    def generate_single_voice(
        self,
        metadata: Dict,
        on_progress: Optional[Callable[[float], None]] = None
    ) -> np.ndarray:
        """
        Generate audio for a single voice.

        Args:
            metadata: Voice metadata from metadata_generator
            on_progress: Progress callback function

        Returns:
            Generated audio array
        """
        target = metadata['target']
        prompt_audio = metadata['prompt_audio']

        # Check whether segmentation is needed
        total_duration = target['duration']
        segment_duration = self.config['segment_duration']

        if total_duration <= segment_duration:
            # Single segment
            return self._generate_segment(prompt_audio, target, on_progress)
        else:
            # Multiple segments
            return self._generate_segments(prompt_audio, target, on_progress)

    def _generate_segment(
        self,
        prompt_audio: np.ndarray,
        target: Dict,
        on_progress: Optional[Callable[[float], None]] = None
    ) -> np.ndarray:
        """
        Generate a single segment (≤8 seconds).

        Args:
            prompt_audio: Prompt audio array
            target: Target metadata
            on_progress: Progress callback

        Returns:
            Generated audio for this segment
        """
        try:
            # Prepare model input; the prompt-side features are truncated with a
            # rough len(prompt_audio)//100 heuristic to match the prompt length
            infer_data = {
                'prompt': {
                    'waveform': torch.from_numpy(prompt_audio).float(),
                    'phoneme': self._phonemes_to_tensor(target['phoneme'][:len(prompt_audio)//100]),
                    'note_pitch': torch.tensor(target['note_pitch'][:len(prompt_audio)//100]),
                    'note_type': torch.tensor(target['note_type'][:len(prompt_audio)//100])
                },
                'target': {
                    'phoneme': self._phonemes_to_tensor(target['phoneme']),
                    'note_pitch': torch.tensor(target['note_pitch']),
                    'note_type': torch.tensor(target['note_type'])
                }
            }

            # Run inference
            with torch.no_grad():
                output = self.model.infer(
                    infer_data,
                    auto_shift=False,
                    pitch_shift=0,
                    n_steps=self.config['n_steps'],
                    cfg=self.config['cfg'],
                    control=self.config['control'],
                    use_fp16=self.config['use_fp16']
                )

            # Clean up
            del infer_data
            gc.collect()

            if on_progress:
                on_progress(100.0)

            return output.cpu().numpy() if torch.is_tensor(output) else output

        except Exception as e:
            print(f"Error in _generate_segment: {e}")
            # Fallback: return silence
            duration = target.get('duration', 1.0)
            return np.zeros(int(44100 * duration))

    def _generate_segments(
        self,
        prompt_audio: np.ndarray,
        target: Dict,
        on_progress: Optional[Callable[[float], None]] = None
    ) -> np.ndarray:
        """
        Generate multiple segments and concatenate them.

        Args:
            prompt_audio: Prompt audio
            target: Target metadata
            on_progress: Progress callback

        Returns:
            Concatenated generated audio
        """
        total_duration = target['duration']
        segment_duration = self.config['segment_duration']
        num_segments = int(np.ceil(total_duration / segment_duration))

        segments = []

        for i in range(num_segments):
            # Extract segment metadata
            start_time = i * segment_duration
            end_time = min((i + 1) * segment_duration, total_duration)

            segment_target = self._extract_segment(target, start_time, end_time)

            # Generate this segment
            segment_audio = self._generate_segment(prompt_audio, segment_target)
            segments.append(segment_audio)

            # Update progress
            if on_progress:
                progress = (i + 1) / num_segments * 100
                on_progress(progress)

            # Memory cleanup
            gc.collect()

        # Concatenate segments
        return np.concatenate(segments)

    def _extract_segment(
        self,
        target: Dict,
        start_time: float,
        end_time: float
    ) -> Dict:
        """
        Extract a time segment from target metadata.

        Args:
            target: Full target metadata
            start_time: Segment start time (seconds)
            end_time: Segment end time (seconds)

        Returns:
            Segment metadata
        """
        # Simplified: just return the full target for now
        # TODO: Implement proper time-based extraction
        return {
            'phoneme': target['phoneme'],
            'note_pitch': target['note_pitch'],
            'note_type': target['note_type'],
            'duration': end_time - start_time
        }

    def _phonemes_to_tensor(self, phonemes: List[str]) -> torch.Tensor:
        """
        Convert a phoneme list to a tensor.

        Args:
            phonemes: List of phoneme strings

        Returns:
            Phoneme tensor
        """
        # Simplified: convert to indices
        # TODO: Use a proper phoneme vocabulary
        phoneme_to_idx = {
            'd ow': 0, 'r ey': 1, 'm iy': 2, 'f aa': 3,
            's ow l': 4, 'l aa': 5, 't iy': 6
        }

        indices = [phoneme_to_idx.get(p, 0) for p in phonemes]
        return torch.tensor(indices, dtype=torch.long)
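A hedged orchestration sketch tying the modules together; render_all_voices is a hypothetical helper, and it assumes metadata_list comes from generate_metadata_for_voices and that the model weights are available:

from typing import Dict, List
import numpy as np
from backend.config import get_model, clear_model
from backend.multi_voice_engine import MultiVoiceEngine
from backend.audio_mixer import mix_voices

def render_all_voices(metadata_list: List[Dict]) -> np.ndarray:
    """Render every voice with one shared engine, then mix into a single track."""
    engine = MultiVoiceEngine(get_model())
    voice_audios = []
    for meta in metadata_list:
        audio = engine.generate_single_voice(
            meta,
            on_progress=lambda p, v=meta['voice_id']: print(f"voice {v}: {p:.0f}%")
        )
        voice_audios.append(audio)
    clear_model()  # free model memory once all voices are rendered
    return mix_voices(voice_audios, method="sum", normalize=True)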
backend/score_parser.py ADDED
@@ -0,0 +1,251 @@
"""
Score Parser Module
Supports MIDI and MusicXML formats
Implements key detection and solfege mapping
"""

import os
from typing import Dict

# Solfege syllables (CORRECTED: 'sol', not 'so')
SOLFEGE_SYLLABLES = ['do', 're', 'mi', 'fa', 'sol', 'la', 'ti']

# Reference pitches in Hz (C4 octave)
REFERENCE_PITCHES = {
    'do': 261.63, 're': 293.66, 'mi': 329.63, 'fa': 349.23,
    'sol': 392.00, 'la': 440.00, 'ti': 493.88
}


def quick_parse_score(file_path: str) -> Dict:
    """
    Quickly parse a score for basic info (duration, voice count).
    Used for time estimation.

    Args:
        file_path: Path to a MIDI or MusicXML file

    Returns:
        {
            'duration': float (seconds),
            'voice_count': int,
            'key': str
        }
    """
    try:
        # Try music21 first
        from music21 import converter

        score = converter.parse(file_path)
        # 0.5 s per quarter note: rough estimate at 120 BPM
        duration = score.duration.quarterLength / 2
        voice_count = len(score.parts) if hasattr(score, 'parts') else 1

        # Key detection
        key_analysis = score.analyze('key')
        key_name = f"{key_analysis.tonic.name} {key_analysis.mode}"

        return {
            'duration': max(duration, 10),  # Minimum 10 s
            'voice_count': max(voice_count, 1),
            'key': key_name
        }

    except Exception as e:
        print(f"Error in quick_parse_score: {e}")
        # Fallback
        return {
            'duration': 30,
            'voice_count': 1,
            'key': 'C major'
        }


def parse_score_with_solfege(file_path: str, mode: str = "movable") -> Dict:
    """
    Parse a score and generate a solfege mapping.

    Args:
        file_path: Path to a MIDI or MusicXML file
        mode: "movable" or "fixed"

    Returns:
        {
            'key': str,
            'duration': float,
            'voices': List[Dict],
            'solfege_table': List[List]  # For the Gradio Dataframe
        }
    """
    from music21 import converter, note

    try:
        score = converter.parse(file_path)

        # Detect key
        key_analysis = score.analyze('key')
        key_name = f"{key_analysis.tonic.name} {key_analysis.mode}"
        key_fifths = key_analysis.sharps

        # Extract voices
        voices = []
        solfege_table = []

        for part_idx, part in enumerate(score.parts):
            voice_notes = []

            for element in part.flatten().notes:
                if isinstance(element, note.Note):
                    # Get the MIDI number
                    midi_num = element.pitch.midi

                    # Map to solfege
                    if mode == "movable":
                        solfege = midi_to_solfege_movable(midi_num, key_fifths)
                    else:
                        solfege = midi_to_solfege_fixed(midi_num)

                    # Get measure and beat
                    measure = element.measureNumber or 1
                    beat = element.beat or 1

                    # Duration in seconds (assume 120 BPM)
                    duration = element.duration.quarterLength * 0.5

                    voice_notes.append({
                        'midi': midi_num,
                        'solfege': solfege,
                        'start': element.offset,
                        'duration': duration,
                        'measure': measure,
                        'beat': beat
                    })

                    # Add to the correction table (first 20 notes)
                    if len(solfege_table) < 20:
                        solfege_table.append([
                            len(solfege_table) + 1,
                            measure,
                            f"{beat:.1f}",
                            solfege,
                            ""  # User correction
                        ])

            voices.append({
                'id': part_idx,
                'instrument': part.partName or f"Voice {part_idx + 1}",
                'notes': voice_notes
            })

        # Total duration
        total_duration = score.duration.quarterLength * 0.5

        return {
            'key': key_name,
            'duration': total_duration,
            'voices': voices,
            'solfege_table': solfege_table
        }

    except Exception as e:
        print(f"Error parsing score: {e}")
        raise


def parse_score_with_correction(file_path: str, mode: str = "movable", corrections=None) -> Dict:
    """
    Parse a score with optional user corrections.

    Args:
        file_path: Path to the score file
        mode: "movable" or "fixed"
        corrections: Gradio Dataframe rows with corrections

    Returns:
        Same as parse_score_with_solfege
    """
    result = parse_score_with_solfege(file_path, mode)

    # Apply corrections if provided
    if corrections is not None and len(corrections) > 0:
        for row in corrections:
            if len(row) >= 5 and row[4]:  # Has a correction
                note_idx = int(row[0]) - 1
                corrected_solfege = row[4].lower()

                if corrected_solfege in SOLFEGE_SYLLABLES:
                    # Apply to the first voice (simplified)
                    if result['voices'] and note_idx < len(result['voices'][0]['notes']):
                        result['voices'][0]['notes'][note_idx]['solfege'] = corrected_solfege

    return result


def midi_to_solfege_fixed(midi_num: int) -> str:
    """
    Convert a MIDI note to solfege using Fixed Do.
    Based on pitch class, not letter name (simplified).

    Args:
        midi_num: MIDI note number (0-127)

    Returns:
        Solfege syllable
    """
    pitch_class = midi_num % 12

    # Map pitch class to solfege (Fixed Do)
    PITCH_CLASS_TO_SOLFEGE = {
        0: 'do',    # C
        1: 'do',    # C#/Db -> do
        2: 're',    # D
        3: 're',    # D#/Eb -> re
        4: 'mi',    # E
        5: 'fa',    # F
        6: 'fa',    # F#/Gb -> fa
        7: 'sol',   # G
        8: 'sol',   # G#/Ab -> sol
        9: 'la',    # A
        10: 'la',   # A#/Bb -> la
        11: 'ti'    # B
    }

    return PITCH_CLASS_TO_SOLFEGE.get(pitch_class, 'do')


def midi_to_solfege_movable(midi_num: int, key_fifths: int) -> str:
    """
    Convert a MIDI note to solfege using Movable Do.
    Based on the scale degree relative to the key.

    Args:
        midi_num: MIDI note number
        key_fifths: Key signature fifths (0=C, 1=G, -1=F, etc.)

    Returns:
        Solfege syllable
    """
    # Calculate the tonic pitch class by walking the circle of fifths
    tonic_pitch_class = ((key_fifths * 7) % 12 + 12) % 12

    # Calculate the scale degree (semitones above the tonic)
    pitch_class = midi_num % 12
    scale_degree = (pitch_class - tonic_pitch_class + 12) % 12

    # Map scale degree to solfege (chromatic notes fold onto a neighboring diatonic syllable)
    SCALE_DEGREE_TO_SOLFEGE = {
        0: 'do',    # Tonic
        1: 'do',    # Minor 2nd
        2: 're',    # Major 2nd
        3: 're',    # Minor 3rd
        4: 'mi',    # Major 3rd
        5: 'fa',    # Perfect 4th
        6: 'fa',    # Tritone
        7: 'sol',   # Perfect 5th
        8: 'sol',   # Minor 6th
        9: 'la',    # Major 6th
        10: 'la',   # Minor 7th
        11: 'ti'    # Major 7th
    }

    return SCALE_DEGREE_TO_SOLFEGE.get(scale_degree, 'do')
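A quick sanity check of the movable-do arithmetic. For D major, key_fifths = 2, so the tonic pitch class is (2 * 7) mod 12 = 2 (D); MIDI 66 (F#4, pitch class 6) sits 4 semitones above the tonic and maps to 'mi':

from backend.score_parser import midi_to_solfege_movable, midi_to_solfege_fixed

# D major (two sharps): F#4 is the 3rd scale degree
print(midi_to_solfege_movable(66, key_fifths=2))  # 'mi'
print(midi_to_solfege_fixed(66))                  # 'fa' (F#/Gb folds onto fa)

# The tonic always maps to 'do' under movable do
print(midi_to_solfege_movable(62, key_fifths=2))  # 'do' (D4)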