# Source: HuggingFace Space (JeffreyZhou798) — "Update app.py", commit a7196b6 (verified)
"""
SolfegeScoreSinger - AI Singing Synthesis System
Gradio Web Application with Async Architecture
Features:
- Zero-shot voice cloning with 7 solfege syllables (Do/Re/Mi/Fa/Sol/La/Ti)
- Multi-voice synthesis and mixing
- Support for MIDI and MusicXML formats
- Movable Do / Fixed Do solfege modes
- Multi-language interface (English, Chinese, Japanese)
- HuggingFace Spaces deployment (Docker SDK + CPU Basic)
Author: SolfegeScoreSinger Team
Version: 2.0.1
"""
import gradio as gr
import os
import tempfile
import time
from typing import Dict, List, Optional, Tuple
import json
import subprocess
import librosa
# Backend modules
from backend.i18n import I18n
from backend.config import get_model, get_default_voice_path, AUDIO_CONFIG
# ============================================================================
# Multi-language Support
# ============================================================================
# Shared translator instance; the active language is switched per request
# via i18n.set_language() in create_interface().
i18n = I18n(default_lang='en')


def get_ui_texts() -> Dict:
    """Return the UI string table for the currently active language."""
    all_texts = i18n.get_all_texts()
    return all_texts
# ============================================================================
# Score Processing Functions
# ============================================================================
def estimate_generation_time(score_file) -> str:
    """Estimate wall-clock generation time for an uploaded score.

    The heuristic assumes CPU-only synthesis at roughly 8 minutes of
    compute per second of score audio, per voice.

    Args:
        score_file: Uploaded file object exposing a ``.name`` path
            attribute, or None when no score has been provided.

    Returns:
        A human-readable estimate such as "~24 minutes" or "~2h 5m";
        "N/A" when no file is given, and a generic default when the
        score cannot be parsed for any reason.
    """
    if score_file is None:
        return "N/A"
    try:
        from backend.score_parser import quick_parse_score
        info = quick_parse_score(score_file.name)
        # CPU estimation: ~8 minutes of compute per second of audio per voice.
        total_minutes = info['duration'] * info['voice_count'] * 8
        if total_minutes < 60:
            return f"~{int(total_minutes)} minutes"
        hours, mins = divmod(int(total_minutes), 60)
        return f"~{hours}h {mins}m"
    except Exception:
        # Best-effort: any parsing/import failure falls back to a default.
        return "~10 minutes (default)"
def on_score_upload(score_file, solfege_mode: str):
    """Handle a score upload: parse it and build the solfege table.

    Returns:
        (detected_key, estimated_time, solfege_table). Placeholder values
        are returned when no file is given; on failure the first slot
        carries an error message.
    """
    if score_file is None:
        return "", "N/A", None
    try:
        from backend.score_parser import parse_score_with_solfege
        # Movable-Do is selected when the localized label contains any of
        # the movable markers (zh / en / ja); otherwise Fixed-Do.
        is_movable = any(tag in solfege_mode for tag in ("首调", "Movable", "移動"))
        result = parse_score_with_solfege(
            score_file.name,
            mode="movable" if is_movable else "fixed"
        )
        return result['key'], estimate_generation_time(score_file), result['solfege_table']
    except Exception as e:
        return f"Error: {str(e)}", "N/A", None
def apply_solfege_correction(solfege_table):
    """Acknowledge user edits to the solfege table.

    The corrected table itself is consumed later at generation time; this
    handler only produces a confirmation message for the status textbox.
    """
    confirmation = "✅ Solfege corrections applied"
    return confirmation
# ============================================================================
# Reference Tone Generator
# ============================================================================
def generate_reference_tone(syllable: str):
    """Generate a reference tone for the given solfege syllable.

    Prefers the bundled child-voice sample; falls back to a synthesized
    sine wave at the syllable's pitch when no sample file is present.

    Args:
        syllable: Solfege syllable (do, re, mi, fa, sol, la, ti),
            case-insensitive.

    Returns:
        Tuple of (sample_rate, audio_data) for a Gradio Audio component.
    """
    import numpy as np

    syllable = syllable.lower()
    # Bundled sample lives at DefaultVoice_Child/<Syllable>.wav next to this file.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    ref_path = os.path.join(script_dir, "DefaultVoice_Child", f"{syllable.capitalize()}.wav")
    if os.path.exists(ref_path):
        # Import lazily so the sine-wave fallback below still works in
        # environments where soundfile is not installed.
        import soundfile as sf
        audio_data, sample_rate = sf.read(ref_path)
        return (sample_rate, audio_data)

    # Fallback: synthesize a 2-second sine wave. Frequencies are the C-major
    # scale around A4 = 440 Hz; unknown syllables default to C4 (do).
    REFERENCE_PITCHES = {
        'do': 261.63, 're': 293.66, 'mi': 329.63, 'fa': 349.23,
        'sol': 392.00, 'la': 440.00, 'ti': 493.88
    }
    freq = REFERENCE_PITCHES.get(syllable, 261.63)
    sample_rate = 44100
    duration = 2.0
    t = np.linspace(0, duration, int(sample_rate * duration), False)
    tone = np.sin(2 * np.pi * freq * t)
    # 100 ms linear fade in/out to avoid clicks; scale to a safe level (0.3).
    fade_samples = int(sample_rate * 0.1)
    envelope = np.ones_like(tone)
    envelope[:fade_samples] = np.linspace(0, 1, fade_samples)
    envelope[-fade_samples:] = np.linspace(1, 0, fade_samples)
    tone *= envelope * 0.3
    return (sample_rate, tone)
# ============================================================================
# Main Generation Function (Async with Progress)
# ============================================================================
def generate_audio_async(
    do_audio, re_audio, mi_audio, fa_audio, sol_audio, la_audio, ti_audio,
    score_file,
    voice_mode: str,
    solfege_mode: str,
    enable_denoise: bool,
    solfege_correction,
    progress=gr.Progress()  # Gradio 6.x: progress as default parameter
) -> Tuple[Optional[str], str, str]:
    """
    Run the full synthesis pipeline with incremental progress updates.

    Uses Gradio's native Progress API (SSE-based long connection), so
    long-running generations are not subject to HTTP request timeouts.

    Pipeline: validate inputs -> parse score -> prepare voice samples ->
    generate metadata -> load model -> synthesize each voice -> mix ->
    write WAV to a fresh temp directory.

    Args:
        do_audio..ti_audio: The 7 recorded solfege sample files (required
            only when voice_mode is the localized "My Recording" label).
        score_file: Uploaded MIDI or MusicXML file object.
        voice_mode: Localized "My Recording" or "Child Voice" label.
        solfege_mode: Localized "Movable Do" or "Fixed Do" label.
        enable_denoise: Whether to denoise the recorded samples.
        solfege_correction: User corrections from the solfege table.
        progress: Gradio progress object (auto-injected).

    Returns:
        (audio_path, status_message, progress_text); audio_path is None
        on any failure, and the error is reported via the text fields.
    """
    texts = get_ui_texts()
    try:
        # Step 1: Validate inputs (5%)
        if progress is not None:
            progress(0, desc=texts.get('validating', "Validating inputs..."))
        if score_file is None:
            return None, "❌ Error: Please upload a score file", "Failed"
        # Only the user-recording mode requires all 7 syllables to be present.
        if voice_mode == texts.get('my_recording', 'My Recording'):
            solfege_audios = [do_audio, re_audio, mi_audio, fa_audio, sol_audio, la_audio, ti_audio]
            if not all(solfege_audios):
                return None, "❌ Error: Please record all 7 solfege syllables", "Failed"
        # Step 2: Parse score (5-15%)
        if progress is not None:
            progress(0.05, desc=texts.get('parsing_score', "Parsing score..."))
        from backend.score_parser import parse_score_with_correction
        # Movable-Do if the localized label contains a movable marker (zh/en/ja).
        mode = "movable" if "首调" in solfege_mode or "Movable" in solfege_mode or "移動" in solfege_mode else "fixed"
        score_data = parse_score_with_correction(
            score_file.name,
            mode=mode,
            corrections=solfege_correction
        )
        # Step 3: Prepare voice samples (15-20%)
        if progress is not None:
            progress(0.15, desc=texts.get('preparing_samples', "Preparing voice samples..."))
        from backend.metadata_generator import prepare_voice_samples
        # User samples are only passed through in "My Recording" mode;
        # otherwise the backend falls back to its built-in voice.
        voice_samples = prepare_voice_samples(
            voice_mode,
            {
                'do': do_audio, 're': re_audio, 'mi': mi_audio, 'fa': fa_audio,
                'sol': sol_audio, 'la': la_audio, 'ti': ti_audio
            } if voice_mode == texts.get('my_recording', 'My Recording') else None,
            enable_denoise
        )
        # Step 4: Generate metadata (20-25%)
        if progress is not None:
            progress(0.20, desc=texts.get('generating_metadata', "Generating metadata..."))
        from backend.metadata_generator import generate_metadata_for_voices
        voices_metadata = generate_metadata_for_voices(score_data['voices'], voice_samples)
        # Step 5: Load model (25-30%)
        if progress is not None:
            progress(0.25, desc=texts.get('loading_model', "Loading AI model..."))
        model = get_model()
        # Step 6: Multi-voice generation (30-90%)
        from backend.multi_voice_engine import MultiVoiceEngine
        engine = MultiVoiceEngine(model)
        voice_audios = []
        total_voices = len(score_data['voices'])
        temp_dir = tempfile.mkdtemp(prefix="solfegescore_")
        output_path = os.path.join(temp_dir, "output.wav")
        for voice_idx, (voice, metadata) in enumerate(zip(score_data['voices'], voices_metadata)):
            # Each voice owns an equal slice of the 0.3-0.9 progress range.
            progress_base = 0.3 + (voice_idx / total_voices) * 0.6
            if progress is not None:
                progress(
                    progress_base,
                    desc=f"Generating voice {voice_idx + 1}/{total_voices}..."
                )
            # Per-voice progress callback. `p` appears to be a percentage
            # (0-100): both the /100 scaling and the "{p:.1f}%" label assume
            # this — confirm against MultiVoiceEngine.generate_single_voice.
            # Safe w.r.t. late binding: the callback is invoked before the
            # loop advances to the next voice_idx.
            def on_voice_progress(p):
                if progress is not None:
                    progress(
                        progress_base + p * 0.6 / total_voices / 100,
                        desc=f"Voice {voice_idx + 1}/{total_voices}: {p:.1f}%"
                    )
            voice_audio = engine.generate_single_voice(metadata, on_progress=on_voice_progress)
            voice_audios.append(voice_audio)
            # Free per-voice intermediates before starting the next voice
            # (important on CPU-Basic Spaces with limited RAM).
            import gc
            gc.collect()
        # Step 7: Mix voices (90-95%)
        if progress is not None:
            progress(0.90, desc=texts.get('mixing_voices', "Mixing voices..."))
        from backend.audio_mixer import mix_voices
        mixed_audio = mix_voices(voice_audios)
        # Step 8: Save result (95-100%)
        if progress is not None:
            progress(0.95, desc=texts.get('saving_audio', "Saving audio file..."))
        import soundfile as sf
        sf.write(output_path, mixed_audio, AUDIO_CONFIG['sample_rate'])
        # Complete
        if progress is not None:
            progress(1.0, desc="✅ Completed!")
        return (
            output_path,
            f"✅ Generated successfully! Duration: {score_data['duration']:.1f}s",
            f"✅ Completed!\n\nGenerated {total_voices} voice(s)\nTotal duration: {score_data['duration']:.1f}s\n\nDownload your audio below ⬇️"
        )
    except Exception as e:
        # All failures are reported through the UI rather than raised;
        # the traceback goes to the server log for debugging.
        import traceback
        error_msg = f"❌ Generation failed: {str(e)}"
        print(f"Error in generate_audio_async: {e}")
        print(traceback.format_exc())
        return None, error_msg, f"❌ Failed\n\nError: {str(e)}"
def apply_speed_change(audio_path: str, speed: float) -> str:
    """Time-stretch an audio file to a new speed without altering pitch.

    Args:
        audio_path: Path to the source audio file, or None.
        speed: Speed multiplier (0.5 = half speed, 2.0 = double speed).

    Returns:
        Path to the stretched file, or the original path (possibly None)
        when no change is needed or when processing fails.
    """
    # Nothing to do for unit speed or a missing file.
    if speed == 1.0 or audio_path is None:
        return audio_path
    try:
        samples, sr = librosa.load(audio_path, sr=None, mono=True)
        # Time-stretch: changes tempo while preserving pitch.
        stretched = librosa.effects.time_stretch(samples, rate=speed)
        import soundfile as sf
        stem = os.path.splitext(audio_path)[0]
        target = f"{stem}_speed_{speed:.1f}x.wav"
        sf.write(target, stretched, sr)
        return target
    except Exception as e:
        print(f"⚠️ Speed change failed: {e}")
        # Degrade gracefully: hand back the unmodified audio.
        return audio_path
def generate_audio_async_with_format(
    do_audio, re_audio, mi_audio, fa_audio, sol_audio, la_audio, ti_audio,
    score_file,
    voice_mode: str,
    solfege_mode: str,
    enable_denoise: bool,
    solfege_correction,
    output_format: str,
    playback_speed: float = 1.0,
    progress=gr.Progress()
) -> Tuple[Optional[str], str, str]:
    """Wrapper around generate_audio_async that applies a playback-speed
    change and optional MP3 conversion to the generated audio.

    Args:
        All args from generate_audio_async, plus:
        output_format: "WAV" or "MP3".
        playback_speed: Speed multiplier (0.5 - 2.0); 1.0 means no change.
        progress: Gradio progress object (auto-injected).

    Returns:
        (audio_path, status_message, progress_text)
    """
    # Run the core generation pipeline first.
    audio_path, status_msg, progress_text = generate_audio_async(
        do_audio, re_audio, mi_audio, fa_audio, sol_audio, la_audio, ti_audio,
        score_file, voice_mode, solfege_mode, enable_denoise, solfege_correction,
        progress
    )
    # Apply the time-stretch before any format conversion so the MP3 is
    # produced from the already-stretched audio.
    if audio_path and playback_speed != 1.0:
        audio_path = apply_speed_change(audio_path, playback_speed)
    if audio_path and output_format == "MP3":
        mp3_path = convert_to_mp3(audio_path)
        if mp3_path:
            return mp3_path, status_msg, progress_text
        # Fall back to WAV if conversion fails, and say so in the status.
        return audio_path, status_msg + " (MP3 conversion failed, provided WAV)", progress_text
    # BUG FIX: previously returned the original `result` tuple here, which
    # discarded the speed-changed path whenever the format was WAV.
    return audio_path, status_msg, progress_text
# ============================================================================
# Audio Format Conversion
# ============================================================================
def convert_to_mp3(wav_path: str) -> Optional[str]:
    """Convert a WAV file to MP3 using the ffmpeg CLI.

    Args:
        wav_path: Path to the WAV file.

    Returns:
        Path to the MP3 file, or None if conversion failed (missing
        ffmpeg, nonzero exit status, timeout, or missing output file).
    """
    try:
        # Swap only the extension. The previous wav_path.replace('.wav',
        # '.mp3') rewrote the FIRST '.wav' occurrence anywhere in the path
        # (e.g. '/tmp/x.wav.d/out.wav'), producing a broken target path.
        mp3_path = os.path.splitext(wav_path)[0] + '.mp3'
        # -y overwrites existing output; -qscale:a 2 is high-quality VBR.
        result = subprocess.run(
            ['ffmpeg', '-y', '-i', wav_path, '-codec:a', 'libmp3lame', '-qscale:a', '2', mp3_path],
            capture_output=True,
            text=True,
            timeout=60
        )
        if result.returncode == 0 and os.path.exists(mp3_path):
            return mp3_path
        print(f"FFmpeg error: {result.stderr}")
        return None
    except Exception as e:
        # Covers FileNotFoundError (no ffmpeg) and TimeoutExpired.
        print(f"MP3 conversion failed: {e}")
        return None
# ============================================================================
# Gradio Interface
# ============================================================================
# Custom CSS for larger download button (Gradio 6.x compatible).
# Passed to app.launch(); selectors target the Audio component's
# Download/Share buttons by their title / aria-label attributes.
CUSTOM_CSS = """
/* Download button styling for Gradio 6.x */
.gradio-container button[title="Download"],
.gradio-container button[aria-label="Download"],
.gradio-container .audio-component button[title="Download"] {
font-size: 16px !important;
padding: 12px 24px !important;
min-width: 120px !important;
background-color: #4CAF50 !important;
border-radius: 8px !important;
color: white !important;
}
.gradio-container button[title="Download"]:hover,
.gradio-container button[aria-label="Download"]:hover,
.gradio-container .audio-component button[title="Download"]:hover {
background-color: #45a049 !important;
}
/* Share button styling */
.gradio-container button[aria-label="Share"],
.gradio-container .share-button {
font-size: 16px !important;
padding: 12px 24px !important;
background-color: #2196F3 !important;
border-radius: 8px !important;
color: white !important;
}
.gradio-container button[aria-label="Share"]:hover,
.gradio-container .share-button:hover {
background-color: #1976D2 !important;
}
"""
# JavaScript to handle language switching dynamically on the client side.
# Injected via gr.Blocks(head=...); persists the choice in a cookie and
# rewrites visible labels by matching their current text content.
# NOTE(review): the tab selector 'button.svelte-1bfm9j8' depends on a
# generated Svelte class name and may break across Gradio versions — confirm.
LANGUAGE_CHANGE_JS = """
<script>
// Language mappings
const translations = {
en: {
title: "🎵 SolfegeScoreSinger - AI Singing Synthesis",
record_tab: "Record Samples",
upload_tab: "Upload Score",
config_tab: "Configuration",
generate_tab: "Generate & Download",
record_instruction: "Record 7 solfege syllables to clone your voice",
upload_score: "Upload Score (MIDI/MusicXML)",
voice_mode: "Voice Mode",
my_recording: "My Recording",
child_voice: "Child Voice (Built-in)",
solfege_mode: "Solfege Mode",
movable_do: "Movable Do (首调)",
fixed_do: "Fixed Do (固定调)",
denoise: "Enable Denoising",
generate: "Generate Audio",
download: "Download Audio"
},
zh: {
title: "🎵 唱谱歌手 - AI唱谱合成系统",
record_tab: "录制唱名",
upload_tab: "上传乐谱",
config_tab: "参数配置",
generate_tab: "生成下载",
record_instruction: "录制7个唱名以克隆您的音色",
upload_score: "上传乐谱 (MIDI/MusicXML)",
voice_mode: "音色模式",
my_recording: "我的录音",
child_voice: "童声音色 (内置)",
solfege_mode: "唱名法",
movable_do: "首调唱名法",
fixed_do: "固定调唱名法",
denoise: "启用降噪",
generate: "生成音频",
download: "下载音频"
},
ja: {
title: "🎵 ソルフェージュ歌手 - AI歌唱合成",
record_tab: "サンプル録音",
upload_tab: "楽譜アップロード",
config_tab: "設定",
generate_tab: "生成&ダウンロード",
record_instruction: "音色をクローンするために7つのソルフェージュを録音",
upload_score: "楽譜をアップロード (MIDI/MusicXML)",
voice_mode: "音色モード",
my_recording: "マイ録音",
child_voice: "子供の声 (内蔵)",
solfege_mode: "ソルフェージュモード",
movable_do: "移動ド",
fixed_do: "固定ド",
denoise: "ノイズ除去を有効化",
generate: "音声生成",
download: "音声ダウンロード"
}
};
// Get cookie value
function getCookie(name) {
var cookies = document.cookie.split(';');
for (var i = 0; i < cookies.length; i++) {
var cookie = cookies[i].trim();
if (cookie.startsWith(name + '=')) {
return cookie.substring(name.length + 1);
}
}
return 'en'; // default
}
// Set cookie value
function setCookie(name, value, days) {
var date = new Date();
date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000));
document.cookie = name + '=' + value + ';expires=' + date.toUTCString() + ';path=/';
}
// Apply translations
function applyLanguage(langCode) {
var t = translations[langCode];
if (!t) return;
// Update title
var titleElement = document.querySelector('.gradio-container h1');
if (titleElement) {
titleElement.textContent = t.title;
}
// Update tab labels
var tabLabels = document.querySelectorAll('button.svelte-1bfm9j8');
tabLabels.forEach(function(label) {
if (label.textContent.includes('Record Samples') || label.textContent.includes('录制唱名')) {
label.textContent = t.record_tab;
} else if (label.textContent.includes('Upload Score') || label.textContent.includes('上传乐谱')) {
label.textContent = t.upload_tab;
} else if (label.textContent.includes('Configuration') || label.textContent.includes('参数配置')) {
label.textContent = t.config_tab;
} else if (label.textContent.includes('Generate') || label.textContent.includes('生成下载')) {
label.textContent = t.generate_tab;
}
});
// Update instructions
var instructions = document.querySelectorAll('.gradio-container h3');
instructions.forEach(function(elem) {
if (elem.textContent.includes('Record 7') || elem.textContent.includes('录制7')) {
elem.textContent = t.record_instruction;
}
});
}
// Language switch function
function switchLanguage(lang) {
var langCode = 'en';
if (lang === 'English') {
langCode = 'en';
} else if (lang === '中文') {
langCode = 'zh';
} else if (lang === '日本語') {
langCode = 'ja';
}
setCookie('solfege_language', langCode, 365);
applyLanguage(langCode);
}
// Initialize on page load
document.addEventListener('DOMContentLoaded', function() {
var langCode = getCookie('solfege_language');
// Apply initial language
setTimeout(function() {
applyLanguage(langCode);
// Set radio button
var langSelector = document.querySelector('#lang_selector');
if (langSelector) {
var radioButtons = langSelector.querySelectorAll('input[type="radio"]');
radioButtons.forEach(function(radio) {
var label = radio.parentElement.textContent.trim();
if ((langCode === 'en' && label === 'English') ||
(langCode === 'zh' && label === '中文') ||
(langCode === 'ja' && label === '日本語')) {
radio.click();
}
});
}
}, 1000);
});
</script>
"""
def create_interface(lang: str = 'en'):
    """Create the main Gradio interface.

    Args:
        lang: Language code ('en', 'zh', 'ja'). Only affects the initial
            server-side render; subsequent switching happens client-side
            via the injected LANGUAGE_CHANGE_JS.

    Returns:
        The assembled gr.Blocks application (not yet queued or launched).
    """
    # Set language for backend (affects backend-generated messages).
    i18n.set_language(lang)
    texts = get_ui_texts()
    with gr.Blocks(
        title="SolfegeScoreSinger",
        head=LANGUAGE_CHANGE_JS  # Client-side language-switching script
    ) as app:
        # Language selector; the injected JS reacts to this control.
        with gr.Row():
            lang_selector = gr.Radio(
                choices=['English', '中文', '日本語'],
                value='English' if lang == 'en' else ('中文' if lang == 'zh' else '日本語'),
                label='Language / 语言 / 言語',
                interactive=True,
                scale=1,
                elem_id="lang_selector"
            )
        # Title
        title = gr.Markdown(f"# {texts['title']}")
        # CPU warning banner (CPU-Basic Spaces are slow — see time estimate).
        with gr.Row():
            cpu_warning = gr.Markdown(f"⚠️ **{texts['cpu_warning']}**")
        # Main tabs
        with gr.Tabs():
            # Tab 1: Record Samples — one row per solfege syllable.
            with gr.TabItem(texts['record_tab']):
                gr.Markdown(f"### {texts['record_instruction']}")
                solfege_audios = []
                syllables = texts['syllables']
                for i, syllable in enumerate(syllables):
                    with gr.Row():
                        with gr.Column(scale=1):
                            ref_btn = gr.Button(f"🎵 Play {syllable}", size="sm")
                            ref_audio_out = gr.Audio(
                                visible=True,
                                autoplay=True,
                                show_label=False
                            )
                        with gr.Column(scale=3):
                            audio = gr.Audio(
                                sources=["microphone", "upload"],
                                type="filepath",
                                label=syllable,
                                show_label=True
                            )
                    solfege_audios.append(audio)
                    # Reference tone button. The default argument binds the
                    # current syllable, avoiding the late-binding closure pitfall.
                    ref_btn.click(
                        fn=lambda s=syllable.lower(): generate_reference_tone(s),
                        outputs=ref_audio_out
                    )
            # Tab 2: Upload Score
            with gr.TabItem(texts['upload_tab']):
                score_file = gr.File(
                    label=texts['upload_score'],
                    file_types=[".mid", ".midi", ".musicxml", ".mxl", ".xml"]
                )
                with gr.Row():
                    detected_key = gr.Textbox(
                        label="Detected Key / 检测到的调性",
                        interactive=False
                    )
                    estimated_time = gr.Textbox(
                        label="⏱️ Estimated Generation Time",
                        value="Upload a score to see estimate",
                        interactive=False
                    )
                # Manual solfege correction table (filled by on_score_upload).
                gr.Markdown("### Manual Solfege Correction / 手动修正唱名")
                gr.Markdown("AI automatically detects solfege. You can manually correct any errors.")
                solfege_correction = gr.Dataframe(
                    headers=["Note", "Measure", "Beat", "Detected Solfege", "Corrected Solfege"],
                    datatype=["number", "number", "number", "str", "str"],
                    column_count=(5, "fixed"),  # Gradio 6.x: col_count renamed to column_count
                    row_count=20,
                    interactive=[False, False, False, False, True],  # only the correction column is editable
                    label="Solfege Correction Table"
                )
                correct_btn = gr.Button("Apply Corrections / 应用修正", variant="secondary")
            # Tab 3: Configuration
            with gr.TabItem(texts['config_tab']):
                voice_mode = gr.Radio(
                    choices=[
                        texts['my_recording'],
                        texts['child_voice']
                    ],
                    value=texts['my_recording'],
                    label=texts['voice_mode']
                )
                solfege_mode = gr.Radio(
                    choices=[
                        texts['movable_do'],
                        texts['fixed_do']
                    ],
                    value=texts['movable_do'],
                    label=texts['solfege_mode']
                )
                enable_denoise = gr.Checkbox(
                    label=texts['denoise'],
                    value=False,
                    info=texts.get('denoise_note', 'Default: No denoising (fidelity priority)')
                )
            # Tab 4: Generate & Download
            with gr.TabItem(texts['generate_tab']):
                generate_btn = gr.Button(
                    texts['generate'],
                    variant="primary",
                    size="lg"
                )
                task_status = gr.Textbox(
                    label="Task Status",
                    value="Ready to generate",
                    interactive=False
                )
                progress_bar = gr.Textbox(
                    label="Progress",
                    value="",
                    interactive=False,
                    lines=3
                )
                with gr.Row():
                    output_format = gr.Radio(
                        choices=["WAV", "MP3"],
                        value="WAV",
                        label="Output Format / 输出格式",
                        interactive=True,
                        info="Choose audio format for download"
                    )
                    playback_speed = gr.Slider(
                        minimum=0.5,
                        maximum=2.0,
                        value=1.0,
                        step=0.1,
                        label="Playback Speed / 播放速度",
                        info="Speed up or slow down without changing pitch (0.5x - 2.0x)",
                        interactive=True
                    )
                output_audio = gr.Audio(
                    label=texts['download'],
                    type="filepath",
                    interactive=True
                )
        # Event wiring (components from different tabs are referenced here,
        # so this stays at Blocks scope, after all tabs are defined).
        score_file.upload(
            fn=on_score_upload,
            inputs=[score_file, solfege_mode],
            outputs=[detected_key, estimated_time, solfege_correction]
        )
        correct_btn.click(
            fn=apply_solfege_correction,
            inputs=[solfege_correction],
            outputs=[task_status]
        )
        generate_btn.click(
            fn=generate_audio_async_with_format,
            inputs=[
                *solfege_audios,
                score_file,
                voice_mode,
                solfege_mode,
                enable_denoise,
                solfege_correction,
                output_format,
                playback_speed
            ],
            outputs=[output_audio, task_status, progress_bar]
        )
    return app
# ============================================================================
# Main Entry Point
# ============================================================================
if __name__ == "__main__":
    # Create interface with default language (English);
    # language switching is handled by JavaScript on the client side.
    app = create_interface(lang='en')
    # Enable queue for async mode (CRITICAL!): enables SSE-based long
    # connections so multi-hour generations avoid HTTP timeouts.
    app.queue(
        default_concurrency_limit=1,  # CPU: single task at a time
        max_size=20  # Queue size limit
    )
    # Launch
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,  # HF Spaces default port
        share=True,  # Enable public link sharing
        css=CUSTOM_CSS,  # NOTE(review): confirm launch() accepts css/theme in the targeted Gradio version — these are usually gr.Blocks() arguments
        theme=gr.themes.Soft()  # Gradio 6.x: theme moved here (see note above)
    )