melody-expander / app.py
junghoonson's picture
Disable YouTube input on HF Spaces (outbound blocked)
70319eb
"""Melody Expander - Gradio web application.
Separates audio into stems, transcribes melodies, and applies embellishments.
Deployed on HuggingFace Spaces with ZeroGPU support.
"""
import json
import os
import shutil
import sys
import traceback
# Patch gradio_client bug: crashes when JSON schema has additionalProperties=true (bool)
import gradio_client.utils as _gc_utils
_orig_get_type = _gc_utils.get_type
def _patched_get_type(schema):
if isinstance(schema, bool):
return "Any"
return _orig_get_type(schema)
_gc_utils.get_type = _patched_get_type
_orig_json_schema_to_python_type = _gc_utils._json_schema_to_python_type
def _patched_json_schema_to_python_type(schema, defs=None):
if isinstance(schema, bool):
return "Any"
return _orig_json_schema_to_python_type(schema, defs)
_gc_utils._json_schema_to_python_type = _patched_json_schema_to_python_type
import gradio as gr
# Detect HuggingFace Spaces environment
IS_SPACES = os.environ.get("SPACE_ID") is not None
if IS_SPACES:
import spaces
from pipeline.orchestrator import run_separation, run_transcription_and_format
from pipeline.transcriber import NoteEvent, estimate_tempo
from pipeline.formatter import export_all_formats
from embellishments.registry import (
get_style,
list_styles,
get_display_name,
STYLE_DISPLAY_NAMES,
)
from utils.audio_io import validate_audio_file, AudioValidationError
from utils.file_manager import (
create_session_dir,
cleanup_session,
collect_all_files,
package_zip,
)
from utils.music_theory import detect_key, midi_to_note_name
from utils.youtube import download_audio, is_youtube_url, YouTubeError
# ---------------------------------------------------------------------------
# GPU-decorated separation (only active on HF Spaces)
# ---------------------------------------------------------------------------
def _separate_gpu(audio_path: str, output_dir: str):
    """Run stem separation for *audio_path*, writing into *output_dir*.

    Thin wrapper around ``run_separation`` so that the call can be rebound to
    a ``spaces.GPU``-decorated version when running on HuggingFace Spaces.
    Returns whatever ``run_separation`` returns.
    """
    stem_paths = run_separation(audio_path, output_dir)
    return stem_paths


if IS_SPACES:
    # `spaces` is only imported when IS_SPACES is true, so the GPU decorator
    # (duration=120) is applied only in that environment.
    _separate_gpu = spaces.GPU(duration=120)(_separate_gpu)
# ---------------------------------------------------------------------------
# State helpers — use JSON string to avoid Gradio schema bugs with dicts
# ---------------------------------------------------------------------------
def _encode_state(data):
return json.dumps(data)
def _decode_state(state_str):
if not state_str:
return {}
try:
return json.loads(state_str)
except (json.JSONDecodeError, TypeError):
return {}
# ---------------------------------------------------------------------------
# Tab 1: Upload & Separate
# ---------------------------------------------------------------------------
def _stem_summary(stem_name, stem_data):
    """Return the Markdown summary line (note count, tempo, key) for one stem."""
    notes = stem_data["notes"]
    if notes:
        pitch_classes = [note.pitch_midi % 12 for note in notes]
        key_root, key_mode = detect_key(pitch_classes)
        # The root is named via a MIDI note offset by 60 and the last
        # character is dropped (presumably the octave digit — confirm against
        # midi_to_note_name).
        key_str = f"{midi_to_note_name(key_root + 60)[:-1]} {key_mode}"
    else:
        key_str = "N/A"
    return (
        f"**{stem_name.title()}:** {stem_data['note_count']} notes, "
        f"~{stem_data['tempo_bpm']} BPM, key: {key_str}"
    )


def process_audio(audio_file, youtube_url):
    """Run the full pipeline: resolve input, separate stems, transcribe, package.

    Args:
        audio_file: Path of the uploaded audio file, or None.
        youtube_url: YouTube URL text, or empty/None. When non-blank it takes
            precedence over ``audio_file``.

    Returns:
        A 7-tuple in the order the Tab 1 outputs expect: the vocals, drums,
        bass and other stem audio paths (None where a stem has no output), a
        Markdown info string, the list of downloadable file paths, and the
        JSON-encoded pipeline state. On any failure the ``_error_result``
        tuple is returned instead and the session directory is removed.
    """
    url = youtube_url.strip() if isinstance(youtube_url, str) else ""
    if not url and audio_file is None:
        # Reject before creating a session directory that would never be used.
        return _error_result("Please upload an audio file or paste a YouTube URL.")

    session_dir = create_session_dir()
    succeeded = False
    try:
        if url:
            print(f"[melody-expander] Downloading from YouTube: {url}")
            try:
                yt_dir = os.path.join(session_dir, "youtube")
                audio_path = download_audio(url, yt_dir)
                print(f"[melody-expander] Downloaded to: {audio_path}")
            except YouTubeError as e:
                print(f"[melody-expander] YouTube error: {e}")
                return _error_result(f"YouTube error: {e}")
        else:
            audio_path = audio_file
            print(f"[melody-expander] Using uploaded file: {audio_path}")

        # Validate
        try:
            metadata = validate_audio_file(audio_path)
        except AudioValidationError as e:
            print(f"[melody-expander] Validation error: {e}")
            return _error_result(str(e))

        info_text = (
            f"**Input:** {metadata['duration']:.1f}s, "
            f"{metadata['sample_rate']}Hz, "
            f"{metadata['channels']}ch, {metadata['format']}"
        )

        # Phase 1: Separation (GPU)
        print("[melody-expander] Starting stem separation...")
        stems_dir = os.path.join(session_dir, "stems")
        stem_paths = _separate_gpu(audio_path, stems_dir)

        # Phase 2: Transcription + Formatting (CPU)
        print("[melody-expander] Starting transcription...")
        output_dir = os.path.join(session_dir, "output")
        results = run_transcription_and_format(stem_paths, output_dir)

        # Build outputs for UI; the order here fixes the order of the four
        # audio slots in the returned tuple.
        print("[melody-expander] Packaging results...")
        stem_audio_outputs = []
        stem_info_parts = []
        all_download_files = []
        for stem_name in ("vocals", "drums", "bass", "other"):
            stem_data = results["stems"].get(stem_name)
            if not stem_data:
                stem_audio_outputs.append(None)
                stem_info_parts.append(f"**{stem_name.title()}:** No output")
                continue
            stem_audio_outputs.append(stem_data["audio_path"])
            stem_info_parts.append(_stem_summary(stem_name, stem_data))
            all_download_files.extend(stem_data["files"].values())

        # Create ZIP of everything
        zip_path = os.path.join(session_dir, "all_stems.zip")
        all_file_dict = collect_all_files(results)
        package_zip(all_file_dict, zip_path)
        all_download_files.append(zip_path)

        stem_info = info_text + "\n\n" + "\n\n".join(stem_info_parts)

        # Serialize state as JSON string
        state_data = {"session_dir": session_dir, "stems": {}}
        for stem_name, stem_data in results["stems"].items():
            if not stem_data:
                # Same tolerance as the display loop above: skip empty entries
                # rather than failing the whole run on them.
                continue
            state_data["stems"][stem_name] = {
                "notes": [n.to_dict() for n in stem_data["notes"]],
                "tempo_bpm": stem_data["tempo_bpm"],
                "audio_path": stem_data["audio_path"],
                "files": stem_data["files"],
            }
        encoded_state = _encode_state(state_data)

        # Only mark success once nothing else can raise, so the cleanup in
        # `finally` never deletes files that are about to be served.
        succeeded = True
        return (
            *stem_audio_outputs,
            stem_info,
            all_download_files,
            encoded_state,
        )
    except Exception as e:
        # Top-level UI boundary: log the traceback and show a readable error.
        print(f"[melody-expander] EXCEPTION: {e}")
        traceback.print_exc()
        return _error_result(f"Processing failed: {e}")
    finally:
        if not succeeded:
            # Nothing from a failed run is handed to the UI, so remove the
            # session directory instead of leaking it.
            # NOTE(review): utils.file_manager.cleanup_session may be the
            # intended helper here; its signature is not visible in this file.
            shutil.rmtree(session_dir, ignore_errors=True)
def _error_result(msg):
return (None, None, None, None, f"**Error:** {msg}", [], "")
# ---------------------------------------------------------------------------
# Tab 2: Embellish
# ---------------------------------------------------------------------------
def apply_embellishments(state_str, stem_choice, style_choices):
    """Apply the selected embellishment styles, in order, to one stem's notes.

    Args:
        state_str: JSON pipeline state as produced by ``process_audio``.
        stem_choice: Name of the stem to embellish.
        style_choices: Style names to apply; each style receives the output
            of the previous one.

    Returns:
        ``(info_markdown, file_paths)``. ``file_paths`` is an empty list when
        nothing was exported (a validation error, or a stem without notes).
    """
    state_data = _decode_state(state_str)
    if not state_data or "stems" not in state_data:
        return "**Error:** No stems loaded. Process audio first (Tab 1).", []
    if not stem_choice:
        return "**Error:** Select a stem.", []
    if not style_choices:
        return "**Error:** Select at least one embellishment style.", []

    stem_data = state_data["stems"].get(stem_choice)
    if not stem_data:
        return f"**Error:** Stem '{stem_choice}' not found.", []

    notes = [NoteEvent.from_dict(d) for d in stem_data["notes"]]
    tempo_bpm = stem_data["tempo_bpm"]
    if not notes:
        return f"**{stem_choice}** has no pitched content to embellish.", []

    pcs = [n.pitch_midi % 12 for n in notes]
    key_root, key_mode = detect_key(pcs)

    print(f"[melody-expander] Applying embellishments: {style_choices}")
    result_notes = notes
    applied_names = []
    for style_name in style_choices:
        style = get_style(style_name)
        result_notes = style.apply(result_notes, tempo_bpm, key_root, key_mode)
        applied_names.append(get_display_name(style_name))

    print("[melody-expander] Exporting embellished files...")
    # `dict.get(key, default)` evaluates the default eagerly, which created
    # (and orphaned) a fresh session directory on every call. Only create one
    # when the state really has no usable session_dir.
    # NOTE(review): state_str is fed back in from a UI textbox, so session_dir
    # and the stem/style names are caller-supplied — confirm they can be
    # trusted before writing files under them.
    session_dir = state_data.get("session_dir") or create_session_dir()
    emb_dir = os.path.join(session_dir, "embellished", stem_choice)
    suffix = "_".join(style_choices)
    file_paths = export_all_formats(
        result_notes, emb_dir, f"{stem_choice}_{suffix}", tempo_bpm
    )

    info = (
        f"**Embellished {stem_choice.title()}**\n\n"
        f"Styles applied: {', '.join(applied_names)}\n\n"
        f"Original notes: {len(notes)} -> Embellished notes: {len(result_notes)}\n\n"
        f"Key: {midi_to_note_name(key_root + 60)[:-1]} {key_mode}, Tempo: {tempo_bpm} BPM"
    )
    return info, list(file_paths.values())
def get_available_stems(state_str):
    """Build the stem dropdown from the stems in state that have notes.

    Returns a ``gr.Dropdown`` whose choices are the stem names with a
    non-empty ``notes`` entry, preselecting the first one; with no usable
    state the dropdown is empty with no selection.
    """
    state_data = _decode_state(state_str)
    if state_data and "stems" in state_data:
        with_notes = []
        for name, info in state_data["stems"].items():
            if info.get("notes"):
                with_notes.append(name)
        preselected = with_notes[0] if with_notes else None
        return gr.Dropdown(choices=with_notes, value=preselected)
    return gr.Dropdown(choices=[], value=None)
# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
# Markdown body rendered on the "About" tab by build_ui().
ABOUT_TEXT = """
# Melody Expander
**Separate, transcribe, and embellish melodies from any audio.**
## How It Works
1. **Upload** an MP3/WAV file (or paste a YouTube URL) up to 5 minutes long
2. **Separate** into 4 stems: vocals, drums, bass, other (using Demucs v4)
3. **Transcribe** each stem to notes (using Basic Pitch)
4. **Download** as MIDI, MusicXML, or JSON
5. **Embellish** with jazz swing, parallel harmonies, and more
## Technical Details
- **Stem Separation:** Demucs v4 (htdemucs) — hybrid transformer model
- **Transcription:** Basic Pitch by Spotify — lightweight neural MIDI transcription
- **Output Formats:** MIDI (for DAWs), MusicXML (for notation software), JSON (for code)
- **Embellishments:** Rule-based transformations on note events
## Limitations
- Max 5 minutes, 50MB file size
- Transcription quality depends on audio clarity
- Drum transcription shows pitched components only
- YouTube downloads require yt-dlp to be installed
## Credits
Built with [Demucs](https://github.com/facebookresearch/demucs),
[Basic Pitch](https://github.com/spotify/basic-pitch),
[music21](https://web.mit.edu/music21/),
and [Gradio](https://gradio.app/).
"""
def build_ui():
    """Assemble the three-tab Gradio Blocks app and return it.

    Tabs: "Upload & Separate" (runs ``process_audio``), "Embellish" (runs
    ``get_available_stems`` / ``apply_embellishments``) and "About".

    NOTE(review): this source arrived with its indentation flattened, so the
    nesting of rows/columns below is a reconstruction — confirm it against
    the intended layout.
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="Melody Expander") as blocks:
        gr.Markdown("# Melody Expander\nSeparate stems, transcribe melodies, apply embellishments.")

        # Pipeline state travels as a JSON string in a hidden textbox rather
        # than gr.State (avoids Gradio schema introspection bugs).
        pipeline_state = gr.Textbox(visible=False, elem_id="pipeline_state")

        with gr.Tabs():
            # ------ Tab 1: Upload & Separate ------
            with gr.Tab("Upload & Separate"):
                with gr.Row():
                    with gr.Column(scale=1):
                        upload_widget = gr.Audio(
                            type="filepath",
                            label="Upload Audio (MP3/WAV, max 5 min)",
                        )
                        # The URL box is read-only when running on Spaces.
                        url_widget = gr.Textbox(
                            label="Or paste a YouTube URL (local only, not available on Spaces)",
                            placeholder="https://www.youtube.com/watch?v=...",
                            interactive=not IS_SPACES,
                        )
                        run_button = gr.Button("Process", variant="primary", size="lg")
                    with gr.Column(scale=2):
                        info_panel = gr.Markdown(label="Info")

                gr.Markdown("### Separated Stems")
                with gr.Row():
                    vocals_player = gr.Audio(label="Vocals", interactive=False)
                    drums_player = gr.Audio(label="Drums", interactive=False)
                with gr.Row():
                    bass_player = gr.Audio(label="Bass", interactive=False)
                    other_player = gr.Audio(label="Other", interactive=False)

                downloads = gr.File(
                    label="Download Files (MIDI, MusicXML, JSON, ZIP)",
                    file_count="multiple",
                    interactive=False,
                )

                # Order must match the tuple returned by process_audio.
                separation_outputs = [
                    vocals_player,
                    drums_player,
                    bass_player,
                    other_player,
                    info_panel,
                    downloads,
                    pipeline_state,
                ]
                run_button.click(
                    fn=process_audio,
                    inputs=[upload_widget, url_widget],
                    outputs=separation_outputs,
                )

            # ------ Tab 2: Embellish ------
            with gr.Tab("Embellish"):
                gr.Markdown(
                    "Select a stem and embellishment style(s) to transform the melody. "
                    "Process audio in Tab 1 first."
                )
                with gr.Row():
                    stem_picker = gr.Dropdown(
                        label="Stem",
                        choices=[],
                        interactive=True,
                    )
                    refresh_button = gr.Button("Refresh Stems", size="sm")

                style_picker = gr.CheckboxGroup(
                    label="Embellishment Styles",
                    choices=list_styles(),
                )
                apply_button = gr.Button("Apply Embellishments", variant="primary")
                embellish_info = gr.Markdown()
                embellish_files = gr.File(
                    label="Download Embellished Files",
                    file_count="multiple",
                    interactive=False,
                )

                # The stem list is only repopulated when the user clicks refresh.
                refresh_button.click(
                    fn=get_available_stems,
                    inputs=[pipeline_state],
                    outputs=[stem_picker],
                )
                apply_button.click(
                    fn=apply_embellishments,
                    inputs=[pipeline_state, stem_picker, style_picker],
                    outputs=[embellish_info, embellish_files],
                )

            # ------ Tab 3: About ------
            with gr.Tab("About"):
                gr.Markdown(ABOUT_TEXT)

    return blocks
# Build the UI at import time so the Blocks app is available as the
# module-level attribute `demo`.
demo = build_ui()
if __name__ == "__main__":
    # When run as a script, serve on all interfaces at port 7860 with SSR disabled.
    demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)