mrblackdev committed
Commit a2446b2 · verified · 1 Parent(s): bf213f8

Update app.py

Files changed (1):
  app.py +226 -328

app.py CHANGED
@@ -1,367 +1,265 @@
- # app.py - Audio -> Multi-track MIDI (HPSS + Multi-pitch + Clustering)
- # Designed for Hugging Face Spaces (Gradio).
- # Author: AlexGPT (responding to your request)

  import os
  import tempfile
  import traceback
  import numpy as np
  import librosa
  import pretty_midi
  import gradio as gr
- from sklearn.cluster import AgglomerativeClustering

- # ---------- Config ----------
- A440 = 440.0

- # ---------- Utilities ----------
- def hz_to_midi(f):
-     """Return float MIDI number or np.nan for invalid f."""
      try:
-         if f is None or np.isnan(f) or f <= 0:
-             return np.nan
-         return 69 + 12 * np.log2(f / A440)
-     except Exception:
-         return np.nan
-
- def safe_median_filter(data, size=3):
-     """Median filter forcing float64 to avoid scipy errors; fallback to identity."""
-     try:
-         from scipy.ndimage import median_filter
-         arr = np.asarray(data)
-         if arr.dtype != np.float64:
-             arr = arr.astype(np.float64)
-         return median_filter(arr, size=size)
-     except Exception as e:
-         print("median_filter fallback:", e)
-         return np.asarray(data, dtype=np.float64)
-
- def round_to_grid(seconds, bpm, division=4):
-     if bpm <= 0:
-         return seconds
-     beat = 60.0 / bpm
-     grid = beat / division
-     ticks = np.round(seconds / grid)
-     return ticks * grid
-
- # ---------- Signal separation & percussive detection ----------
- def separate_harmonic_percussive(y):
-     """HPSS separation; returns (harmonic, percussive). If it fails, return (y, zeros)."""
-     try:
-         y_h, y_p = librosa.effects.hpss(y)
-         return y_h, y_p
      except Exception as e:
-         print("HPSS fallback:", e)
-         return y, np.zeros_like(y)

- def detect_percussive_hits(y_p, sr, backtrack=False):
-     """
-     Detect percussive onsets and map them to simple drum MIDI notes.
-     Returns list of (time_seconds, midi_note).
-     Heuristics: use spectral centroid & onset energy to classify kick/snare/hihat.
-     """
-     try:
-         onset_env = librosa.onset.onset_strength(y=y_p, sr=sr)
-         onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, backtrack=backtrack)
-         hits = []
-         if len(onsets) == 0:
-             return hits
-         S = np.abs(librosa.stft(y_p, n_fft=2048))
-         for fr in onsets:
-             t = float(librosa.frames_to_time(fr, sr=sr))
-             # spectral centroid around the frame (safe slicing)
-             start = max(0, fr - 2)
-             end = min(fr + 3, S.shape[1] - 1)
-             try:
-                 centroid = np.mean(librosa.feature.spectral_centroid(S=S[:, start:end+1], sr=sr))
-             except Exception:
-                 centroid = 0.0
-             # Simple heuristic:
-             # low centroid -> kick, medium -> snare, high -> hihat
-             if centroid < 1500:
-                 midi_note = 36  # Kick
-             elif centroid < 3500:
-                 midi_note = 38  # Acoustic snare
-             else:
-                 midi_note = 42  # Closed hi-hat
-             hits.append((t, midi_note))
-         return hits
-     except Exception as e:
-         print("Percussive detection error:", e)
-         return []

- # ---------- Multi-pitch extraction ----------
- def extract_multi_pitches(y_h, sr, hop_length=256, top_n=3, min_confidence=0.08):
-     """
-     Use piptrack to extract candidate pitches per frame.
-     Returns list of (time_seconds, freq_hz).
-     """
      try:
-         S = np.abs(librosa.stft(y_h, n_fft=2048, hop_length=hop_length))
-         pitches, mags = librosa.piptrack(S=S, sr=sr, hop_length=hop_length)
-         times = librosa.frames_to_time(np.arange(pitches.shape[1]), sr=sr, hop_length=hop_length)
-         candidates = []
-         for i in range(pitches.shape[1]):
-             col_p = pitches[:, i]
-             col_m = mags[:, i]
-             if np.max(col_m) <= 0:
-                 continue
-             # pick top_n bins by magnitude
-             idx = np.argsort(col_m)[-top_n:]
-             max_col = np.max(col_m)
-             for k in idx:
-                 if col_m[k] > 0 and col_m[k] >= min_confidence * max_col:
-                     candidates.append((times[i], float(col_p[k])))
-         # filter zeros & NaNs
-         candidates = [(t, p) for (t, p) in candidates if p is not None and p > 0 and not np.isnan(p)]
-         return candidates
-     except Exception as e:
-         print("extract_multi_pitches error:", e)
-         return []

- # ---------- Clustering / track formation ----------
- def cluster_pitch_trajectories(candidates, max_voices=4):
      """
-     Cluster candidate (time, pitch) pairs into trajectories representing voices/instruments.
-     Returns list of tracks; each track is a sorted list of (time, freq_hz).
      """
-     if not candidates:
-         return []
      try:
-         X = np.array([[t, hz_to_midi(h)] for (t, h) in candidates], dtype=np.float64)
-         # Normalize columns
-         Xn = X.copy()
-         if Xn[:, 0].ptp() > 1e-9:
-             Xn[:, 0] = (Xn[:, 0] - Xn[:, 0].min()) / (Xn[:, 0].ptp())
-         else:
-             Xn[:, 0] = 0.0
-         if Xn[:, 1].ptp() > 1e-9:
-             Xn[:, 1] = (Xn[:, 1] - Xn[:, 1].min()) / (Xn[:, 1].ptp())
-         else:
-             Xn[:, 1] = 0.0
-         n_clusters = min(max_voices, max(1, int(np.unique(np.round(Xn, 3), axis=0).shape[0])))
-         if n_clusters <= 1:
-             labels = np.zeros(len(Xn), dtype=int)
-         else:
-             clustering = AgglomerativeClustering(n_clusters=n_clusters).fit(Xn)
-             labels = clustering.labels_
-         tracks = []
-         for lab in range(int(labels.max()) + 1):
-             idxs = np.where(labels == lab)[0]
-             if len(idxs) == 0:
-                 continue
-             pts = [(float(X[i, 0]), float(X[i, 1])) for i in idxs]
-             # convert midi values back to hz for smoothing/processing (midi -> hz)
-             pts_hz = [(t, A440 * (2 ** ((m - 69) / 12))) for (t, m) in pts]
-             pts_sorted = sorted(pts_hz, key=lambda x: x[0])
-             tracks.append(pts_sorted)
-         return tracks
      except Exception as e:
-         print("cluster_pitch_trajectories error:", e)
-         return []

- def trajectories_to_notes(tracks, hop_length, sr, min_note_ms=80):
      """
-     Convert each trajectory (time, freq) to notes (midi_int, start, end).
-     Groups consecutive equal rounded-midis and enforces minimum duration.
      """
-     notes = []
-     for tr in tracks:
-         if not tr:
-             continue
-         times = np.array([t for t, _ in tr])
-         freqs = np.array([f for _, f in tr])
-         # Smooth frequencies
-         freqs_s = safe_median_filter(freqs.astype(np.float64), size=3)
-         midis = np.round([hz_to_midi(f) for f in freqs_s])
-         # Group consecutive equal midis
-         i = 0
-         n = len(midis)
-         frame_ms = 1000.0 * hop_length / sr
-         min_frames = max(1, int(np.ceil(min_note_ms / frame_ms)))
-         while i < n:
-             j = i + 1
-             while j < n and midis[j] == midis[i]:
-                 j += 1
-             if (j - i) >= min_frames and not np.isnan(midis[i]):
-                 t0 = float(times[i])
-                 t1 = float(times[j - 1] + hop_length / sr)
-                 notes.append((int(midis[i]), t0, t1))
-             i = j
-     return notes
-
- # ---------- Main multi-instrument conversion ----------
- def audio_to_midi_multi(
-     audio,
-     hop_length=256,
-     frame_length=2048,
-     max_voices=3,
-     percussive=True,
-     bpm=120,
-     quantize=True,
-     division=4,
-     velocity=100,
-     program_map=None,
-     top_n=4,
-     min_confidence=0.10,
-     min_note_ms=80,
- ):
-     """
-     Full pipeline:
-     - load audio
-     - HPSS
-     - detect percussive hits -> drum track
-     - extract multi-pitch candidates from harmonic part
-     - cluster candidates into tracks (voices)
-     - convert tracks to MIDI notes and split into separate instruments by pitch ranges
-     """
-     try:
-         # Load audio
-         if isinstance(audio, tuple):
-             sr, y = audio
-             y = np.array(y, dtype=np.float32)
-         else:
-             y, sr = librosa.load(audio, sr=None, mono=True)
-         if y.size == 0:
-             raise ValueError("Empty audio")
-         # normalize
-         if np.max(np.abs(y)) > 0:
-             y = y / np.max(np.abs(y))
-
-         # HPSS
-         y_h, y_p = separate_harmonic_percussive(y)
-
-         pm = pretty_midi.PrettyMIDI()
-
-         # Percussion track
-         if percussive:
-             hits = detect_percussive_hits(y_p, sr)
-             if hits:
-                 drum_inst = pretty_midi.Instrument(program=0, is_drum=True)
-                 for t, midi_note in hits:
-                     # tiny duration for hits
-                     drum_inst.notes.append(pretty_midi.Note(velocity=int(velocity), pitch=int(midi_note),
-                                                             start=float(t), end=float(t + 0.05)))
-                 pm.instruments.append(drum_inst)
-
-         # Harmonic: multi-pitch extraction
-         candidates = extract_multi_pitches(y_h, sr, hop_length=hop_length, top_n=top_n, min_confidence=min_confidence)
-         tracks = cluster_pitch_trajectories(candidates, max_voices=max_voices)
-         notes = trajectories_to_notes(tracks, hop_length=hop_length, sr=sr, min_note_ms=min_note_ms)
-
-         # If we have notes, split by pitch quantiles into up to max_voices instrument tracks.
-         if notes:
-             midi_vals = np.array([n[0] for n in notes])
-             unique = np.unique(midi_vals)
-             groups = int(min(max_voices, max(1, len(unique))))
-             edges = np.quantile(midi_vals, np.linspace(0, 1, groups + 1))
-             for g in range(groups):
-                 program = program_map[g] if (program_map and g < len(program_map)) else 0
-                 inst = pretty_midi.Instrument(program=int(program))
-                 low = edges[g]
-                 high = edges[g + 1]
-                 for m, t0, t1 in notes:
-                     if m >= low - 0.0001 and m <= high + 0.0001:
-                         inst.notes.append(pretty_midi.Note(velocity=int(velocity), pitch=int(m), start=float(t0),
-                                                            end=float(t1)))
-                 # Only append instruments that have notes
-                 if len(inst.notes) > 0:
                      pm.instruments.append(inst)

-         # Quantize to grid if requested
-         if quantize and bpm > 0:
-             for instr in pm.instruments:
-                 for note in instr.notes:
-                     note.start = float(round_to_grid(note.start, bpm, division))
-                     note.end = float(round_to_grid(note.end, bpm, division))
-                     if note.end <= note.start:
-                         note.end = note.start + (60.0 / bpm) / division
-
-         # Save MIDI
-         tmpdir = tempfile.mkdtemp()
-         midi_path = os.path.join(tmpdir, "multi_output.mid")
-         pm.write(midi_path)
-
-         summary = {
-             "duration_s": round(len(y) / sr, 3),
-             "instruments": len(pm.instruments),
-             "notes_total": sum(len(i.notes) for i in pm.instruments),
-             "bpm": bpm,
-             "voices_requested": max_voices,
-         }
-         return midi_path, summary
-
-     except Exception as e:
-         traceback.print_exc()
-         raise

  # ---------- Gradio UI ----------
  CSS = """
- #app_title {font-size: 28px; font-weight: 800}
  #app_subtitle {opacity: .8}
  """

- with gr.Blocks(css=CSS, title="Audio Multi-MIDI (AlexGPT)") as demo:
-     gr.Markdown("<div id='app_title'>🎤 Audio 🎹 MIDI (Polyphonic & Multi-instrument)</div>"
-                 "<div id='app_subtitle'>HPSS + Multi-pitch + Clustering multi-track MIDI</div>")
-
      with gr.Row():
          with gr.Column(scale=2):
-             audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input audio (mono/mix)")
-             with gr.Accordion("Extraction / Separation", open=False):
-                 hop = gr.Slider(128, 1024, value=256, step=64, label="Hop length (samples)")
-                 frame = gr.Slider(1024, 4096, value=2048, step=256, label="Frame length (samples)")
-                 max_voices = gr.Slider(1, 6, value=3, step=1, label="Max voices (clusters)")
-                 percussive = gr.Checkbox(value=True, label="Detect percussion (HPSS)")
-                 topn = gr.Slider(1, 8, value=4, step=1, label="Peaks per frame (top N)")
-                 min_conf = gr.Slider(0.01, 0.5, value=0.1, step=0.01, label="Relative confidence threshold")
-                 min_note_ms = gr.Slider(10, 500, value=80, step=10, label="Minimum note duration (ms)")
-
-             with gr.Accordion("MIDI output", open=True):
-                 do_quant = gr.Checkbox(value=True, label="Quantize to grid")
-                 bpm = gr.Slider(40, 220, value=120, step=1, label="BPM")
-                 division = gr.Dropdown([1, 2, 4, 8, 16], value=4, label="Divisions per quarter note (1=quarter, 4=sixteenth)")
-                 velocity = gr.Slider(1, 127, value=100, step=1, label="Velocity (1-127)")
-                 # program_map not editable in the UI for simplicity; advanced: add dynamic inputs
-
-             run_btn = gr.Button("🔄 Convert to MIDI", variant="primary")
-
          with gr.Column(scale=1):
-             midi_out = gr.File(label="Generated MIDI file")
-             summary_out = gr.JSON(label="Summary")
-             gr.Markdown(
-                 "**Tips**\n\n"
-                 "- This method is heuristic: the best results come from mixes with clear instruments and little reverb.\n"
-                 "- To separate real tracks (vocals, synth, bass), run a source-separation model (Demucs/Spleeter) before the analysis.\n"
-                 "- Set `Max voices` to the approximate number of melodic instruments.\n"
-             )
-
-     def _convert(audio_path, hop_length, frame_length, max_voices_val, percussive_val, topn_val,
-                  do_quantize, bpm_val, division_val, velocity_val, min_conf_val, min_note_ms_val):
          try:
-             midi_path, summary = audio_to_midi_multi(
-                 audio=audio_path,
-                 hop_length=int(hop_length),
-                 frame_length=int(frame_length),
-                 max_voices=int(max_voices_val),
-                 percussive=bool(percussive_val),
-                 bpm=float(bpm_val),
-                 quantize=bool(do_quantize),
-                 division=int(division_val),
-                 velocity=int(velocity_val),
-                 top_n=int(topn_val),
-                 min_confidence=float(min_conf_val),
-                 min_note_ms=int(min_note_ms_val),
-             )
-             return midi_path, summary
          except Exception as e:
-             return gr.update(value=None), {"error": str(e)}
-
-     run_btn.click(
-         _convert,
-         inputs=[audio_in, hop, frame, max_voices, percussive, topn, do_quant, bpm, division, velocity, min_conf, min_note_ms],
-         outputs=[midi_out, summary_out],
-     )

  if __name__ == "__main__":
      demo.launch()

+ # app.py - Demucs + Basic-Pitch pipeline -> multi-track MIDI (Gradio)
+ # Author: AlexGPT
+ # WARNING: heavy deps (demucs, basic-pitch, torch, tensorflow). Use a beefy Space or local env.
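+ # Suggested packages (an assumption; pin versions as needed):
+ #   demucs, basic-pitch, tensorflow, torch, gradio, librosa, pretty_midi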

  import os
  import tempfile
+ import shutil
+ import subprocess
  import traceback
  import numpy as np
  import librosa
  import pretty_midi
  import gradio as gr

+ # Optional heavy imports: set feature flags for demucs and basic-pitch availability
+ HAS_DEMUCS = False
+ HAS_BASIC_PITCH = False
+ DEMUCS_MODEL_NAME = "htdemucs_ft"  # reasonable default
+ try:
+     import demucs  # noqa: F401
+     HAS_DEMUCS = True
+ except Exception:
+     HAS_DEMUCS = False

+ try:
+     # basic_pitch usage per README: import predict + load saved model
+     import tensorflow as tf  # basic-pitch uses a TF saved_model
+     from basic_pitch.inference import predict
+     from basic_pitch import ICASSP_2022_MODEL_PATH
+     # load model once (this may be heavy)
      try:
+         BASIC_PITCH_MODEL = tf.saved_model.load(str(ICASSP_2022_MODEL_PATH))
+         HAS_BASIC_PITCH = True
      except Exception as e:
+         print("Could not load Basic-Pitch saved model:", e)
+         HAS_BASIC_PITCH = False
+ except Exception as e:
+     print("basic-pitch/TensorFlow not available:", e)
+     HAS_BASIC_PITCH = False

+ # Fallback simple pipeline (librosa-based) in case the heavy libs are missing
+ def librosa_mono_pitch_to_midi(audio_path, hop_length=256, frame_length=2048, bpm=120, quantize=True, division=4):
+     y, sr = librosa.load(audio_path, sr=None, mono=True)
+     if np.max(np.abs(y)) > 0:
+         y = y / np.max(np.abs(y))
+     f0, voiced_flag, _ = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'),
+                                       sr=sr, frame_length=frame_length, hop_length=hop_length)
+     f0[~voiced_flag] = np.nan
+     # group frames into notes (simple run-length grouping)
+     times = np.arange(len(f0)) * hop_length / sr
+     midi_vals = np.array([69 + 12 * np.log2(v / 440.0) if (v is not None and not np.isnan(v) and v > 0) else np.nan for v in f0])
+     notes = []
+     i = 0
+     while i < len(midi_vals):
+         if np.isnan(midi_vals[i]):
+             i += 1
+             continue
+         v = int(round(midi_vals[i]))
+         start = i
+         j = i + 1
+         while j < len(midi_vals) and not np.isnan(midi_vals[j]) and int(round(midi_vals[j])) == v:
+             j += 1
+         t0 = times[start]
+         t1 = times[j - 1] + hop_length / sr
+         notes.append((v, float(t0), float(t1)))
+         i = j
+     pm = pretty_midi.PrettyMIDI()
+     inst = pretty_midi.Instrument(program=0)
+     for m, t0, t1 in notes:
+         inst.notes.append(pretty_midi.Note(velocity=90, pitch=int(m), start=t0, end=t1))
+     pm.instruments.append(inst)
+     tmpdir = tempfile.mkdtemp()
+     out = os.path.join(tmpdir, "fallback.mid")
+     pm.write(out)
+     return out, {"engine": "librosa_pyin", "notes": len(notes)}

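+ # Note: pyin is monophonic - on a polyphonic stem this fallback keeps only the
+ # dominant melodic line, so expect far fewer notes than with Basic-Pitch.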

+ # Utility: run the demucs CLI to separate stems
+ def demucs_separate_cli(audio_path, model_name=DEMUCS_MODEL_NAME):
+     # demucs CLI: demucs -n <model> -o <output_dir> audio.wav
+     out_root = tempfile.mkdtemp()
+     cmd = ["demucs", "-n", model_name, "-o", out_root, audio_path]
      try:
+         subprocess.run(cmd, capture_output=True, text=True, check=True)
+     except FileNotFoundError:
+         # demucs not installed
+         raise RuntimeError("demucs CLI not found. Please install demucs in the environment.")
+     except subprocess.CalledProcessError as e:
+         raise RuntimeError(f"Demucs separation failed: {e.stderr or e.stdout}")
+     # demucs writes stems under <out_root>/<model_name>/<track_name>/;
+     # walk the tree and take the first directory that contains WAV files
+     stems_dir = None
+     for root, dirs, files in os.walk(out_root):
+         if any(f.endswith(".wav") for f in files):
+             stems_dir = root
+             break
+     if stems_dir is None:
+         raise RuntimeError(f"demucs did not produce stems under {out_root}")
+     # expected stem names: vocals.wav, drums.wav, bass.wav, other.wav (depending on model)
+     return stems_dir

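+ # For reference, a typical output tree (an assumption - the exact layout varies
+ # by demucs version, which is why we walk the tree instead of hard-coding it):
+ #   <out_root>/htdemucs_ft/<track_name>/vocals.wav
+ #   <out_root>/htdemucs_ft/<track_name>/drums.wav
+ #   <out_root>/htdemucs_ft/<track_name>/bass.wav
+ #   <out_root>/htdemucs_ft/<track_name>/other.wav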

+ # Utility: run Basic-Pitch inference on a given WAV file
+ def basic_pitch_transcribe(wav_path, model_obj=None):
      """
+     Uses basic_pitch.inference.predict(audio_path, model_or_model_path), which in
+     current basic-pitch releases returns a (model_output, midi_data, note_events)
+     tuple; note_events is a list of (start_s, end_s, midi_pitch, amplitude, pitch_bends).
      """
+     if not HAS_BASIC_PITCH:
+         raise RuntimeError("basic-pitch is not available in this environment.")
+     # default thresholds: see the basic-pitch inference API
      try:
+         # a filesystem path also works as model_or_model_path
+         model = model_obj if model_obj is not None else BASIC_PITCH_MODEL
+         _, _, note_events = predict(wav_path, model)
+         # convert note events into a pretty_midi instrument, mapping amplitude to velocity
+         inst = pretty_midi.Instrument(program=0)
+         for ev in note_events:
+             start, end, pitch, amplitude = ev[0], ev[1], ev[2], ev[3]
+             vel = int(np.clip(round(amplitude * 127), 1, 127))
+             inst.notes.append(pretty_midi.Note(velocity=vel, pitch=int(pitch), start=float(start), end=float(end)))
+         return inst, {"notes_count": len(inst.notes)}
      except Exception as e:
+         # surface the failure with context
+         raise RuntimeError(f"basic_pitch prediction failed: {e}")

+ # Merge per-stem transcriptions into a single PrettyMIDI object
+ def merge_stems_to_midi(stem_paths, use_basic_pitch=True):
      """
+     stem_paths: dict {stem_name: path_wav}
+     For each stem:
+       - if basic-pitch is available, transcribe with it (polyphonic)
+       - else fall back to librosa pyin per stem
+     Returns (path_to_midi, summary).
      """
+     pm = pretty_midi.PrettyMIDI()
+     summary = {"stems": {}, "engine": "mixed"}
+     for stem_name, path in stem_paths.items():
+         try:
+             if use_basic_pitch and HAS_BASIC_PITCH:
+                 inst, info = basic_pitch_transcribe(path)
+                 # assign instrument programs heuristically (bass -> 32, vocals -> 54, drums -> drum channel)
+                 if stem_name.lower().startswith("drum"):
+                     # drums: pretty_midi drum notes are normal notes with is_drum set at the instrument level
+                     drum_inst = pretty_midi.Instrument(program=0, is_drum=True)
+                     # copy notes from inst as hits
+                     for n in inst.notes:
+                         drum_inst.notes.append(pretty_midi.Note(velocity=n.velocity, pitch=n.pitch, start=n.start, end=n.end))
+                     pm.instruments.append(drum_inst)
+                 else:
+                     # set the program per stem (simple heuristics)
+                     program = 0
+                     if "bass" in stem_name.lower():
+                         program = 32  # acoustic bass
+                     elif "voc" in stem_name.lower():
+                         program = 54  # synth voice lead (as an example)
+                     inst.program = int(program)
                      pm.instruments.append(inst)
+                 summary["stems"][stem_name] = {"notes": info.get("notes_count", 0), "engine": "basic_pitch"}
+             else:
+                 # fallback per stem: librosa pyin, then load the resulting MIDI and append its tracks
+                 out, info = librosa_mono_pitch_to_midi(path)
+                 midi = pretty_midi.PrettyMIDI(out)
+                 # program heuristics
+                 for inst in midi.instruments:
+                     if "drum" in stem_name.lower():
+                         inst.is_drum = True
+                     if "bass" in stem_name.lower():
+                         inst.program = 32
+                     pm.instruments.append(inst)
+                 summary["stems"][stem_name] = {"notes": info.get("notes", 0), "engine": "librosa_fallback"}
+         except Exception as e:
+             # record the error but continue with the remaining stems
+             summary["stems"][stem_name] = {"error": str(e)}
+     # write the combined MIDI
+     tmpdir = tempfile.mkdtemp()
+     out_midi = os.path.join(tmpdir, "separated_multi.mid")
+     pm.write(out_midi)
+     summary["instruments"] = len(pm.instruments)
+     summary["notes_total"] = sum(len(inst.notes) for inst in pm.instruments)
+     return out_midi, summary

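+ # Example summary shape (illustrative values):
+ #   {"stems": {"vocals": {"notes": 120, "engine": "basic_pitch"}, ...},
+ #    "engine": "mixed", "instruments": 4, "notes_total": 480}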

+ # High-level pipeline: separate -> transcribe each stem -> merge
+ def full_pipeline(audio_filepath, demucs_model=DEMUCS_MODEL_NAME, use_basic_pitch=True):
+     # 1) Demucs separation
+     if HAS_DEMUCS:
+         try:
+             stems_dir = demucs_separate_cli(audio_filepath, model_name=demucs_model)
+             # collect the typical stems
+             available = {}
+             for name in os.listdir(stems_dir):
+                 if name.endswith(".wav"):
+                     stem_name = os.path.splitext(name)[0]
+                     available[stem_name] = os.path.join(stems_dir, name)
+             # if demucs nested the stems one level deeper, search recursively
+             if not available:
+                 for root, dirs, files in os.walk(stems_dir):
+                     for f in files:
+                         if f.endswith(".wav"):
+                             available[os.path.splitext(f)[0]] = os.path.join(root, f)
+             if not available:
+                 raise RuntimeError("No stems found after Demucs separation.")
+             # 2) transcribe each stem and merge
+             midi_path, summary = merge_stems_to_midi(available, use_basic_pitch=use_basic_pitch)
+             return midi_path, {"demucs_model": demucs_model, **summary}
+         except Exception as e:
+             traceback.print_exc()
+             # fall back to the mono approach
+             print("Demucs pipeline failed, falling back to librosa mono pipeline:", e)
+             return librosa_mono_pitch_to_midi(audio_filepath)
+     else:
+         # demucs not available: transcribe the full mix (basic-pitch if available)
+         if use_basic_pitch and HAS_BASIC_PITCH:
+             try:
+                 inst, info = basic_pitch_transcribe(audio_filepath)
+                 pm = pretty_midi.PrettyMIDI()
+                 inst.program = 0
+                 pm.instruments.append(inst)
+                 tmpdir = tempfile.mkdtemp()
+                 out = os.path.join(tmpdir, "basicpitch_full.mid")
+                 pm.write(out)
+                 return out, {"engine": "basic_pitch_full", "notes": info.get("notes_count", 0)}
+             except Exception as e:
+                 print("basic-pitch on full mix failed:", e)
+         # final fallback
+         return librosa_mono_pitch_to_midi(audio_filepath)

  # ---------- Gradio UI ----------
  CSS = """
+ #app_title {font-size: 26px; font-weight: 800}
  #app_subtitle {opacity: .8}
  """

+ with gr.Blocks(css=CSS, title="Demucs + BasicPitch -> Multi-MIDI") as demo:
+     gr.Markdown("<div id='app_title'>🔊 Separate & Transcribe Multi-track MIDI</div>"
+                 "<div id='app_subtitle'>Demucs (stems) + Basic-Pitch (polyphonic) pipeline. Fallbacks included.</div>")

      with gr.Row():
          with gr.Column(scale=2):
+             audio_in = gr.Audio(sources=["upload"], type="filepath", label="Audio (mix) - WAV/MP3")
+             demucs_model = gr.Dropdown(["htdemucs_ft", "htdemucs", "htdemucs_6s", "mdx", "mdx_extra"], value=DEMUCS_MODEL_NAME, label="Demucs model")
+             use_basic = gr.Checkbox(value=True, label="Use Basic-Pitch for stems (if available)")
+             run_btn = gr.Button("🚀 Run pipeline")
          with gr.Column(scale=1):
+             midi_out = gr.File(label="MIDI output")
+             log_out = gr.Textbox(label="Summary / Log", lines=12)

+     def run_pipeline(audio_path, demucs_model_name, use_basic_bool):
          try:
+             midi_path, summary = full_pipeline(audio_path, demucs_model=demucs_model_name, use_basic_pitch=use_basic_bool)
+             return midi_path, str(summary)
          except Exception as e:
+             tb = traceback.format_exc()
+             return None, f"Error: {e}\n\nTrace:\n{tb}"

+     run_btn.click(run_pipeline, inputs=[audio_in, demucs_model, use_basic], outputs=[midi_out, log_out])

  if __name__ == "__main__":
      demo.launch()
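
For local testing outside the Space, a minimal smoke test of the new pipeline might look like this (a sketch; local_test.py and test.wav are hypothetical, and the fallbacks inside full_pipeline cover a machine without demucs or basic-pitch installed):

# local_test.py (hypothetical helper, assumes app.py above is importable as `app`)
from app import full_pipeline

midi_path, summary = full_pipeline("test.wav", demucs_model="htdemucs_ft", use_basic_pitch=True)
print("MIDI written to:", midi_path)
print("Summary:", summary)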