Spaces: Runtime error
Update app.py
app.py CHANGED
Old version (removed lines are marked "-"; "…" marks text truncated in the page):

@@ -1,29 +1,40 @@
 import os
 import tempfile
 import numpy as np
 import librosa
 import pretty_midi
 import gradio as gr

 A440 = 440.0

-…
 def hz_to_midi(f):
-…
         return np.nan
-    return 69 + 12 * np.log2(f / A440)

 def safe_median_filter(data, size=3):
     try:
         from scipy.ndimage import median_filter
-…
     except Exception as e:
-        print("…
-        return data

 def round_to_grid(seconds, bpm, division=4):
     if bpm <= 0:

@@ -33,166 +44,324 @@ def round_to_grid(seconds, bpm, division=4):
     ticks = np.round(seconds / grid)
     return ticks * grid

-def …
-…
             continue
-…
             j += 1
-…
-        if (j - start) >= min_frames:
-            t0, t1 = times[start], times[j - 1] + hop_length / sr
-            notes.append((note_val, t0, t1))
-        i = j + 1
     return notes

-def …
-…
     try:
         if isinstance(audio, tuple):
             sr, y = audio
             y = np.array(y, dtype=np.float32)
         else:
             y, sr = librosa.load(audio, sr=None, mono=True)
         if np.max(np.abs(y)) > 0:
             y = y / np.max(np.abs(y))
-    except Exception as e:
-        raise RuntimeError(f"Error al cargar audio: {e}")

-…
     except Exception as e:
-…
-        raise RuntimeError("No se detectaron notas. Ajusta parámetros o usa audio más claro.")

-    if quantize and bpm > 0:
-        q_notes = []
-        for m, t0, t1 in notes:
-            qt0, qt1 = round_to_grid(t0, bpm, division), round_to_grid(t1, bpm, division)
-            if qt1 <= qt0:
-                qt1 = qt0 + (60.0 / bpm) / division
-            q_notes.append((m, qt0, qt1))
-        notes = q_notes

-    pm = pretty_midi.PrettyMIDI()
-    instrument = pretty_midi.Instrument(program=program)
-    for m, t0, t1 in notes:
-        v = int(np.clip(velocity, 1, 127))
-        instrument.notes.append(pretty_midi.Note(velocity=v, pitch=int(m), start=float(t0), end=float(t1)))
-    pm.instruments.append(instrument)

-    tmpdir = tempfile.mkdtemp()
-    midi_path = os.path.join(tmpdir, "output.mid")
-    pm.write(midi_path)

-    summary = {
-        "duracion_audio_s": round(len(y) / sr, 3),
-        "notas_detectadas": len(notes),
-        "rango_midi_min": int(np.min([n[0] for n in notes])) if notes else None,
-        "rango_midi_max": int(np.max([n[0] for n in notes])) if notes else None,
-        "bpm": bpm,
-        "division": division,
-    }
-    return midi_path, summary

-# Gradio interface
 CSS = """
 #app_title {font-size: 28px; font-weight: 800}
 #app_subtitle {opacity: .8}
 """

-with gr.Blocks(css=CSS, …
-    gr.Markdown(""…
-        <div id='app_subtitle'>Sube o graba tu voz, detecta notas y exporta un archivo MIDI listo para tu DAW.</div>
-    """)

     with gr.Row():
         with gr.Column(scale=2):
-            audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio de entrada (…
-            with gr.Accordion("…
-…
-            with gr.Accordion("…
                 do_quant = gr.Checkbox(value=True, label="Cuantizar a rejilla")
-                bpm = gr.Slider(40, 220, value=…
-                division = gr.Dropdown([2, 4, 8], value=4, label="División por negra")
-                velocity = gr.Slider(1, 127, value=…

             run_btn = gr.Button("🔄 Convertir a MIDI", variant="primary")

         with gr.Column(scale=1):
             midi_out = gr.File(label="Archivo MIDI generado")
             summary_out = gr.JSON(label="Resumen")
-            gr.Markdown(…

-    def _convert(audio_path, …
         try:
-…
                 audio=audio_path,
-                fmin_note=fmin_note,
-                fmax_note=fmax_note,
                 hop_length=int(hop_length),
                 frame_length=int(frame_length),
-…
-                merge_gap_ms=int(gap_join_ms),
                 bpm=float(bpm_val),
                 quantize=bool(do_quantize),
                 division=int(division_val),
                 velocity=int(velocity_val),
             )
         except Exception as e:
-…

-    run_btn.click(…

 if __name__ == "__main__":
-    demo.launch()
New version (full file after the change):

# app.py - Audio -> Multi-track MIDI (HPSS + Multi-pitch + Clustering)
# Designed for Hugging Face Spaces (Gradio).
# Author: AlexGPT (responding to your request)

import os
import tempfile
import traceback
import numpy as np
import librosa
import pretty_midi
import gradio as gr
from sklearn.cluster import AgglomerativeClustering

# ---------- Config ----------
A440 = 440.0

# ---------- Utilities ----------
def hz_to_midi(f):
    """Return float MIDI number or np.nan for invalid f."""
    try:
        if f is None or np.isnan(f) or f <= 0:
            return np.nan
        return 69 + 12 * np.log2(f / A440)
    except Exception:
        return np.nan

def safe_median_filter(data, size=3):
    """Median filter forcing float64 to avoid scipy errors; fallback to identity."""
    try:
        from scipy.ndimage import median_filter
        arr = np.asarray(data)
        if arr.dtype != np.float64:
            arr = arr.astype(np.float64)
        return median_filter(arr, size=size)
    except Exception as e:
        print("median_filter fallback:", e)
        return np.asarray(data, dtype=np.float64)

def round_to_grid(seconds, bpm, division=4):
    if bpm <= 0:
    …  # unchanged lines hidden by the diff view (where grid is computed)
    ticks = np.round(seconds / grid)
    return ticks * grid

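# Doctest-style sanity check for the two helpers above. The body of
# round_to_grid is hidden by the diff; judging from the quantize fallback
# later in the file, the grid is presumably grid = (60.0 / bpm) / division:
#   >>> hz_to_midi(440.0)                     # A4
#   69.0
#   >>> grid = (60.0 / 120) / 4               # 0.125 s: one sixteenth at 120 BPM
#   >>> float(np.round(0.30 / grid) * grid)   # 0.30 s snaps to the nearest tick
#   0.25
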
# ---------- Signal separation & percussive detection ----------
def separate_harmonic_percussive(y):
    """HPSS separation; returns (harmonic, percussive). If fails, return (y, zeros)."""
    try:
        y_h, y_p = librosa.effects.hpss(y)
        return y_h, y_p
    except Exception as e:
        print("HPSS fallback:", e)
        return y, np.zeros_like(y)

def detect_percussive_hits(y_p, sr, backtrack=False):
    """
    Detect percussive onsets and map them to simple drum MIDI notes.
    Returns list of (time_seconds, midi_note).
    Heuristics: use spectral centroid & onset energy to classify kick/snare/hihat.
    """
    try:
        onset_env = librosa.onset.onset_strength(y=y_p, sr=sr)
        onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, backtrack=backtrack)
        hits = []
        if len(onsets) == 0:
            return hits
        S = np.abs(librosa.stft(y_p, n_fft=2048))
        for fr in onsets:
            t = float(librosa.frames_to_time(fr, sr=sr))
            # spectral centroid around the frame (safe slicing)
            start = max(0, fr - 2)
            end = min(fr + 3, S.shape[1] - 1)
            try:
                centroid = np.mean(librosa.feature.spectral_centroid(S=S[:, start:end+1], sr=sr))
            except Exception:
                centroid = 0.0
            # Simple heuristic:
            # centroid small -> kick, medium -> snare, large -> hihat
            if centroid < 1500:
                midi_note = 36  # Kick
            elif centroid < 3500:
                midi_note = 38  # Acoustic snare
            else:
                midi_note = 42  # Closed hi-hat
            hits.append((t, midi_note))
        return hits
    except Exception as e:
        print("Percussive detection error:", e)
        return []

# ---------- Multi-pitch extraction ----------
def extract_multi_pitches(y_h, sr, hop_length=256, top_n=3, min_confidence=0.08):
    """
    Use piptrack to extract candidate pitches per frame.
    Returns list of (time_seconds, freq_hz).
    """
    try:
        S = np.abs(librosa.stft(y_h, n_fft=2048, hop_length=hop_length))
        pitches, mags = librosa.piptrack(S=S, sr=sr, hop_length=hop_length)
        times = librosa.frames_to_time(np.arange(pitches.shape[1]), sr=sr, hop_length=hop_length)
        candidates = []
        for i in range(pitches.shape[1]):
            col_p = pitches[:, i]
            col_m = mags[:, i]
            if np.max(col_m) <= 0:
                continue
            # pick top_n bins by magnitude
            idx = np.argsort(col_m)[-top_n:]
            max_col = np.max(col_m)
            for k in idx:
                if col_m[k] > 0 and col_m[k] >= min_confidence * max_col:
                    candidates.append((times[i], float(col_p[k])))
        # filter zeros & NaNs
        candidates = [(t, p) for (t, p) in candidates if p is not None and p > 0 and not np.isnan(p)]
        return candidates
    except Exception as e:
        print("extract_multi_pitches error:", e)
        return []

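# Note on the piptrack call above (librosa behavior): pitches and mags have
# shape (n_bins, n_frames); pitches[k, i] is the interpolated frequency of
# spectrogram bin k at frame i and mags[k, i] its magnitude, so
# np.argsort(col_m)[-top_n:] keeps the top_n strongest bins in each frame.
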
# ---------- Clustering / track formation ----------
def cluster_pitch_trajectories(candidates, max_voices=4):
    """
    Cluster candidate (time, pitch) pairs into trajectories representing voices/instruments.
    Returns list of tracks; each track is a sorted list of (time, freq_hz).
    """
    if not candidates:
        return []
    try:
        X = np.array([[t, hz_to_midi(h)] for (t, h) in candidates], dtype=np.float64)
        # Normalize columns (np.ptp; the ndarray.ptp method was removed in NumPy 2.0)
        Xn = X.copy()
        if np.ptp(Xn[:, 0]) > 1e-9:
            Xn[:, 0] = (Xn[:, 0] - Xn[:, 0].min()) / np.ptp(Xn[:, 0])
        else:
            Xn[:, 0] = 0.0
        if np.ptp(Xn[:, 1]) > 1e-9:
            Xn[:, 1] = (Xn[:, 1] - Xn[:, 1].min()) / np.ptp(Xn[:, 1])
        else:
            Xn[:, 1] = 0.0
        n_clusters = min(max_voices, max(1, int(np.unique(np.round(Xn, 3), axis=0).shape[0])))
        if n_clusters <= 1:
            labels = np.zeros(len(Xn), dtype=int)
        else:
            clustering = AgglomerativeClustering(n_clusters=n_clusters).fit(Xn)
            labels = clustering.labels_
        tracks = []
        for lab in range(int(labels.max()) + 1):
            idxs = np.where(labels == lab)[0]
            if len(idxs) == 0:
                continue
            pts = [(float(X[i, 0]), float(X[i, 1])) for i in idxs]
            # convert midi values back to hz for smoothing/processing (midi->hz)
            pts_hz = [(t, A440 * (2 ** ((m - 69) / 12))) for (t, m) in pts]
            pts_sorted = sorted(pts_hz, key=lambda x: x[0])
            tracks.append(pts_sorted)
        return tracks
    except Exception as e:
        print("cluster_pitch_trajectories error:", e)
        return []

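# What the clustering yields on toy data (illustrative values, not from the
# commit): two well-separated synthetic voices come back as two trajectories.
#   >>> toy = [(0.0, 220.0), (0.1, 221.0), (0.2, 219.5),
#   ...        (0.0, 880.0), (0.1, 878.0), (0.2, 882.0)]
#   >>> len(cluster_pitch_trajectories(toy, max_voices=2))
#   2
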
def trajectories_to_notes(tracks, hop_length, sr, min_note_ms=80):
    """
    Convert each trajectory (time,freq) to notes (midi_int, start, end).
    Groups consecutive equal rounded-midis and enforces minimum duration.
    """
    notes = []
    for tr in tracks:
        if not tr:
            continue
        times = np.array([t for t, _ in tr])
        freqs = np.array([f for _, f in tr])
        # Smooth frequencies
        freqs_s = safe_median_filter(freqs.astype(np.float64), size=3)
        midis = np.round([hz_to_midi(f) for f in freqs_s])
        # Group consecutive equal midis
        i = 0
        n = len(midis)
        frame_ms = 1000.0 * hop_length / sr
        min_frames = max(1, int(np.ceil(min_note_ms / frame_ms)))
        while i < n:
            j = i + 1
            while j < n and midis[j] == midis[i]:
                j += 1
            if (j - i) >= min_frames and not np.isnan(midis[i]):
                t0 = float(times[i])
                t1 = float(times[j - 1] + hop_length / sr)
                notes.append((int(midis[i]), t0, t1))
            i = j
    return notes

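# Frame bookkeeping used above, for reference (sr is the file's native rate;
# 22050 Hz here is only illustrative):
#   frame_ms   = 1000 * hop_length / sr        -> 1000 * 256 / 22050 ≈ 11.6 ms
#   min_frames = ceil(min_note_ms / frame_ms)  -> ceil(80 / 11.6) = 7 frames
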
# ---------- Main multi-instrument conversion ----------
def audio_to_midi_multi(
    audio,
    hop_length=256,
    frame_length=2048,  # accepted but currently unused below
    max_voices=3,
    percussive=True,
    bpm=120,
    quantize=True,
    division=4,
    velocity=100,
    program_map=None,
    top_n=4,
    min_confidence=0.10,
    min_note_ms=80,
):
    """
    Full pipeline:
    - load audio
    - HPSS
    - detect percussive hits -> drum track
    - extract multi-pitch candidates from harmonic part
    - cluster candidates into tracks (voices)
    - convert tracks to MIDI notes and split into separate instruments by pitch ranges
    """
    try:
        # Load audio
        if isinstance(audio, tuple):
            sr, y = audio
            y = np.array(y, dtype=np.float32)
        else:
            y, sr = librosa.load(audio, sr=None, mono=True)
        if y.size == 0:
            raise ValueError("Empty audio")
        # normalize
        if np.max(np.abs(y)) > 0:
            y = y / np.max(np.abs(y))

        # HPSS
        y_h, y_p = separate_harmonic_percussive(y)

        pm = pretty_midi.PrettyMIDI()

        # Percussion track
        if percussive:
            hits = detect_percussive_hits(y_p, sr)
            if hits:
                drum_inst = pretty_midi.Instrument(program=0, is_drum=True)
                for t, midi_note in hits:
                    # tiny duration for hits
                    drum_inst.notes.append(pretty_midi.Note(velocity=int(velocity), pitch=int(midi_note),
                                                            start=float(t), end=float(t + 0.05)))
                pm.instruments.append(drum_inst)

        # Harmonic: multi-pitch extraction
        candidates = extract_multi_pitches(y_h, sr, hop_length=hop_length, top_n=top_n, min_confidence=min_confidence)
        tracks = cluster_pitch_trajectories(candidates, max_voices=max_voices)
        notes = trajectories_to_notes(tracks, hop_length=hop_length, sr=sr, min_note_ms=min_note_ms)

        # If we have notes, split by pitch quantiles into up to max_voices instrument tracks.
        if notes:
            midi_vals = np.array([n[0] for n in notes])
            unique = np.unique(midi_vals)
            groups = int(min(max_voices, max(1, len(unique))))
            edges = np.quantile(midi_vals, np.linspace(0, 1, groups + 1))
            for g in range(groups):
                program = program_map[g] if (program_map and g < len(program_map)) else 0
                inst = pretty_midi.Instrument(program=int(program))
                low = edges[g]
                high = edges[g + 1]
                for m, t0, t1 in notes:
                    if m >= low - 0.0001 and m <= high + 0.0001:
                        inst.notes.append(pretty_midi.Note(velocity=int(velocity), pitch=int(m), start=float(t0),
                                                           end=float(t1)))
                # Only append instruments that have notes
                if len(inst.notes) > 0:
                    pm.instruments.append(inst)

        # Quantize to grid if requested
        if quantize and bpm > 0:
            for instr in pm.instruments:
                for note in instr.notes:
                    note.start = float(round_to_grid(note.start, bpm, division))
                    note.end = float(round_to_grid(note.end, bpm, division))
                    if note.end <= note.start:
                        note.end = note.start + (60.0 / bpm) / division

        # Save MIDI
        tmpdir = tempfile.mkdtemp()
        midi_path = os.path.join(tmpdir, "multi_output.mid")
        pm.write(midi_path)

        summary = {
            "duration_s": round(len(y) / sr, 3),
            "instruments": len(pm.instruments),
            "notes_total": sum(len(i.notes) for i in pm.instruments),
            "bpm": bpm,
            "voices_requested": max_voices,
        }
        return midi_path, summary

    except Exception as e:
        traceback.print_exc()
        raise

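# Headless usage sketch (bypassing the Gradio UI; the path and parameter
# values are illustrative, not part of the commit):
#   >>> midi_path, summary = audio_to_midi_multi("song.wav", max_voices=3, bpm=100)
#   >>> summary["instruments"], summary["notes_total"]
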
# ---------- Gradio UI ----------
CSS = """
#app_title {font-size: 28px; font-weight: 800}
#app_subtitle {opacity: .8}
"""

with gr.Blocks(css=CSS, title="Audio → Multi-MIDI (AlexGPT)") as demo:
    gr.Markdown("<div id='app_title'>🎤 Audio → 🎹 MIDI (Polyphonic & Multi-instrument)</div>"
                "<div id='app_subtitle'>HPSS + Multi-pitch + Clustering → multi-track MIDI</div>")

    with gr.Row():
        with gr.Column(scale=2):
            audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio de entrada (mono/mix)")
            with gr.Accordion("Extracción / Separación", open=False):
                hop = gr.Slider(128, 1024, value=256, step=64, label="Hop length (samples)")
                frame = gr.Slider(1024, 4096, value=2048, step=256, label="Frame length (samples)")
                max_voices = gr.Slider(1, 6, value=3, step=1, label="Máx voces (clusters)")
                percussive = gr.Checkbox(value=True, label="Detectar percusión (HPSS)")
                topn = gr.Slider(1, 8, value=4, step=1, label="Picos por frame (top N)")
                min_conf = gr.Slider(0.01, 0.5, value=0.1, step=0.01, label="Umbral relativo de confianza")
                min_note_ms = gr.Slider(10, 500, value=80, step=10, label="Duración mínima nota (ms)")

            with gr.Accordion("Salida MIDI", open=True):
                do_quant = gr.Checkbox(value=True, label="Cuantizar a rejilla")
                bpm = gr.Slider(40, 220, value=120, step=1, label="BPM")
                division = gr.Dropdown([1, 2, 4, 8, 16], value=4, label="División por negra (1=negra, 4=semicorchea)")
                velocity = gr.Slider(1, 127, value=100, step=1, label="Velocidad (1-127)")
                # program_map not editable in UI for simplicity; advanced: add dynamic inputs

            run_btn = gr.Button("🔄 Convertir a MIDI", variant="primary")

        with gr.Column(scale=1):
            midi_out = gr.File(label="Archivo MIDI generado")
            summary_out = gr.JSON(label="Resumen")
            gr.Markdown(
                "**Sugerencias**\n\n"
                "- Este método es heurístico: los mejores resultados salen de mezclas con instrumentos claros y poca reverb.\n"
                "- Para separar pistas reales (vocal, synth, bass) usa modelos de source separation (Demucs/Spleeter) antes del análisis.\n"
                "- Ajusta `Máx voces` al número aproximado de instrumentos melódicos.\n"
            )

    def _convert(audio_path, hop_length, frame_length, max_voices_val, percussive_val, topn_val,
                 do_quantize, bpm_val, division_val, velocity_val, min_conf_val, min_note_ms_val):
        try:
            midi_path, summary = audio_to_midi_multi(
                audio=audio_path,
                hop_length=int(hop_length),
                frame_length=int(frame_length),
                max_voices=int(max_voices_val),
                percussive=bool(percussive_val),
                bpm=float(bpm_val),
                quantize=bool(do_quantize),
                division=int(division_val),
                velocity=int(velocity_val),
                top_n=int(topn_val),
                min_confidence=float(min_conf_val),
                min_note_ms=int(min_note_ms_val),
            )
            return midi_path, summary
        except Exception as e:
            return gr.update(value=None), {"error": str(e)}

    run_btn.click(
        _convert,
        inputs=[audio_in, hop, frame, max_voices, percussive, topn, do_quant, bpm, division, velocity, min_conf, min_note_ms],
        outputs=[midi_out, summary_out],
    )

if __name__ == "__main__":
    demo.launch()
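
Every third-party import in the new file (numpy, librosa, pretty_midi, gradio, scikit-learn, plus scipy for safe_median_filter) has to be declared for the Space to build; a minimal requirements.txt sketch covering exactly those imports (unpinned; version pins are left to the author):

    numpy
    scipy
    librosa
    pretty_midi
    scikit-learn
    gradio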