Spaces:

mrblackdev
/

Voice-To-MiDI-VTM

Runtime error

App Files Files Community

mrblackdev committed on Aug 24, 2025

Commit

0aba475

verified ·

1 Parent(s): c431cef

Create app.py

Browse files

Files changed (1) hide show

app.py +265 -0

app.py ADDED Viewed

	@@ -0,0 +1,265 @@

+import os
+import tempfile
+import numpy as np
+import librosa
+import pretty_midi
+import gradio as gr
# =====================
# Utilities
# =====================
A440 = 440.0  # Reference tuning in Hz; hz_to_midi maps this frequency to MIDI note 69.


def hz_to_midi(f):
    """Convert a frequency in Hz to a (possibly fractional) MIDI note number.

    Args:
        f: Frequency in Hz, or None.

    Returns:
        The MIDI note number as a float (440 Hz maps to 69.0), or None when
        ``f`` is None, NaN, infinite, or not strictly positive.
    """
    # np.isfinite rejects both NaN and +/-inf; an infinite frequency would
    # otherwise yield an infinite note number that cannot be cast to int.
    if f is None or not np.isfinite(f) or f <= 0:
        return None
    return float(69 + 12 * np.log2(f / A440))


def midi_to_hz(m):
    """Convert a (possibly fractional) MIDI note number to a frequency in Hz."""
    return A440 * (2 ** ((m - 69) / 12))
def round_to_grid(seconds, bpm, division=4):
    """Snap a time in seconds to the nearest point of a tempo grid.

    The grid step is one beat (``60 / bpm`` seconds) split into ``division``
    equal parts, e.g. ``division=4`` gives four grid points per beat.

    Args:
        seconds: Time to quantize, in seconds.
        bpm: Tempo in beats per minute.
        division: Number of grid steps per beat.

    Returns:
        The time of the nearest grid point. When ``bpm`` or ``division`` is
        not strictly positive no grid can be built and ``seconds`` is
        returned unchanged.
    """
    # A zero division would raise ZeroDivisionError and a negative one would
    # produce a negative step, so treat both like a non-positive tempo.
    if bpm <= 0 or division <= 0:
        return seconds
    grid = (60.0 / bpm) / division
    return np.round(seconds / grid) * grid
def group_notes(f0, sr, hop_length,
                min_note_ms=80,
                merge_gap_ms=60,
                midi_smoothing_window=3):
    """Group consecutive frames with the same rounded MIDI pitch into notes.

    Args:
        f0: Per-frame fundamental frequency in Hz. Unvoiced frames may be
            NaN, None, infinite or non-positive; all are treated as unvoiced.
        sr: Sample rate in Hz.
        hop_length: Number of samples between consecutive frames.
        min_note_ms: Notes spanning less than this many milliseconds
            (rounded up to whole frames) are discarded.
        merge_gap_ms: Unvoiced gaps of up to this many milliseconds (rounded
            up to whole frames) inside a note do not end it.
        midi_smoothing_window: Width, in frames, of the median filter applied
            to the MIDI curve before rounding. Values <= 1 (or falsy) disable
            smoothing. For even widths the median is the mean of the two
            middle values.

    Returns:
        A list of ``(midi_note, t_start, t_end)`` tuples, with the note as an
        int and times as floats in seconds. ``t_end`` is the end of the last
        voiced frame of the note, so trailing silence is not included.
    """
    freqs = np.asarray(f0, dtype=float)  # None entries become NaN here
    n = len(freqs)
    if n == 0:
        return []

    # Build a float MIDI curve with NaN for unvoiced frames. A float array is
    # required: an object array (e.g. containing None) breaks both np.isnan
    # and median filtering.
    with np.errstate(invalid="ignore"):
        voiced = np.isfinite(freqs) & (freqs > 0)
    midi_vals = np.full(n, np.nan)
    # 440 Hz <-> MIDI 69, the same reference as the module-level A440.
    midi_vals[voiced] = 69.0 + 12.0 * np.log2(freqs[voiced] / 440.0)

    # Median smoothing to suppress spurious jumps. Only windows made entirely
    # of voiced frames are smoothed; frames next to silence (or the array
    # edges) keep their raw value so NaNs never leak into the median.
    window = int(midi_smoothing_window) if midi_smoothing_window else 0
    if window > 1:
        left = window // 2
        padded = np.pad(midi_vals, (left, window - 1 - left),
                        constant_values=np.nan)
        windows = np.lib.stride_tricks.sliding_window_view(padded, window)
        complete = ~np.isnan(windows).any(axis=1)
        if complete.any():
            smoothed = midi_vals.copy()
            smoothed[complete] = np.median(windows[complete], axis=1)
            midi_vals = smoothed

    # Round to the nearest semitone; NaN (unvoiced) stays NaN.
    midi_round = np.round(midi_vals)

    frame_ms = 1000.0 * hop_length / sr
    min_frames = max(1, int(np.ceil(min_note_ms / frame_ms)))
    merge_gap_frames = int(np.ceil(merge_gap_ms / frame_ms))

    notes = []
    i = 0
    while i < n:
        if np.isnan(midi_round[i]):
            i += 1
            continue
        note_val = int(midi_round[i])
        start = i
        last_voiced = i  # last frame that actually carried this pitch
        j = i + 1
        while j < n:
            if np.isnan(midi_round[j]):
                # Tolerate short unvoiced gaps inside the note.
                if j - last_voiced > merge_gap_frames:
                    break
            elif int(midi_round[j]) != note_val:
                break
            else:
                last_voiced = j
            j += 1
        # Measure the note up to its last voiced frame so that trailing gap
        # frames neither lengthen it nor help it pass the minimum duration.
        if (last_voiced - start + 1) >= min_frames:
            t0 = float(start * hop_length / sr)
            t1 = float((last_voiced + 1) * hop_length / sr)
            notes.append((note_val, t0, t1))
        # Resume at j itself: when the inner loop stopped on a different
        # pitch, frame j is the first frame of the next note.
        i = j
    return notes
def audio_to_midi(
    audio,
    fmin_note='C2',
    fmax_note='C7',
    hop_length=256,
    frame_length=2048,
    voicing_thres=0.1,
    min_note_ms=80,
    merge_gap_ms=60,
    bpm=100,
    quantize=True,
    division=4,
    velocity=80,
    program=0,
):
    """Transcribe monophonic audio to a MIDI file written in a temp directory.

    Args:
        audio: Either a file path accepted by ``librosa.load`` or a
            ``(sample_rate, samples)`` tuple.
        fmin_note: Lowest note name searched by the pitch tracker.
        fmax_note: Highest note name searched by the pitch tracker.
        hop_length: Hop between analysis frames, in samples.
        frame_length: Analysis frame length, in samples.
        voicing_thres: Forwarded to ``librosa.pyin`` as ``trough_threshold``.
        min_note_ms: Minimum note duration, forwarded to ``group_notes``.
        merge_gap_ms: Maximum bridged gap, forwarded to ``group_notes``.
        bpm: Tempo used for quantization and reported in the summary.
        quantize: When true (and ``bpm > 0``) note times are snapped to the
            grid with ``round_to_grid``.
        division: Grid steps per beat used for quantization.
        velocity: MIDI velocity for every note, clamped to 1..127.
        program: MIDI program number, clamped to 0..127.

    Returns:
        ``(midi_path, summary)`` where ``midi_path`` is the path of the
        written ``output.mid`` and ``summary`` is a dict of basic metrics.

    Raises:
        ValueError: If no audio is given, the audio is empty, or the minimum
            note is not lower than the maximum note.
    """
    if audio is None:
        raise ValueError("No audio input was provided.")

    # Load audio
    if isinstance(audio, tuple):
        # (sr, data) pair, e.g. from a Gradio numpy-type audio input
        sr, y = audio
        y = np.asarray(y, dtype=np.float32)
        if y.ndim > 1:
            # Down-mix to mono; assumes a (samples, channels) layout as
            # Gradio documents for numpy audio -- TODO confirm for other callers.
            y = y.mean(axis=1)
    else:
        # File path
        y, sr = librosa.load(audio, sr=None, mono=True)

    if y.size == 0:
        raise ValueError("The audio input is empty.")

    # Peak-normalize (skipped for pure silence to avoid dividing by zero)
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak

    # pYIN fundamental-frequency tracking
    fmin_hz = librosa.note_to_hz(fmin_note)
    fmax_hz = librosa.note_to_hz(fmax_note)
    if fmin_hz >= fmax_hz:
        raise ValueError(
            f"fmin_note ({fmin_note}) must be lower than fmax_note ({fmax_note})."
        )
    f0, voiced_flag, _ = librosa.pyin(
        y,
        fmin=fmin_hz,
        fmax=fmax_hz,
        frame_length=frame_length,
        hop_length=hop_length,
        center=True,
        sr=sr,
        trough_threshold=voicing_thres,
    )
    # Mark unvoiced frames as NaN
    f0[~voiced_flag] = np.nan

    # Group frames into (midi, t0, t1) notes
    notes = group_notes(
        f0=f0,
        sr=sr,
        hop_length=hop_length,
        min_note_ms=min_note_ms,
        merge_gap_ms=merge_gap_ms,
        midi_smoothing_window=3,
    )

    # Optional time quantization
    if quantize and bpm > 0:
        q_notes = []
        for m, t0, t1 in notes:
            qt0 = float(round_to_grid(t0, bpm, division))
            qt1 = float(round_to_grid(t1, bpm, division))
            if qt1 <= qt0:
                qt1 = qt0 + (60.0 / bpm) / division  # at least one grid step
            q_notes.append((m, qt0, qt1))
        notes = q_notes

    # Build the MIDI object
    midi_velocity = int(np.clip(velocity, 1, 127))
    midi_program = int(np.clip(program, 0, 127))
    pm = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(program=midi_program)  # 0 = Acoustic Grand Piano
    instrument.notes.extend(
        pretty_midi.Note(velocity=midi_velocity, pitch=int(m), start=float(t0), end=float(t1))
        for m, t0, t1 in notes
    )
    pm.instruments.append(instrument)

    # Write to a fresh temp directory. It is not removed here because the
    # returned path must still exist after this function returns.
    tmpdir = tempfile.mkdtemp()
    midi_path = os.path.join(tmpdir, "output.mid")
    pm.write(midi_path)

    # Metrics
    pitches = [note[0] for note in notes]
    summary = {
        "duracion_audio_s": round(len(y) / sr, 3),
        "notas_detectadas": len(notes),
        "rango_midi_min": int(min(pitches)) if pitches else None,
        "rango_midi_max": int(max(pitches)) if pitches else None,
        "bpm": bpm,
        "division": division,
    }
    return midi_path, summary
# =====================
# Gradio interface
# =====================
CSS = """
#app_title {font-size: 28px; font-weight: 800}
#app_subtitle {opacity: .8}
"""
with gr.Blocks(css=CSS, fill_height=True) as demo:
    gr.Markdown("""
    <div id='app_title'>🎤 Audio → 🎹 MIDI (Pitch‑to‑MIDI)</div>
    <div id='app_subtitle'>Sube o graba tu voz, detecta notas y exporta un archivo MIDI listo para tu DAW.</div>
    """)
    with gr.Row():
        with gr.Column(scale=2):
            audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio de entrada (voz, monofónica)")
            with gr.Accordion("Opciones de detección", open=False):
                fmin = gr.Dropdown(["C1","C2","C3","C4","C5"], value="C2", label="Nota mínima")
                fmax = gr.Dropdown(["C4","C5","C6","C7"], value="C7", label="Nota máxima")
                hop = gr.Slider(128, 1024, value=256, step=64, label="Hop length (muestras)")
                frame = gr.Slider(1024, 4096, value=2048, step=256, label="Frame length (muestras)")
                voice_th = gr.Slider(0.01, 0.5, value=0.1, step=0.01, label="Umbral de voicing (pyin)")
                min_ms = gr.Slider(10, 200, value=80, step=5, label="Duración mínima de nota (ms)")
                gap_ms = gr.Slider(0, 200, value=60, step=5, label="Unir huecos ≤ (ms)")
            with gr.Accordion("Cuantización y salida", open=True):
                do_quant = gr.Checkbox(value=True, label="Cuantizar a rejilla")
                bpm = gr.Slider(40, 220, value=100, step=1, label="BPM")
                # Gradio choice tuples are (displayed name, value): the label
                # comes first and the integer is what the callback receives.
                division = gr.Dropdown([("Corchea", 2), ("Semicorchea", 4), ("Fusa", 8)], value=4, label="División por negra", info="Más alto = rejilla más fina")
                velocity = gr.Slider(1, 127, value=90, step=1, label="Velocidad (1-127)")
                program = gr.Slider(0, 127, value=0, step=1, label="Programa/MIDI Instrument (0=Piano)")
            run_btn = gr.Button("🔄 Convertir a MIDI", variant="primary")
        with gr.Column(scale=1):
            midi_out = gr.File(label="Archivo MIDI generado")
            summary_out = gr.JSON(label="Resumen")
            gr.Markdown("""
            **Tips**
            - Canta una melodía monofónica, sin armonías.
            - Ajusta el rango de notas (C2–C7) si cantas muy grave o agudo.
            - Usa la cuantización para encajar a tempo; si quieres naturalidad, desactívala.
            """)

    def _convert(audio_path, fmin_note, fmax_note, hop_length, frame_length, voice_thres, min_ms, gap_join_ms, do_quantize, bpm_val, division_val, velocity_val, program_val):
        """Button callback: validate the inputs, coerce widget values and run audio_to_midi.

        Returns the generated MIDI path and the summary dict, matching the
        ``outputs`` wired to the button below.
        """
        # The button can be pressed before any audio was recorded or uploaded.
        if not audio_path:
            raise gr.Error("Sube o graba un audio antes de convertir.")
        # The two dropdowns overlap (C4, C5), so an inverted range is selectable.
        if librosa.note_to_hz(fmin_note) >= librosa.note_to_hz(fmax_note):
            raise gr.Error("La nota mínima debe ser más grave que la nota máxima.")
        # The dropdown is expected to deliver the integer value; tolerate a
        # (label, value) pair defensively.
        if isinstance(division_val, (tuple, list)):
            division_val = division_val[1]
        midi_path, summary = audio_to_midi(
            audio=audio_path,
            fmin_note=fmin_note,
            fmax_note=fmax_note,
            hop_length=int(hop_length),
            frame_length=int(frame_length),
            voicing_thres=float(voice_thres),
            min_note_ms=int(min_ms),
            merge_gap_ms=int(gap_join_ms),
            bpm=float(bpm_val),
            quantize=bool(do_quantize),
            division=int(division_val),
            velocity=int(velocity_val),
            program=int(program_val),
        )
        return midi_path, summary

    run_btn.click(
        _convert,
        inputs=[audio_in, fmin, fmax, hop, frame, voice_th, min_ms, gap_ms, do_quant, bpm, division, velocity, program],
        outputs=[midi_out, summary_out]
    )

# Start the Gradio server only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()