Spaces:

mrblackdev
/

Voice-To-MiDI-VTM

Runtime error

App Files Community

mrblackdev committed on Aug 24, 2025

Commit

7d16dd5

verified ·

1 Parent(s): 88ac55f

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -97

app.py CHANGED Viewed

@@ -5,9 +5,6 @@ import librosa
 import pretty_midi
 import gradio as gr
-# =====================
-# Utilidades
-# =====================
 A440 = 440.0
@@ -17,12 +14,7 @@ def hz_to_midi(f):
     return 69 + 12 * np.log2(f / A440)
-def midi_to_hz(m):
-    return A440 * (2 ** ((m - 69) / 12))
 def round_to_grid(seconds, bpm, division=4):
-    """Cuantiza tiempo en segundos a la rejilla (division por negra, p.ej. 4=semicorchea)."""
     if bpm <= 0:
         return seconds
     beat = 60.0 / bpm
@@ -31,34 +23,18 @@ def round_to_grid(seconds, bpm, division=4):
     return ticks * grid
-def group_notes(f0, sr, hop_length,
-                min_note_ms=80,
-                merge_gap_ms=60,
-                midi_smoothing_window=3):
-    """
-    Agrupa frames con el mismo número MIDI (tras redondeo) en notas con inicio/fin.
-    - f0: vector de frecuencias (Hz, NaN para no sonoro)
-    - Devuelve lista de (midi_note, t_start, t_end)
-    """
     times = np.arange(len(f0)) * hop_length / sr
-    # Convertir a MIDI y enmascarar no sonoros
     midi_vals = np.array([hz_to_midi(x) for x in f0])
-    # Suavizado mediano para reducir saltos espurios
     if midi_smoothing_window and midi_smoothing_window > 1:
         from scipy.ndimage import median_filter
         midi_vals = median_filter(midi_vals, size=midi_smoothing_window)
-    # Redondeo al entero más cercano (clase de nota)
     midi_round = np.round(midi_vals)
-    # No sonoros -> NaN
     midi_round[np.isnan(midi_vals)] = np.nan
-    notes = []
-    i = 0
-    n = len(midi_round)
     frame_ms = 1000.0 * hop_length / sr
     min_frames = max(1, int(np.ceil(min_note_ms / frame_ms)))
     merge_gap_frames = int(np.ceil(merge_gap_ms / frame_ms))
@@ -67,11 +43,7 @@ def group_notes(f0, sr, hop_length,
         if np.isnan(midi_round[i]):
             i += 1
             continue
-        note_val = int(midi_round[i])
-        start = i
-        j = i + 1
-        # extender mientras siga misma nota (permitimos pequeños NaN huecos cortos)
-        gap = 0
         while j < n:
             if np.isnan(midi_round[j]):
                 gap += 1
@@ -83,115 +55,66 @@ def group_notes(f0, sr, hop_length,
             if int(midi_round[j]) != note_val:
                 break
             j += 1
-        # Validar duración mínima
         if (j - start) >= min_frames:
-            t0 = times[start]
-            t1 = times[j - 1] + hop_length / sr
             notes.append((note_val, t0, t1))
         i = j + 1
     return notes
-def audio_to_midi(
-    audio,
-    fmin_note='C2',
-    fmax_note='C7',
-    hop_length=256,
-    frame_length=2048,
-    voicing_thres=0.1,
-    min_note_ms=80,
-    merge_gap_ms=60,
-    bpm=100,
-    quantize=True,
-    division=4,
-    velocity=80,
-    program=0,
-):
-    """
-    Convierte audio (ruta o ndarray) a un archivo MIDI temporal y retorna ruta + resumen.
-    """
-    # Cargar audio
     if isinstance(audio, tuple):
-        # gradio mic: (sr, data)
         sr, y = audio
         y = np.array(y, dtype=np.float32)
     else:
-        # gradio file: filepath str
         y, sr = librosa.load(audio, sr=None, mono=True)
-    # Normalizar
     if np.max(np.abs(y)) > 0:
         y = y / np.max(np.abs(y))
-    # Pyin para f0
     fmin_hz = librosa.note_to_hz(fmin_note)
     fmax_hz = librosa.note_to_hz(fmax_note)
     f0, voiced_flag, _ = librosa.pyin(
-        y,
-        fmin=fmin_hz,
-        fmax=fmax_hz,
-        frame_length=frame_length,
-        hop_length=hop_length,
-        center=True,
-        sr=sr,
-        trough_threshold=voicing_thres,
-    )
-    # Filtrar frames no sonoros
     f0[~voiced_flag] = np.nan
-    # Agrupar en notas (midi, t0, t1)
-    notes = group_notes(
-        f0=f0,
-        sr=sr,
-        hop_length=hop_length,
-        min_note_ms=min_note_ms,
-        merge_gap_ms=merge_gap_ms,
-        midi_smoothing_window=3,
-    )
-    # Opcional: cuantización temporal
     if quantize and bpm > 0:
         q_notes = []
         for m, t0, t1 in notes:
-            qt0 = float(round_to_grid(t0, bpm, division))
-            qt1 = float(round_to_grid(t1, bpm, division))
             if qt1 <= qt0:
-                qt1 = qt0 + (60.0 / bpm) / division  # mínimo 1 grid
             q_notes.append((m, qt0, qt1))
         notes = q_notes
-    # Construir MIDI
     pm = pretty_midi.PrettyMIDI()
-    instrument = pretty_midi.Instrument(program=program)  # 0 = Acoustic Grand Piano
     for m, t0, t1 in notes:
         v = int(np.clip(velocity, 1, 127))
         instrument.notes.append(pretty_midi.Note(velocity=v, pitch=int(m), start=float(t0), end=float(t1)))
     pm.instruments.append(instrument)
-    # Guardar a archivo temporal
     tmpdir = tempfile.mkdtemp()
     midi_path = os.path.join(tmpdir, "output.mid")
     pm.write(midi_path)
-    # Métricas
-    dur = len(y) / sr
     summary = {
-        "duracion_audio_s": round(dur, 3),
         "notas_detectadas": len(notes),
         "rango_midi_min": int(np.min([n[0] for n in notes])) if notes else None,
         "rango_midi_max": int(np.max([n[0] for n in notes])) if notes else None,
         "bpm": bpm,
         "division": division,
     }
     return midi_path, summary
-# =====================
-# Interfaz Gradio
-# =====================
 CSS = """
 #app_title {font-size: 28px; font-weight: 800}
 #app_subtitle {opacity: .8}
@@ -207,8 +130,8 @@ with gr.Blocks(css=CSS, fill_height=True) as demo:
         with gr.Column(scale=2):
             audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio de entrada (voz, monofónica)")
             with gr.Accordion("Opciones de detección", open=False):
-                fmin = gr.Dropdown(["C1","C2","C3","C4","C5"], value="C2", label="Nota mínima")
-                fmax = gr.Dropdown(["C4","C5","C6","C7"], value="C7", label="Nota máxima")
                 hop = gr.Slider(128, 1024, value=256, step=64, label="Hop length (muestras)")
                 frame = gr.Slider(1024, 4096, value=2048, step=256, label="Frame length (muestras)")
                 voice_th = gr.Slider(0.01, 0.5, value=0.1, step=0.01, label="Umbral de voicing (pyin)")
@@ -218,7 +141,7 @@ with gr.Blocks(css=CSS, fill_height=True) as demo:
             with gr.Accordion("Cuantización y salida", open=True):
                 do_quant = gr.Checkbox(value=True, label="Cuantizar a rejilla")
                 bpm = gr.Slider(40, 220, value=100, step=1, label="BPM")
-                division = gr.Dropdown([(2, "Corchea"), (4, "Semicorchea"), (8, "Fusa")], value=4, label="División por negra", info="Más alto = rejilla más fina")
                 velocity = gr.Slider(1, 127, value=90, step=1, label="Velocidad (1-127)")
                 program = gr.Slider(0, 127, value=0, step=1, label="Programa/MIDI Instrument (0=Piano)")
@@ -235,9 +158,6 @@ with gr.Blocks(css=CSS, fill_height=True) as demo:
             """)
     def _convert(audio_path, fmin_note, fmax_note, hop_length, frame_length, voice_thres, min_ms, gap_join_ms, do_quantize, bpm_val, division_val, velocity_val, program_val):
-        # division puede venir como tuple(label) o int (según Gradio). Normalizamos.
-        if isinstance(division_val, tuple):
-            division_val = division_val[0]
         midi_path, summary = audio_to_midi(
             audio=audio_path,
             fmin_note=fmin_note,

 import pretty_midi
 import gradio as gr
 A440 = 440.0
     return 69 + 12 * np.log2(f / A440)
 def round_to_grid(seconds, bpm, division=4):
     if bpm <= 0:
         return seconds
     beat = 60.0 / bpm
     return ticks * grid
+def group_notes(f0, sr, hop_length, min_note_ms=80, merge_gap_ms=60, midi_smoothing_window=3):
     times = np.arange(len(f0)) * hop_length / sr
     midi_vals = np.array([hz_to_midi(x) for x in f0])
     if midi_smoothing_window and midi_smoothing_window > 1:
         from scipy.ndimage import median_filter
         midi_vals = median_filter(midi_vals, size=midi_smoothing_window)
     midi_round = np.round(midi_vals)
     midi_round[np.isnan(midi_vals)] = np.nan
+    notes, i, n = [], 0, len(midi_round)
     frame_ms = 1000.0 * hop_length / sr
     min_frames = max(1, int(np.ceil(min_note_ms / frame_ms)))
     merge_gap_frames = int(np.ceil(merge_gap_ms / frame_ms))
         if np.isnan(midi_round[i]):
             i += 1
             continue
+        note_val, start, j, gap = int(midi_round[i]), i, i + 1, 0
         while j < n:
             if np.isnan(midi_round[j]):
                 gap += 1
             if int(midi_round[j]) != note_val:
                 break
             j += 1
         if (j - start) >= min_frames:
+            t0, t1 = times[start], times[j - 1] + hop_length / sr
             notes.append((note_val, t0, t1))
         i = j + 1
     return notes
+def audio_to_midi(audio, fmin_note='C2', fmax_note='C7', hop_length=256, frame_length=2048,
+                  voicing_thres=0.1, min_note_ms=80, merge_gap_ms=60, bpm=100,
+                  quantize=True, division=4, velocity=80, program=0):
     if isinstance(audio, tuple):
         sr, y = audio
         y = np.array(y, dtype=np.float32)
     else:
         y, sr = librosa.load(audio, sr=None, mono=True)
     if np.max(np.abs(y)) > 0:
         y = y / np.max(np.abs(y))
     fmin_hz = librosa.note_to_hz(fmin_note)
     fmax_hz = librosa.note_to_hz(fmax_note)
     f0, voiced_flag, _ = librosa.pyin(
+        y, fmin=fmin_hz, fmax=fmax_hz, frame_length=frame_length,
+        hop_length=hop_length, center=True, sr=sr, trough_threshold=voicing_thres)
     f0[~voiced_flag] = np.nan
+    notes = group_notes(f0, sr, hop_length, min_note_ms, merge_gap_ms, 3)
     if quantize and bpm > 0:
         q_notes = []
         for m, t0, t1 in notes:
+            qt0, qt1 = round_to_grid(t0, bpm, division), round_to_grid(t1, bpm, division)
             if qt1 <= qt0:
+                qt1 = qt0 + (60.0 / bpm) / division
             q_notes.append((m, qt0, qt1))
         notes = q_notes
     pm = pretty_midi.PrettyMIDI()
+    instrument = pretty_midi.Instrument(program=program)
     for m, t0, t1 in notes:
         v = int(np.clip(velocity, 1, 127))
         instrument.notes.append(pretty_midi.Note(velocity=v, pitch=int(m), start=float(t0), end=float(t1)))
     pm.instruments.append(instrument)
     tmpdir = tempfile.mkdtemp()
     midi_path = os.path.join(tmpdir, "output.mid")
     pm.write(midi_path)
     summary = {
+        "duracion_audio_s": round(len(y) / sr, 3),
         "notas_detectadas": len(notes),
         "rango_midi_min": int(np.min([n[0] for n in notes])) if notes else None,
         "rango_midi_max": int(np.max([n[0] for n in notes])) if notes else None,
         "bpm": bpm,
         "division": division,
     }
     return midi_path, summary
 CSS = """
 #app_title {font-size: 28px; font-weight: 800}
 #app_subtitle {opacity: .8}
         with gr.Column(scale=2):
             audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio de entrada (voz, monofónica)")
             with gr.Accordion("Opciones de detección", open=False):
+                fmin = gr.Dropdown(["C1", "C2", "C3", "C4", "C5"], value="C2", label="Nota mínima")
+                fmax = gr.Dropdown(["C4", "C5", "C6", "C7"], value="C7", label="Nota máxima")
                 hop = gr.Slider(128, 1024, value=256, step=64, label="Hop length (muestras)")
                 frame = gr.Slider(1024, 4096, value=2048, step=256, label="Frame length (muestras)")
                 voice_th = gr.Slider(0.01, 0.5, value=0.1, step=0.01, label="Umbral de voicing (pyin)")
             with gr.Accordion("Cuantización y salida", open=True):
                 do_quant = gr.Checkbox(value=True, label="Cuantizar a rejilla")
                 bpm = gr.Slider(40, 220, value=100, step=1, label="BPM")
+                division = gr.Dropdown([2, 4, 8], value=4, label="División por negra", info="2=Corchea, 4=Semicorchea, 8=Fusa")
                 velocity = gr.Slider(1, 127, value=90, step=1, label="Velocidad (1-127)")
                 program = gr.Slider(0, 127, value=0, step=1, label="Programa/MIDI Instrument (0=Piano)")
             """)
     def _convert(audio_path, fmin_note, fmax_note, hop_length, frame_length, voice_thres, min_ms, gap_join_ms, do_quantize, bpm_val, division_val, velocity_val, program_val):
         midi_path, summary = audio_to_midi(
             audio=audio_path,
             fmin_note=fmin_note,