mrblackdev commited on
Commit
0aba475
·
verified ·
1 Parent(s): c431cef

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +265 -0
app.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import numpy as np
4
+ import librosa
5
+ import pretty_midi
6
+ import gradio as gr
7
+
8
+ # =====================
9
+ # Utilidades
10
+ # =====================
11
+ A440 = 440.0
12
+
13
+
14
def hz_to_midi(f):
    """Convert a frequency in Hz to a (fractional) MIDI note number.

    Returns None for unvoiced / invalid input: None, NaN, or a
    non-positive frequency.
    """
    unvoiced = f is None or np.isnan(f) or f <= 0
    if unvoiced:
        return None
    semitones_from_a4 = 12 * np.log2(f / A440)
    return 69 + semitones_from_a4
18
+
19
+
20
def midi_to_hz(m):
    """Inverse of hz_to_midi: map a MIDI note number to frequency in Hz."""
    octaves_from_a4 = (m - 69) / 12
    return A440 * 2 ** octaves_from_a4
22
+
23
+
24
def round_to_grid(seconds, bpm, division=4):
    """Quantize a time in seconds to the rhythmic grid.

    `division` is the number of grid steps per quarter note (e.g. 4 =
    sixteenth-note grid). A non-positive BPM disables quantization and
    returns the input unchanged.
    """
    if bpm <= 0:
        return seconds
    step = (60.0 / bpm) / division
    return np.round(seconds / step) * step
32
+
33
+
34
def group_notes(f0, sr, hop_length,
                min_note_ms=80,
                merge_gap_ms=60,
                midi_smoothing_window=3):
    """
    Group consecutive frames with the same rounded MIDI number into notes.

    Parameters
    ----------
    f0 : sequence of float
        Frame-wise fundamental frequencies in Hz; NaN marks unvoiced frames.
    sr : int
        Sample rate of the analysed audio.
    hop_length : int
        Hop between consecutive analysis frames, in samples.
    min_note_ms : float
        Notes with fewer voiced frames than this duration are discarded.
    merge_gap_ms : float
        Unvoiced gaps up to this length inside a note are bridged.
    midi_smoothing_window : int
        Width (in frames) of the median filter applied to the MIDI curve
        to suppress single-frame pitch glitches; values <= 1 disable it.

    Returns
    -------
    list of (midi_note, t_start, t_end) tuples, times in seconds.
    """
    f0 = np.asarray(f0, dtype=float)
    times = np.arange(len(f0)) * hop_length / sr

    # Vectorized Hz -> fractional MIDI (A4 = 440 Hz reference, note 69);
    # unvoiced / invalid frames stay NaN.
    midi_vals = np.full(len(f0), np.nan)
    voiced = np.isfinite(f0) & (f0 > 0)
    midi_vals[voiced] = 69 + 12 * np.log2(f0[voiced] / 440.0)

    # Median smoothing to reduce spurious semitone jumps.
    if midi_smoothing_window and midi_smoothing_window > 1:
        from scipy.ndimage import median_filter
        midi_vals = median_filter(midi_vals, size=midi_smoothing_window)

    # Round to the nearest semitone, preserving the unvoiced mask.
    midi_round = np.round(midi_vals)
    midi_round[np.isnan(midi_vals)] = np.nan

    frame_s = hop_length / sr
    frame_ms = 1000.0 * frame_s
    min_frames = max(1, int(np.ceil(min_note_ms / frame_ms)))
    merge_gap_frames = int(np.ceil(merge_gap_ms / frame_ms))

    notes = []
    n = len(midi_round)
    i = 0
    while i < n:
        if np.isnan(midi_round[i]):
            i += 1
            continue
        note_val = int(midi_round[i])
        start = i
        last_voiced = i  # last frame that actually sounded this note
        gap = 0
        j = i + 1
        # Extend while the same note continues, bridging short NaN gaps.
        while j < n:
            if np.isnan(midi_round[j]):
                gap += 1
                if gap > merge_gap_frames:
                    break
                j += 1
                continue
            if int(midi_round[j]) != note_val:
                break
            gap = 0
            last_voiced = j
            j += 1
        # Duration and end time count up to the last *voiced* frame, so
        # trailing unvoiced gap frames neither lengthen nor validate a note
        # (the original extended t_end into the silence after the note).
        if (last_voiced - start + 1) >= min_frames:
            notes.append((note_val, times[start], times[last_voiced] + frame_s))
        # BUGFIX: resume scanning AT j (was j + 1). When the inner loop
        # stopped because the pitch changed, frame j is the onset of the
        # next note and the old code silently dropped it.
        i = j
    return notes
93
+
94
+
95
def audio_to_midi(
    audio,
    fmin_note='C2',
    fmax_note='C7',
    hop_length=256,
    frame_length=2048,
    voicing_thres=0.1,
    min_note_ms=80,
    merge_gap_ms=60,
    bpm=100,
    quantize=True,
    division=4,
    velocity=80,
    program=0,
):
    """
    Convert monophonic audio to a temporary MIDI file.

    Parameters
    ----------
    audio : str or (int, np.ndarray)
        Path to an audio file, or a gradio-microphone (sr, samples) tuple.
    fmin_note, fmax_note : str
        Note names bounding the pitch search range for pYIN.
    hop_length, frame_length : int
        pYIN analysis parameters, in samples.
    voicing_thres : float
        Frames whose pYIN voiced-probability falls below this are treated
        as unvoiced.
    min_note_ms, merge_gap_ms : float
        Note-segmentation parameters (see group_notes).
    bpm, quantize, division : tempo-grid quantization of note boundaries.
    velocity, program : int
        MIDI note velocity (clamped to 1-127) and instrument program.

    Returns
    -------
    (midi_path, summary) : path of the written .mid file and a dict of
    basic metrics about the conversion.
    """
    # Load audio: gradio microphone gives (sr, data); file input gives a path.
    if isinstance(audio, tuple):
        sr, y = audio
        y = np.asarray(y, dtype=np.float32)
        # BUGFIX: microphone capture may be stereo (n_samples, channels);
        # pyin requires mono, so downmix by averaging channels.
        if y.ndim > 1:
            y = y.mean(axis=1)
    else:
        y, sr = librosa.load(audio, sr=None, mono=True)

    # Peak-normalize so detection is level-independent (guard empty input).
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y = y / peak

    # pYIN f0 tracking over the requested note range.
    fmin_hz = librosa.note_to_hz(fmin_note)
    fmax_hz = librosa.note_to_hz(fmax_note)

    # BUGFIX: librosa.pyin has no `trough_threshold` keyword (that parameter
    # belongs to librosa.yin); passing it raised TypeError. The voicing
    # threshold is applied to pyin's voiced-probability track instead.
    f0, voiced_flag, voiced_prob = librosa.pyin(
        y,
        fmin=fmin_hz,
        fmax=fmax_hz,
        frame_length=frame_length,
        hop_length=hop_length,
        center=True,
        sr=sr,
    )

    # Mask unvoiced and low-confidence frames.
    f0[~voiced_flag] = np.nan
    f0[voiced_prob < voicing_thres] = np.nan

    # Segment frames into (midi, t0, t1) notes.
    notes = group_notes(
        f0=f0,
        sr=sr,
        hop_length=hop_length,
        min_note_ms=min_note_ms,
        merge_gap_ms=merge_gap_ms,
        midi_smoothing_window=3,
    )

    # Optional quantization of note boundaries to the tempo grid.
    if quantize and bpm > 0:
        q_notes = []
        for m, t0, t1 in notes:
            qt0 = float(round_to_grid(t0, bpm, division))
            qt1 = float(round_to_grid(t1, bpm, division))
            if qt1 <= qt0:
                # Never collapse a note: keep at least one grid step.
                qt1 = qt0 + (60.0 / bpm) / division
            q_notes.append((m, qt0, qt1))
        notes = q_notes

    # Build the MIDI object (program 0 = Acoustic Grand Piano).
    pm = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(program=program)
    v = int(np.clip(velocity, 1, 127))
    for m, t0, t1 in notes:
        instrument.notes.append(
            pretty_midi.Note(velocity=v, pitch=int(m), start=float(t0), end=float(t1))
        )
    pm.instruments.append(instrument)

    # Write to a fresh temporary directory.
    tmpdir = tempfile.mkdtemp()
    midi_path = os.path.join(tmpdir, "output.mid")
    pm.write(midi_path)

    # Conversion metrics for the UI.
    dur = len(y) / sr
    summary = {
        "duracion_audio_s": round(dur, 3),
        "notas_detectadas": len(notes),
        "rango_midi_min": int(np.min([n[0] for n in notes])) if notes else None,
        "rango_midi_max": int(np.max([n[0] for n in notes])) if notes else None,
        "bpm": bpm,
        "division": division,
    }

    return midi_path, summary
190
+
191
+
192
# =====================
# Gradio interface
# =====================
CSS = """
#app_title {font-size: 28px; font-weight: 800}
#app_subtitle {opacity: .8}
"""

with gr.Blocks(css=CSS, fill_height=True) as demo:
    gr.Markdown("""
    <div id='app_title'>🎤 Audio → 🎹 MIDI (Pitch‑to‑MIDI)</div>
    <div id='app_subtitle'>Sube o graba tu voz, detecta notas y exporta un archivo MIDI listo para tu DAW.</div>
    """)

    with gr.Row():
        with gr.Column(scale=2):
            audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio de entrada (voz, monofónica)")
            with gr.Accordion("Opciones de detección", open=False):
                fmin = gr.Dropdown(["C1","C2","C3","C4","C5"], value="C2", label="Nota mínima")
                fmax = gr.Dropdown(["C4","C5","C6","C7"], value="C7", label="Nota máxima")
                hop = gr.Slider(128, 1024, value=256, step=64, label="Hop length (muestras)")
                frame = gr.Slider(1024, 4096, value=2048, step=256, label="Frame length (muestras)")
                voice_th = gr.Slider(0.01, 0.5, value=0.1, step=0.01, label="Umbral de voicing (pyin)")
                min_ms = gr.Slider(10, 200, value=80, step=5, label="Duración mínima de nota (ms)")
                gap_ms = gr.Slider(0, 200, value=60, step=5, label="Unir huecos ≤ (ms)")

            with gr.Accordion("Cuantización y salida", open=True):
                do_quant = gr.Checkbox(value=True, label="Cuantizar a rejilla")
                bpm = gr.Slider(40, 220, value=100, step=1, label="BPM")
                # BUGFIX: gradio Dropdown choice tuples are (label, value);
                # the original (2, "Corchea") order made the display names the
                # values, so value=4 matched no choice and the handler received
                # a string it could not convert with int().
                division = gr.Dropdown([("Corchea", 2), ("Semicorchea", 4), ("Fusa", 8)], value=4, label="División por negra", info="Más alto = rejilla más fina")
                velocity = gr.Slider(1, 127, value=90, step=1, label="Velocidad (1-127)")
                program = gr.Slider(0, 127, value=0, step=1, label="Programa/MIDI Instrument (0=Piano)")

            run_btn = gr.Button("🔄 Convertir a MIDI", variant="primary")

        with gr.Column(scale=1):
            midi_out = gr.File(label="Archivo MIDI generado")
            summary_out = gr.JSON(label="Resumen")
            gr.Markdown("""
            **Tips**
            - Canta una melodía monofónica, sin armonías.
            - Ajusta el rango de notas (C2–C7) si cantas muy grave o agudo.
            - Usa la cuantización para encajar a tempo; si quieres naturalidad, desactívala.
            """)

    def _convert(audio_path, fmin_note, fmax_note, hop_length, frame_length, voice_thres, min_ms, gap_join_ms, do_quantize, bpm_val, division_val, velocity_val, program_val):
        """Gradio callback: normalize widget values and run the conversion."""
        # Depending on the gradio version the dropdown may deliver the whole
        # (label, value) tuple instead of just the value; unwrap the value
        # (last element, matching the fixed (label, value) choice order).
        if isinstance(division_val, tuple):
            division_val = division_val[-1]
        midi_path, summary = audio_to_midi(
            audio=audio_path,
            fmin_note=fmin_note,
            fmax_note=fmax_note,
            hop_length=int(hop_length),
            frame_length=int(frame_length),
            voicing_thres=float(voice_thres),
            min_note_ms=int(min_ms),
            merge_gap_ms=int(gap_join_ms),
            bpm=float(bpm_val),
            quantize=bool(do_quantize),
            division=int(division_val),
            velocity=int(velocity_val),
            program=int(program_val),
        )
        return midi_path, summary

    run_btn.click(
        _convert,
        inputs=[audio_in, fmin, fmax, hop, frame, voice_th, min_ms, gap_ms, do_quant, bpm, division, velocity, program],
        outputs=[midi_out, summary_out]
    )

if __name__ == "__main__":
    demo.launch()