| r'''#=================================================================================================================== |
| # |
# MIDI to Colab Audio Python Module
| # |
| # Converts any MIDI file to raw audio which is compatible |
# with Google Colab or Hugging Face Gradio
| # |
| # Version 2.0 |
| # |
| # Includes full source code of MIDI and pyfluidsynth |
| # |
| # Original source code for all modules was retrieved on 07/31/2025 |
| # |
| # Project Los Angeles |
| # Tegridy Code 2025 |
| # |
| #=================================================================================================================== |
| # |
| # Critical dependencies |
| # |
| # pip install numpy |
| # sudo apt install fluidsynth |
| # |
| #=================================================================================================================== |
| # |
| # Example usage: |
| # |
| # from midi_to_colab_audio import midi_to_colab_audio |
| # from IPython.display import display, Audio |
| # |
| # raw_audio = midi_to_colab_audio('/content/input.mid') |
| # |
| # display(Audio(raw_audio, rate=16000, normalize=False)) |
| # |
| #=================================================================================================================== |
| ''' |
|
|
| import fluidsynth |
| from src import MIDI |
|
|
| |
|
|
| import numpy as np |
| import wave |
|
|
| |
|
|
def normalize_audio(audio: np.ndarray,
                    method: str = 'peak',
                    target_level_db: float = -1.0,
                    per_channel: bool = False,
                    eps: float = 1e-9
                    ) -> np.ndarray:

    """
    Normalize audio to a target dBFS level.

    Parameters
    ----------
    audio : np.ndarray
        Float-valued array in range [-1, 1] with shape (channels, samples)
        or (samples,) for mono.
    method : {'peak', 'rms'}
        - 'peak': scale so that max(|audio|) reaches the target level
        - 'rms' : scale so that RMS(audio) reaches the target level
    target_level_db : float
        Desired output level, in dBFS (0 dBFS = max digital full scale).
        e.g. -1.0 dBFS means ~0.8913 linear gain.
    per_channel : bool
        If True, normalize each channel independently. Otherwise, use a
        global measure across all channels.
    eps : float
        Small constant to avoid division by zero on silent input.

    Returns
    -------
    normalized : np.ndarray
        float32 audio of the SAME shape as the input, scaled to the
        target level and clipped to [-1, 1].

    Raises
    ------
    ValueError
        If `method` is not 'peak' or 'rms'.
    """

    target_lin = 10 ** (target_level_db / 20.0)

    audio = np.asarray(audio, dtype=np.float32)

    # Promote mono (n,) to (1, n) for uniform processing, but remember it
    # so the documented "same shape" contract holds on return (the
    # previous version returned a (1, n) array for mono input).
    was_mono = (audio.ndim == 1)
    if was_mono:
        audio = audio[np.newaxis, :]

    # Guard: np.max/np.mean on an empty array would raise.
    if audio.size == 0:
        return audio[0] if was_mono else audio

    axis = 1 if per_channel else None

    if method == 'peak':
        level = np.max(np.abs(audio), axis=axis, keepdims=True)
    elif method == 'rms':
        level = np.sqrt(np.mean(audio ** 2, axis=axis, keepdims=True))
    else:
        raise ValueError(f"Unsupported method '{method}'; choose 'peak' or 'rms'.")

    # Single shared scaling step for both methods; eps prevents /0.
    scales = target_lin / np.maximum(level, eps)

    normalized = np.clip(audio * scales, -1.0, 1.0)

    return normalized[0] if was_mono else normalized
|
|
| |
|
|
def midi_opus_to_colab_audio(midi_opus,
                             soundfont_path='/usr/share/sounds/sf2/FluidR3_GM.sf2',
                             sample_rate=16000,
                             volume_level_db=-1,
                             trim_silence=True,
                             silence_threshold=0.1,
                             output_for_gradio=False,
                             write_audio_to_WAV=''
                             ):
    """
    Render a parsed MIDI "opus" (MIDI.midi2opus output) to raw audio.

    Parameters
    ----------
    midi_opus : list
        [ticks_per_beat, track1, track2, ...]; each track is a list of
        [event_name, delta_ticks, *event_data] entries.
    soundfont_path : str
        SF2 soundfont file loaded into FluidSynth.
    sample_rate : int
        Output sample rate in Hz.
    volume_level_db : float
        Target peak level in dBFS for the returned float audio.
    trim_silence : bool
        If True, trim trailing silence; the threshold is
        std(|samples|) * silence_threshold.
    silence_threshold : float
        Multiplier applied to the standard deviation when trimming.
    output_for_gradio : bool
        If True, return the int16 (n, 2) array directly (Gradio format).
    write_audio_to_WAV : str
        When non-empty, the path of a WAV file to write.  (The previous
        version built the name from an undefined `midi_file` variable and
        always raised NameError when this option was used.)

    Returns
    -------
    np.ndarray | None
        Float32 audio of shape (2, n) in [-1, 1], or the int16 Gradio
        array, or None when the opus contains no tracks.
    """

    if midi_opus[1]:

        ticks_per_beat, *tracks = midi_opus
        if not tracks:
            return None

        # Flatten all tracks into one chronological list of events with
        # absolute tick times (opus events carry delta times).
        events = []
        for track in tracks:
            abs_t = 0
            for name, dt, *data in track:
                abs_t += dt
                events.append([name, abs_t, *data])
        events.sort(key=lambda e: e[1])

        fl = fluidsynth.Synth(samplerate=float(sample_rate))
        sfid = fl.sfload(soundfont_path)
        for chan in range(16):
            # Channel 9 is the GM percussion channel -> bank 128.
            fl.program_select(chan, sfid, 128 if chan == 9 else 0, 0)

        tempo = int((60 / 120) * 1e6)  # default 120 BPM, in µs per quarter note
        last_t = 0

        # Collect rendered buffers and concatenate once at the end; the
        # previous version ran np.concatenate inside the loop (O(n^2)).
        audio_chunks = []

        for name, cur_t, *data in events:

            # Render the audio elapsed since the previous event.
            delta_ticks = cur_t - last_t
            last_t = cur_t
            dt_seconds = (delta_ticks / ticks_per_beat) * (tempo / 1e6)
            sample_len = int(dt_seconds * sample_rate)
            if sample_len > 0:
                audio_chunks.append(fl.get_samples(sample_len).reshape(-1, 2))

            # Apply the event to the synthesizer state.
            if name == "note_on" and data[2] > 0:
                chan, note, vel = data
                fl.noteon(chan, note, vel)

            elif name == "note_off" or (name == "note_on" and data[2] == 0):
                chan, note = data[:2]
                fl.noteoff(chan, note)

            elif name == "patch_change":
                chan, patch = data[:2]
                bank = 128 if chan == 9 else 0
                fl.program_select(chan, sfid, bank, patch)

            elif name == "control_change":
                chan, ctrl, val = data[:3]
                fl.cc(chan, ctrl, val)

            elif name == "key_after_touch":
                chan, note, vel = data
                fl.key_pressure(chan, note, vel)

            elif name == "channel_after_touch":
                chan, vel = data
                fl.channel_pressure(chan, vel)

            elif name == "pitch_wheel_change":
                chan, wheel = data
                fl.pitch_bend(chan, wheel)

            elif name in ("song_position", "song_select", "tune_request"):
                # System-common messages: nothing to synthesize.
                pass

            elif name in ("sysex_f0", "sysex_f7"):
                fl.sysex(data[0])

            elif name == "set_tempo":
                # Meta event that changes the tick -> seconds conversion.
                tempo = data[0]

            else:
                # All remaining meta events (track/instrument names,
                # lyrics, markers, time/key signatures, sequencer data,
                # end_track, ...) do not affect the audio.
                continue

        # Render a short tail so notes still sounding at the final event
        # are not cut off abruptly (matches midi_to_colab_audio).
        audio_chunks.append(fl.get_samples(int(sample_rate * 2)).reshape(-1, 2))

        fl.delete()

        ss = np.concatenate(audio_chunks, axis=0)

        # Peak-normalize the int16 render to full scale.
        if ss.size:
            maxv = np.abs(ss).max()
            if maxv:
                ss = (ss / maxv) * np.iinfo(np.int16).max
            ss = ss.astype(np.int16)

        # Trim trailing silence using a statistics-based threshold.
        if trim_silence and ss.size:
            thresh = np.std(np.abs(ss)) * silence_threshold
            idx = np.where(np.abs(ss) > thresh)[0]
            if idx.size:
                ss = ss[: idx[-1] + 1]

        if output_for_gradio:
            return ss

        ss = ss.T  # -> (channels, samples) for normalize_audio
        raw_audio = normalize_audio(ss, target_level_db=volume_level_db)

        if write_audio_to_WAV:
            # Fix: use the caller-supplied path instead of the undefined
            # `midi_file` name the original referenced here.
            wav_name = write_audio_to_WAV
            peak = np.max(np.abs(raw_audio))
            # Guard against division by zero on an all-silent render
            # (same guard the sibling midi_to_colab_audio uses).
            if peak > 0:
                pcm = np.int16(raw_audio.T / peak * 32767)
            else:
                pcm = np.int16(raw_audio.T * 32767)
            with wave.open(wav_name, 'wb') as wf:
                wf.setframerate(sample_rate)
                wf.setsampwidth(2)
                wf.setnchannels(pcm.shape[1])
                wf.writeframes(pcm.tobytes())

        return raw_audio

    else:
        return None
|
|
| |
|
|
def midi_to_colab_audio(midi_file,
                        soundfont_path='/usr/share/sounds/sf2/FluidR3_GM.sf2',
                        sample_rate=16000,
                        volume_level_db=-1,
                        trim_silence=True,
                        silence_threshold=0.1,
                        output_for_gradio=False,
                        write_audio_to_WAV=False
                        ):
    """
    Render a MIDI file to raw audio for IPython.display.Audio.

    Parameters
    ----------
    midi_file : str | bytes
        Path to a MIDI file, or the raw MIDI file content as bytes.
    soundfont_path : str
        SF2 soundfont loaded into FluidSynth.
    sample_rate : int
        Output sample rate in Hz.
    volume_level_db : float
        Target peak level (dBFS) passed to normalize_audio().
    trim_silence : bool
        If True, strip leading AND trailing silence using a fixed
        threshold of 0.5% of int16 full scale.
    silence_threshold : float
        NOTE(review): unused by this function — the trim step below uses
        the fixed 0.5% threshold instead.  Kept for signature parity with
        midi_opus_to_colab_audio; confirm whether it should be honored.
    output_for_gradio : bool
        If True, return the int16 (n, 2) array directly (Gradio format),
        skipping float normalization.
    write_audio_to_WAV : bool
        If True and `midi_file` is a path string, also write a .wav file
        next to the input (same base name, .wav extension).

    Returns
    -------
    np.ndarray | None
        Float32 audio of shape (2, n) in [-1, 1] (or the int16 Gradio
        array), or None when the file cannot be read or has no tracks.

    Example usage:

        from IPython.display import Audio

        display(Audio(raw_audio, rate=16000, normalize=False))
    """

    # --- Load the raw MIDI bytes from a path or pass bytes through ----------
    if isinstance(midi_file, str):
        try:
            with open(midi_file, 'rb') as f:
                midi_bytes = f.read()
        except FileNotFoundError:
            print(f"Error: Could not find or open the file at {midi_file}")
            return None
    elif isinstance(midi_file, bytes):
        midi_bytes = midi_file
    else:
        raise TypeError("midi_input must be a file path (str) or file content (bytes)")

    # --- Parse into opus form: [ticks_per_beat, track1, track2, ...] --------
    ticks_per_beat, *tracks = MIDI.midi2opus(midi_bytes)
    if not tracks:
        return None

    # Flatten all tracks into a single chronological event list with
    # absolute tick times (opus events carry delta times).
    events = []
    for track in tracks:
        abs_t = 0
        for name, dt, *data in track:
            abs_t += dt
            events.append([name, abs_t, *data])
    events.sort(key=lambda e: e[1])

    # --- Set up the synthesizer ----------------------------------------------
    fl = fluidsynth.Synth(samplerate=float(sample_rate))
    sfid = fl.sfload(soundfont_path)
    for chan in range(16):
        # Channel 9 is the GM percussion channel -> bank 128.
        fl.program_select(chan, sfid, 128 if chan == 9 else 0, 0)

    # Default tempo: 120 BPM expressed in microseconds per quarter note.
    tempo = int((60 / 120) * 1e6)
    last_t = 0

    # Rendered stereo buffers; concatenated once after the loop.
    audio_chunks = []

    # --- Event loop: render elapsed audio, then apply each event -------------
    for name, cur_t, *data in events:

        # Convert the tick gap since the previous event into samples,
        # using the tempo in effect *before* this event.
        delta_ticks = cur_t - last_t
        last_t = cur_t
        dt_seconds = (delta_ticks / ticks_per_beat) * (tempo / 1e6)
        sample_len = int(dt_seconds * sample_rate)

        if sample_len > 0:
            # get_samples returns interleaved stereo; reshape to (n, 2).
            buf = fl.get_samples(sample_len).reshape(-1, 2)

            audio_chunks.append(buf)

        # Channel voice messages are forwarded to FluidSynth.
        if name == "note_on" and data[2] > 0:
            chan, note, vel = data
            fl.noteon(chan, note, vel)

        # A note_on with velocity 0 is equivalent to note_off.
        elif name == "note_off" or (name == "note_on" and data[2] == 0):
            chan, note = data[:2]
            fl.noteoff(chan, note)

        elif name == "patch_change":
            chan, patch = data[:2]
            bank = 128 if chan == 9 else 0
            fl.program_select(chan, sfid, bank, patch)

        elif name == "control_change":
            chan, ctrl, val = data[:3]
            fl.cc(chan, ctrl, val)

        elif name == "key_after_touch":
            chan, note, vel = data
            fl.key_pressure(chan, note, vel)

        elif name == "channel_after_touch":
            chan, vel = data
            fl.channel_pressure(chan, vel)

        elif name == "pitch_wheel_change":
            chan, wheel = data
            fl.pitch_bend(chan, wheel)

        # System-common messages: nothing to synthesize.
        elif name == "song_position":
            pass

        elif name == "song_select":
            pass

        elif name == "tune_request":
            pass

        # Raw sysex payload is handed to the synth unmodified.
        elif name in ("sysex_f0", "sysex_f7"):
            raw_bytes = data[0]
            fl.sysex(raw_bytes)

        # Meta events: only set_tempo affects the audio (it changes the
        # tick -> seconds conversion used above); the rest are skipped.
        elif name in (
            "set_tempo",
            "end_track",
            "text_event", "text_event_08", "text_event_09", "text_event_0a",
            "text_event_0b", "text_event_0c", "text_event_0d", "text_event_0e", "text_event_0f",
            "copyright_text_event", "track_name", "instrument_name",
            "lyric", "marker", "cue_point",
            "smpte_offset", "time_signature", "key_signature",
            "sequencer_specific", "raw_meta_event"
        ):
            if name == "set_tempo":
                tempo = data[0]

            continue

        # Unknown event names are ignored.
        else:
            continue

    # Render a short tail so notes sounding at the final event decay
    # instead of being cut off.
    tail_len_seconds = 2
    tail_buf = fl.get_samples(int(sample_rate * tail_len_seconds)).reshape(-1, 2)
    audio_chunks.append(tail_buf)

    fl.delete()

    if not audio_chunks:
        return None
    ss = np.concatenate(audio_chunks, axis=0)

    # --- Trim leading/trailing silence with a fixed 0.5%-of-full-scale
    # threshold (note: `silence_threshold` is not used here).
    if trim_silence and ss.size:
        dtype_max = np.iinfo(ss.dtype).max
        fixed_threshold = int(dtype_max * 0.005)

        indices = np.where(np.abs(ss) > fixed_threshold)[0]
        if indices.size > 0:
            first_idx = indices[0]
            last_idx = indices[-1]
            ss = ss[first_idx : last_idx + 1]
        else:
            # Entirely silent: keep an empty (0, 2) array.
            ss = np.empty((0, 2), dtype=ss.dtype)

    # --- Peak-normalize the int16 render to full scale ------------------------
    if ss.size:
        maxv = np.abs(ss).max()
        if maxv:
            ss = (ss / maxv) * np.iinfo(np.int16).max
        ss = ss.astype(np.int16)

    # Gradio consumes (n, 2) int16 directly.
    if output_for_gradio:
        return ss

    # normalize_audio expects (channels, samples).
    ss = ss.T
    raw_audio = normalize_audio(ss, target_level_db=volume_level_db)

    # --- Optionally also write a WAV next to the input file -------------------
    if write_audio_to_WAV and isinstance(midi_file, str):
        wav_name = midi_file.rsplit('.', 1)[0] + '.wav'

        # Guard against division by zero on an all-silent render.
        if np.max(np.abs(raw_audio)) > 0:
            pcm = np.int16(raw_audio.T / np.max(np.abs(raw_audio)) * 32767)
        else:
            pcm = np.int16(raw_audio.T * 32767)

        with wave.open(wav_name, 'wb') as wf:
            wf.setframerate(sample_rate)
            wf.setsampwidth(2)  # 16-bit PCM
            wf.setnchannels(pcm.shape[1])
            wf.writeframes(pcm.tobytes())

    return raw_audio
|
|
| |