| r'''#=================================================================================================================== |
| # |
# MIDI to Colab Audio Python Module
| # |
| # Converts any MIDI file to raw audio which is compatible |
# with Google Colab or Hugging Face Gradio
| # |
| # Version 2.0 |
| # |
| # Includes full source code of MIDI and pyfluidsynth |
| # |
| # Original source code for all modules was retrieved on 07/31/2025 |
| # |
| # Project Los Angeles |
| # Tegridy Code 2025 |
| # |
| #=================================================================================================================== |
| # |
| # Critical dependencies |
| # |
| # pip install numpy |
| # sudo apt install fluidsynth |
| # |
| #=================================================================================================================== |
| # |
| # Example usage: |
| # |
| # from midi_to_colab_audio import midi_to_colab_audio |
| # from IPython.display import display, Audio |
| # |
| # raw_audio = midi_to_colab_audio('/content/input.mid') |
| # |
| # display(Audio(raw_audio, rate=16000, normalize=False)) |
| # |
| #=================================================================================================================== |
| ''' |
|
|
| import fluidsynth |
| from src import MIDI |
|
|
| |
|
|
| import numpy as np |
| import wave |
|
|
| |
|
|
def normalize_audio(audio: np.ndarray,
                    method: str = 'peak',
                    target_level_db: float = -1.0,
                    per_channel: bool = False,
                    eps: float = 1e-9
                    ) -> np.ndarray:

    """
    Normalize audio to a target dBFS level.

    Parameters
    ----------
    audio : np.ndarray
        Float-valued array in range [-1, 1] with shape (channels, samples)
        or (samples,) for mono.
    method : {'peak', 'rms'}
        - 'peak': scale so that max(|audio|) reaches the target level
        - 'rms' : scale so that RMS(audio) reaches the target level
    target_level_db : float
        Desired output level, in dBFS (0 dBFS = max digital full scale).
        e.g. -1.0 dBFS means ~0.8913 linear gain.
    per_channel : bool
        If True, normalize each channel independently. Otherwise, use a
        global measure across all channels.
    eps : float
        Small constant to avoid division by zero on silent input.

    Returns
    -------
    normalized : np.ndarray
        float32 audio of the SAME shape as the input, scaled to the
        target level and clipped to [-1, 1].

    Raises
    ------
    ValueError
        If `method` is not 'peak' or 'rms'.
    """

    target_lin = 10 ** (target_level_db / 20.0)

    audio = np.asarray(audio, dtype=np.float32)

    # Promote mono (n,) to (1, n) for uniform processing, but remember it
    # so the documented "same shape" contract holds on return (the
    # previous version returned a (1, n) array for mono input).
    was_mono = (audio.ndim == 1)
    if was_mono:
        audio = audio[np.newaxis, :]

    # Guard: np.max/np.mean on an empty array would raise.
    if audio.size == 0:
        return audio[0] if was_mono else audio

    axis = 1 if per_channel else None

    if method == 'peak':
        level = np.max(np.abs(audio), axis=axis, keepdims=True)
    elif method == 'rms':
        level = np.sqrt(np.mean(audio ** 2, axis=axis, keepdims=True))
    else:
        raise ValueError(f"Unsupported method '{method}'; choose 'peak' or 'rms'.")

    # Single shared scaling step for both methods; eps prevents /0.
    scales = target_lin / np.maximum(level, eps)

    normalized = np.clip(audio * scales, -1.0, 1.0)

    return normalized[0] if was_mono else normalized
|
|
| |
|
|
def midi_opus_to_colab_audio(midi_opus,
                             soundfont_path='/usr/share/sounds/sf2/FluidR3_GM.sf2',
                             sample_rate=16000,
                             volume_level_db=-1,
                             trim_silence=True,
                             silence_threshold=0.1,
                             output_for_gradio=False,
                             write_audio_to_WAV=''
                             ):
    """
    Render a parsed MIDI "opus" (MIDI.midi2opus output) to raw audio.

    Parameters
    ----------
    midi_opus : list
        [ticks_per_beat, track1, track2, ...]; each track is a list of
        [event_name, delta_ticks, *event_data] entries.
    soundfont_path : str
        SF2 soundfont file loaded into FluidSynth.
    sample_rate : int
        Output sample rate in Hz.
    volume_level_db : float
        Target peak level in dBFS for the returned float audio.
    trim_silence : bool
        If True, trim trailing silence; the threshold is
        std(|samples|) * silence_threshold.
    silence_threshold : float
        Multiplier applied to the standard deviation when trimming.
    output_for_gradio : bool
        If True, return the int16 (n, 2) array directly (Gradio format).
    write_audio_to_WAV : str
        When non-empty, the path of a WAV file to write.  (The previous
        version built the name from an undefined `midi_file` variable and
        always raised NameError when this option was used.)

    Returns
    -------
    np.ndarray | None
        Float32 audio of shape (2, n) in [-1, 1], or the int16 Gradio
        array, or None when the opus contains no tracks.
    """

    if midi_opus[1]:

        ticks_per_beat, *tracks = midi_opus
        if not tracks:
            return None

        # Flatten all tracks into one chronological list of events with
        # absolute tick times (opus events carry delta times).
        events = []
        for track in tracks:
            abs_t = 0
            for name, dt, *data in track:
                abs_t += dt
                events.append([name, abs_t, *data])
        events.sort(key=lambda e: e[1])

        fl = fluidsynth.Synth(samplerate=float(sample_rate))
        sfid = fl.sfload(soundfont_path)
        for chan in range(16):
            # Channel 9 is the GM percussion channel -> bank 128.
            fl.program_select(chan, sfid, 128 if chan == 9 else 0, 0)

        tempo = int((60 / 120) * 1e6)  # default 120 BPM, in µs per quarter note
        last_t = 0

        # Collect rendered buffers and concatenate once at the end; the
        # previous version ran np.concatenate inside the loop (O(n^2)).
        audio_chunks = []

        for name, cur_t, *data in events:

            # Render the audio elapsed since the previous event.
            delta_ticks = cur_t - last_t
            last_t = cur_t
            dt_seconds = (delta_ticks / ticks_per_beat) * (tempo / 1e6)
            sample_len = int(dt_seconds * sample_rate)
            if sample_len > 0:
                audio_chunks.append(fl.get_samples(sample_len).reshape(-1, 2))

            # Apply the event to the synthesizer state.
            if name == "note_on" and data[2] > 0:
                chan, note, vel = data
                fl.noteon(chan, note, vel)

            elif name == "note_off" or (name == "note_on" and data[2] == 0):
                chan, note = data[:2]
                fl.noteoff(chan, note)

            elif name == "patch_change":
                chan, patch = data[:2]
                bank = 128 if chan == 9 else 0
                fl.program_select(chan, sfid, bank, patch)

            elif name == "control_change":
                chan, ctrl, val = data[:3]
                fl.cc(chan, ctrl, val)

            elif name == "key_after_touch":
                chan, note, vel = data
                fl.key_pressure(chan, note, vel)

            elif name == "channel_after_touch":
                chan, vel = data
                fl.channel_pressure(chan, vel)

            elif name == "pitch_wheel_change":
                chan, wheel = data
                fl.pitch_bend(chan, wheel)

            elif name in ("song_position", "song_select", "tune_request"):
                # System-common messages: nothing to synthesize.
                pass

            elif name in ("sysex_f0", "sysex_f7"):
                fl.sysex(data[0])

            elif name == "set_tempo":
                # Meta event that changes the tick -> seconds conversion.
                tempo = data[0]

            else:
                # All remaining meta events (track/instrument names,
                # lyrics, markers, time/key signatures, sequencer data,
                # end_track, ...) do not affect the audio.
                continue

        # Render a short tail so notes still sounding at the final event
        # are not cut off abruptly (matches midi_to_colab_audio).
        audio_chunks.append(fl.get_samples(int(sample_rate * 2)).reshape(-1, 2))

        fl.delete()

        ss = np.concatenate(audio_chunks, axis=0)

        # Peak-normalize the int16 render to full scale.
        if ss.size:
            maxv = np.abs(ss).max()
            if maxv:
                ss = (ss / maxv) * np.iinfo(np.int16).max
            ss = ss.astype(np.int16)

        # Trim trailing silence using a statistics-based threshold.
        if trim_silence and ss.size:
            thresh = np.std(np.abs(ss)) * silence_threshold
            idx = np.where(np.abs(ss) > thresh)[0]
            if idx.size:
                ss = ss[: idx[-1] + 1]

        if output_for_gradio:
            return ss

        ss = ss.T  # -> (channels, samples) for normalize_audio
        raw_audio = normalize_audio(ss, target_level_db=volume_level_db)

        if write_audio_to_WAV:
            # Fix: use the caller-supplied path instead of the undefined
            # `midi_file` name the original referenced here.
            wav_name = write_audio_to_WAV
            peak = np.max(np.abs(raw_audio))
            # Guard against division by zero on an all-silent render
            # (same guard the sibling midi_to_colab_audio uses).
            if peak > 0:
                pcm = np.int16(raw_audio.T / peak * 32767)
            else:
                pcm = np.int16(raw_audio.T * 32767)
            with wave.open(wav_name, 'wb') as wf:
                wf.setframerate(sample_rate)
                wf.setsampwidth(2)
                wf.setnchannels(pcm.shape[1])
                wf.writeframes(pcm.tobytes())

        return raw_audio

    else:
        return None
|
|
| |
|
|
def midi_to_colab_audio(midi_file,
                        soundfont_path='/usr/share/sounds/sf2/FluidR3_GM.sf2',
                        sample_rate=16000,
                        volume_level_db=-1,
                        trim_silence=True,
                        silence_threshold=0.1,
                        output_for_gradio=False,
                        write_audio_to_WAV=False
                        ):
    """
    Render a MIDI file to raw audio for IPython.display.Audio.

    Parameters
    ----------
    midi_file : str | bytes
        Path to a MIDI file, or the raw MIDI file content as bytes.
    soundfont_path : str
        SF2 soundfont loaded into FluidSynth.
    sample_rate : int
        Output sample rate in Hz.
    volume_level_db : float
        Target peak level (dBFS) passed to normalize_audio().
    trim_silence : bool
        If True, strip leading AND trailing silence using a fixed
        threshold of 0.5% of int16 full scale.
    silence_threshold : float
        NOTE(review): unused by this function — the trim step below uses
        the fixed 0.5% threshold instead.  Kept for signature parity with
        midi_opus_to_colab_audio; confirm whether it should be honored.
    output_for_gradio : bool
        If True, return the int16 (n, 2) array directly (Gradio format),
        skipping float normalization.
    write_audio_to_WAV : bool
        If True and `midi_file` is a path string, also write a .wav file
        next to the input (same base name, .wav extension).

    Returns
    -------
    np.ndarray | None
        Float32 audio of shape (2, n) in [-1, 1] (or the int16 Gradio
        array), or None when the file cannot be read or has no tracks.

    Example usage:

        from IPython.display import Audio

        display(Audio(raw_audio, rate=16000, normalize=False))
    """

    # --- Load the raw MIDI bytes from a path or pass bytes through ----------
    if isinstance(midi_file, str):
        try:
            with open(midi_file, 'rb') as f:
                midi_bytes = f.read()
        except FileNotFoundError:
            print(f"Error: Could not find or open the file at {midi_file}")
            return None
    elif isinstance(midi_file, bytes):
        midi_bytes = midi_file
    else:
        raise TypeError("midi_input must be a file path (str) or file content (bytes)")

    # --- Parse into opus form: [ticks_per_beat, track1, track2, ...] --------
    ticks_per_beat, *tracks = MIDI.midi2opus(midi_bytes)
    if not tracks:
        return None

    # Flatten all tracks into a single chronological event list with
    # absolute tick times (opus events carry delta times).
    events = []
    for track in tracks:
        abs_t = 0
        for name, dt, *data in track:
            abs_t += dt
            events.append([name, abs_t, *data])
    events.sort(key=lambda e: e[1])

    # --- Set up the synthesizer ----------------------------------------------
    fl = fluidsynth.Synth(samplerate=float(sample_rate))
    sfid = fl.sfload(soundfont_path)
    for chan in range(16):
        # Channel 9 is the GM percussion channel -> bank 128.
        fl.program_select(chan, sfid, 128 if chan == 9 else 0, 0)

    # Default tempo: 120 BPM expressed in microseconds per quarter note.
    tempo = int((60 / 120) * 1e6)
    last_t = 0

    # Rendered stereo buffers; concatenated once after the loop.
    audio_chunks = []

    # --- Event loop: render elapsed audio, then apply each event -------------
    for name, cur_t, *data in events:

        # Convert the tick gap since the previous event into samples,
        # using the tempo in effect *before* this event.
        delta_ticks = cur_t - last_t
        last_t = cur_t
        dt_seconds = (delta_ticks / ticks_per_beat) * (tempo / 1e6)
        sample_len = int(dt_seconds * sample_rate)

        if sample_len > 0:
            # get_samples returns interleaved stereo; reshape to (n, 2).
            buf = fl.get_samples(sample_len).reshape(-1, 2)

            audio_chunks.append(buf)

        # Channel voice messages are forwarded to FluidSynth.
        if name == "note_on" and data[2] > 0:
            chan, note, vel = data
            fl.noteon(chan, note, vel)

        # A note_on with velocity 0 is equivalent to note_off.
        elif name == "note_off" or (name == "note_on" and data[2] == 0):
            chan, note = data[:2]
            fl.noteoff(chan, note)

        elif name == "patch_change":
            chan, patch = data[:2]
            bank = 128 if chan == 9 else 0
            fl.program_select(chan, sfid, bank, patch)

        elif name == "control_change":
            chan, ctrl, val = data[:3]
            fl.cc(chan, ctrl, val)

        elif name == "key_after_touch":
            chan, note, vel = data
            fl.key_pressure(chan, note, vel)

        elif name == "channel_after_touch":
            chan, vel = data
            fl.channel_pressure(chan, vel)

        elif name == "pitch_wheel_change":
            chan, wheel = data
            fl.pitch_bend(chan, wheel)

        # System-common messages: nothing to synthesize.
        elif name == "song_position":
            pass

        elif name == "song_select":
            pass

        elif name == "tune_request":
            pass

        # Raw sysex payload is handed to the synth unmodified.
        elif name in ("sysex_f0", "sysex_f7"):
            raw_bytes = data[0]
            fl.sysex(raw_bytes)

        # Meta events: only set_tempo affects the audio (it changes the
        # tick -> seconds conversion used above); the rest are skipped.
        elif name in (
            "set_tempo",
            "end_track",
            "text_event", "text_event_08", "text_event_09", "text_event_0a",
            "text_event_0b", "text_event_0c", "text_event_0d", "text_event_0e", "text_event_0f",
            "copyright_text_event", "track_name", "instrument_name",
            "lyric", "marker", "cue_point",
            "smpte_offset", "time_signature", "key_signature",
            "sequencer_specific", "raw_meta_event"
        ):
            if name == "set_tempo":
                tempo = data[0]

            continue

        # Unknown event names are ignored.
        else:
            continue

    # Render a short tail so notes sounding at the final event decay
    # instead of being cut off.
    tail_len_seconds = 2
    tail_buf = fl.get_samples(int(sample_rate * tail_len_seconds)).reshape(-1, 2)
    audio_chunks.append(tail_buf)

    fl.delete()

    if not audio_chunks:
        return None
    ss = np.concatenate(audio_chunks, axis=0)

    # --- Trim leading/trailing silence with a fixed 0.5%-of-full-scale
    # threshold (note: `silence_threshold` is not used here).
    if trim_silence and ss.size:
        dtype_max = np.iinfo(ss.dtype).max
        fixed_threshold = int(dtype_max * 0.005)

        indices = np.where(np.abs(ss) > fixed_threshold)[0]
        if indices.size > 0:
            first_idx = indices[0]
            last_idx = indices[-1]
            ss = ss[first_idx : last_idx + 1]
        else:
            # Entirely silent: keep an empty (0, 2) array.
            ss = np.empty((0, 2), dtype=ss.dtype)

    # --- Peak-normalize the int16 render to full scale ------------------------
    if ss.size:
        maxv = np.abs(ss).max()
        if maxv:
            ss = (ss / maxv) * np.iinfo(np.int16).max
        ss = ss.astype(np.int16)

    # Gradio consumes (n, 2) int16 directly.
    if output_for_gradio:
        return ss

    # normalize_audio expects (channels, samples).
    ss = ss.T
    raw_audio = normalize_audio(ss, target_level_db=volume_level_db)

    # --- Optionally also write a WAV next to the input file -------------------
    if write_audio_to_WAV and isinstance(midi_file, str):
        wav_name = midi_file.rsplit('.', 1)[0] + '.wav'

        # Guard against division by zero on an all-silent render.
        if np.max(np.abs(raw_audio)) > 0:
            pcm = np.int16(raw_audio.T / np.max(np.abs(raw_audio)) * 32767)
        else:
            pcm = np.int16(raw_audio.T * 32767)

        with wave.open(wav_name, 'wb') as wf:
            wf.setframerate(sample_rate)
            wf.setsampwidth(2)  # 16-bit PCM
            wf.setnchannels(pcm.shape[1])
            wf.writeframes(pcm.tobytes())

    return raw_audio
|
|
| |