avans06's picture
feat(synth): Enhance 8-bit synthesizer and expand presets
e835c5f
raw
history blame
81.4 kB
# =================================================================
#
# Merged and Integrated Script for Audio/MIDI Processing and Rendering (Stereo Enhanced)
#
# This script combines two functionalities:
# 1. Transcribing audio to MIDI using two methods:
# a) A general-purpose model (basic-pitch by Spotify).
# b) A model specialized for solo piano (ByteDance).
# - Includes stereo processing by splitting channels, transcribing independently, and merging MIDI.
# 2. Applying advanced transformations and re-rendering MIDI files using:
# a) Standard SoundFonts via FluidSynth (produces stereo audio).
# b) A custom 8-bit style synthesizer for a chiptune sound (updated for stereo output).
#
# The user can upload an audio file (e.g., WAV, MP3) or a MIDI file.
# - If an audio file is uploaded, it is first transcribed to MIDI using the selected method.
# - The resulting MIDI (or an uploaded MIDI) can then be processed
# with various effects and rendered into audio.
#
#================================================================
# Original sources:
# https://huggingface.co/spaces/asigalov61/ByteDance-Solo-Piano-Audio-to-MIDI-Transcription
# https://huggingface.co/spaces/asigalov61/Advanced-MIDI-Renderer
#================================================================
# Packages:
#
# sudo apt install fluidsynth
#
# =================================================================
# Requirements:
#
# pip install gradio torch pytz numpy scipy matplotlib networkx scikit-learn
# pip install piano_transcription_inference huggingface_hub
# pip install basic-pitch pretty_midi librosa soundfile pyloudnorm
#
# =================================================================
# Core modules:
#
# git clone --depth 1 https://github.com/asigalov61/tegridy-tools
#
# =================================================================
import os
import hashlib
import time as reqtime
import copy
import librosa
import pyloudnorm as pyln
import soundfile as sf
import torch
import gradio as gr
from src.piano_transcription.utils import initialize_app
from piano_transcription_inference import PianoTranscription, utilities, sample_rate as transcription_sample_rate
# --- Import core transcription and MIDI processing libraries ---
from src import TMIDIX, TPLOTS
from src import MIDI
from src.midi_to_colab_audio import midi_to_colab_audio
# --- Imports for General Purpose Transcription (basic-pitch) ---
import basic_pitch
from basic_pitch.inference import predict
from basic_pitch import ICASSP_2022_MODEL_PATH
# --- Imports for 8-bit Synthesizer & MIDI Merging ---
import pretty_midi
import numpy as np
from scipy import signal
# =================================================================================================
# === Hugging Face SoundFont Downloader ===
# =================================================================================================
from huggingface_hub import hf_hub_download
import glob
# --- Define a constant for the 8-bit synthesizer option ---
# Render_MIDI compares its `soundfont_bank` argument against this label; when they
# match, the custom 8-bit synthesizer is used instead of a SoundFont.
SYNTH_8_BIT_LABEL = "None (8-bit Synthesizer)"
def prepare_soundfonts():
    """
    Make sure the default SoundFonts exist locally, then index every .sf2 file.

    Each default SoundFont that is missing from 'src/sf2' is downloaded from the
    Hugging Face Space repository; a failed download is reported and skipped.
    Afterwards the directory is scanned recursively for .sf2 files.

    Returns:
        dict: Display name (path relative to 'src/sf2' without the extension,
        using forward slashes) mapped to the file path found by the scan.
        Default SoundFonts come first, in their predefined order, followed by
        every other SoundFont sorted alphabetically.
    """
    SF2_REPO_ID = "asigalov61/Advanced-MIDI-Renderer"
    SF2_DIR = "src/sf2"
    # Files that should always be present; their order defines the listing priority.
    DEFAULT_SF2_FILENAMES = [
        "SGM-v2.01-YamahaGrand-Guit-Bass-v2.7.sf2",
        "Orpheus_18.06.2020.sf2",
        "Live HQ Natural SoundFont GM.sf2",
        "Nice-Strings-PlusOrchestra-v1.6.sf2",
        "KBH-Real-Choir-V2.5.sf2",
        "SuperGameBoy.sf2",
        "ProtoSquare.sf2"
    ]

    os.makedirs(SF2_DIR, exist_ok=True)

    # --- Step 1: fetch whichever default SoundFonts are not on disk yet ---
    print("Checking for SoundFont files...")
    for sf2_name in DEFAULT_SF2_FILENAMES:
        if os.path.exists(os.path.join(SF2_DIR, sf2_name)):
            continue  # Already present, nothing to download
        print(f"Downloading '{sf2_name}' from Hugging Face Hub...")
        try:
            hf_hub_download(
                repo_id=SF2_REPO_ID,
                repo_type='space',  # The repository is a Space
                filename=f"{sf2_name}",  # Path of the file inside the repository
                local_dir=SF2_DIR,
            )
            print(f"'{sf2_name}' downloaded successfully.")
        except Exception as e:
            # Best effort: a SoundFont that cannot be fetched is simply left out
            print(f"Error downloading {sf2_name}: {e}")

    # --- Step 2: collect every .sf2 file below the directory ---
    print(f"Scanning '{SF2_DIR}' for all .sf2 files...")
    found = {}
    for sf2_path in glob.glob(os.path.join(SF2_DIR, '**', '*.sf2'), recursive=True):
        stem, _ = os.path.splitext(os.path.relpath(sf2_path, SF2_DIR))
        # Keep the subfolder in the name, always with forward slashes
        found[stem.replace("\\", "/")] = sf2_path

    # --- Step 3: defaults first (predefined order), the rest alphabetically ---
    default_names = [os.path.splitext(f)[0] for f in DEFAULT_SF2_FILENAMES]
    ordered = {name: found[name] for name in default_names if name in found}
    for name in sorted(n for n in found if n not in default_names):
        ordered[name] = found[name]
    return ordered
# =================================================================================================
# === 8-bit Style Synthesizer (Stereo Enabled) ===
# =================================================================================================
def _render_8bit_oscillator(phase, waveform_type, pulse_width):
    """Return oscillator samples for the phase array using the selected waveform shape."""
    if waveform_type == 'Square':
        return signal.square(phase, duty=pulse_width)
    if waveform_type == 'Sawtooth':
        return signal.sawtooth(phase)
    if waveform_type == 'Triangle':
        return signal.sawtooth(phase, width=0.5)
    raise ValueError(f"Unsupported waveform type: {waveform_type!r}")


def synthesize_8bit_style(midi_data, waveform_type, envelope_type, decay_time_s, pulse_width,
                          vibrato_rate, vibrato_depth, bass_boost_level, fs=44100,
                          smooth_notes=False, continuous_vibrato=False, noise_level=0.0,
                          distortion_level=0.0, fm_modulation_depth=0.0, fm_modulation_rate=0.0):
    """
    Synthesizes an 8-bit style stereo waveform from a PrettyMIDI object.

    Waveforms are generated manually instead of using a synthesizer like FluidSynth.
    Per note the chain is: oscillator (with vibrato and optional FM) -> optional
    sub-octave bass -> optional white noise -> optional distortion -> envelope.

    Instruments are panned based on their order in the MIDI file when there are at
    least two of them: Instrument 1 -> Left, Instrument 2 -> Right, any further
    instrument stays centered. A single instrument is centered.

    Args:
        midi_data: Object providing get_end_time() and instruments, each with notes
            that have pitch, start, end and velocity (e.g. pretty_midi.PrettyMIDI).
        waveform_type (str): 'Square', 'Sawtooth' or 'Triangle'.
        envelope_type (str): 'Plucky (AD Envelope)' (5 ms linear attack followed by a
            linear decay over decay_time_s) or 'Sustained (Full Decay)' (linear fade
            over the whole note). Any other value yields a silent envelope.
        decay_time_s (float): Decay length in seconds for the plucky envelope.
        pulse_width (float): Duty cycle of the square wave.
        vibrato_rate (float): Vibrato LFO frequency in Hz.
        vibrato_depth (float): Vibrato amplitude in Hz added to the note frequency.
        bass_boost_level (float): Mix level of the sub-octave square wave; 0 disables it.
        fs (int): Sample rate of the generated audio.
        smooth_notes (bool): Apply a short release and fade-in to each note to avoid clicks.
        continuous_vibrato (bool): Keep the vibrato LFO phase running across notes
            instead of restarting it on every note.
        noise_level (float): Amount of white noise added to each note; 0 disables it.
        distortion_level (float): Wave-shaping amount; 0 disables it.
        fm_modulation_depth (float): Relative frequency deviation of the FM LFO; 0 disables FM.
        fm_modulation_rate (float): FM LFO frequency in Hz.

    Returns:
        np.ndarray: Array of shape (2, N) holding the left and right channels, where
        N = int(end_time * fs) + fs. The result is not normalized.

    Raises:
        ValueError: If a note has to be rendered with an unsupported waveform_type.
    """
    total_duration = midi_data.get_end_time()
    # Initialize a stereo waveform buffer (2 channels: Left, Right) with one extra second
    waveform = np.zeros((2, int(total_duration * fs) + fs))
    num_instruments = len(midi_data.instruments)
    # Phase tracking: main oscillator phase for each instrument
    osc_phase = {}
    # Vibrato phase tracking (shared by all notes when continuous_vibrato is on)
    vibrato_phase = 0.0

    for i, instrument in enumerate(midi_data.instruments):
        # --- Panning Logic ---
        # Default to center-panned mono; with two or more instruments the first
        # goes hard left and the second hard right.
        pan_l, pan_r = 0.707, 0.707
        if num_instruments >= 2:
            if i == 0:
                pan_l, pan_r = 1.0, 0.0
            elif i == 1:
                pan_l, pan_r = 0.0, 1.0
        osc_phase[i] = 0.0  # Independent phase tracking for each instrument

        for note in instrument.notes:
            freq = pretty_midi.note_number_to_hz(note.pitch)
            note_duration = note.end - note.start
            num_samples = int(note_duration * fs)
            if num_samples <= 0:
                continue
            t = np.arange(num_samples) / fs

            # --- Vibrato LFO ---
            if continuous_vibrato:
                # Use accumulated phase to avoid vibrato reset per note
                vib_phase_inc = 2 * np.pi * vibrato_rate / fs
                vib_phase_array = vibrato_phase + np.arange(num_samples) * vib_phase_inc
                vibrato_phase = (vib_phase_array[-1] + vib_phase_inc) % (2 * np.pi)
                vibrato_lfo = vibrato_depth * np.sin(vib_phase_array)
            else:
                vibrato_lfo = vibrato_depth * np.sin(2 * np.pi * vibrato_rate * t)

            # --- Frequency Modulation (FM) ---
            # FM is applied to the oscillator frequency *before* the waveform is
            # generated. Regenerating the waveform afterwards would throw away the
            # bass boost, noise and distortion stages and advance the stored
            # oscillator phase twice for the same note.
            carrier_freq = freq
            if fm_modulation_depth > 0:
                carrier_freq = freq * (1 + fm_modulation_depth * np.sin(2 * np.pi * fm_modulation_rate * t))

            # --- Waveform Generation (Main Oscillator with phase continuity) ---
            phase_inc = 2 * np.pi * (carrier_freq + vibrato_lfo) / fs
            phase = osc_phase[i] + np.cumsum(phase_inc)
            osc_phase[i] = phase[-1] % (2 * np.pi)  # Store last phase
            note_waveform = _render_8bit_oscillator(phase, waveform_type, pulse_width)

            # --- Bass Boost (Sub-Octave Oscillator) ---
            if bass_boost_level > 0:
                bass_freq = freq / 2.0
                # Only add bass if the frequency is reasonably audible
                if bass_freq > 20:
                    # Bass uses a simple square wave, no vibrato, for stability
                    bass_phase_inc = 2 * np.pi * bass_freq / fs
                    bass_phase = np.cumsum(np.full(num_samples, bass_phase_inc))
                    bass_sub_waveform = signal.square(bass_phase, duty=0.5)
                    # As bass level increases, slightly decrease main waveform volume to prevent clipping.
                    main_level = 1.0 - (0.5 * bass_boost_level)
                    note_waveform = (note_waveform * main_level) + (bass_sub_waveform * bass_boost_level)

            # --- Noise Channel Simulation (White Noise) ---
            if noise_level > 0:
                noise_waveform = np.random.uniform(-1, 1, num_samples)
                note_waveform += noise_waveform * noise_level

            # --- Distortion (Wave Shaping) ---
            if distortion_level > 0:
                note_waveform = np.sign(note_waveform) * np.abs(note_waveform) ** (1.0 - distortion_level)

            # --- Envelope ---
            start_amp = note.velocity / 127.0
            envelope = np.zeros(num_samples)
            if envelope_type == 'Plucky (AD Envelope)':
                attack_time_s = 0.005
                attack_samples = min(int(attack_time_s * fs), num_samples)
                decay_samples = min(int(decay_time_s * fs), num_samples - attack_samples)
                envelope[:attack_samples] = np.linspace(0, start_amp, attack_samples)
                if decay_samples > 0:
                    envelope[attack_samples:attack_samples + decay_samples] = np.linspace(start_amp, 0, decay_samples)
            elif envelope_type == 'Sustained (Full Decay)':
                envelope = np.linspace(start_amp, 0, num_samples)

            if smooth_notes:
                # Add short release; a zero-length release (very low fs) would make
                # envelope[-0:] select the whole array, so skip it in that case
                release_samples = min(int(0.005 * fs), num_samples)
                if release_samples > 0:
                    envelope[-release_samples:] *= np.linspace(1, 0, release_samples)
                # Small fade-in (to avoid clicks)
                fade_samples = min(10, num_samples)
                envelope[:fade_samples] *= np.linspace(0.5, 1, fade_samples)

            # Apply envelope to the (potentially combined) waveform
            note_waveform *= envelope

            start_sample = int(note.start * fs)
            end_sample = start_sample + num_samples
            if end_sample > waveform.shape[1]:
                end_sample = waveform.shape[1]
                note_waveform = note_waveform[:end_sample - start_sample]

            # Add the mono note waveform to the stereo buffer with panning
            waveform[0, start_sample:end_sample] += note_waveform * pan_l
            waveform[1, start_sample:end_sample] += note_waveform * pan_r

    return waveform  # Returns a (2, N) numpy array
def analyze_midi_velocity(midi_path):
    """
    Print note-velocity statistics for a MIDI file.

    Reports the note count and the minimum, maximum and mean velocity for every
    instrument, followed by the same figures across the whole file.

    Args:
        midi_path (str): Path of the MIDI file to inspect.
    """
    midi = pretty_midi.PrettyMIDI(midi_path)
    combined = []
    print(f"Analyzing velocity for MIDI: {midi_path}")

    for index, inst in enumerate(midi.instruments):
        inst_velocities = [n.velocity for n in inst.notes]
        combined.extend(inst_velocities)
        if not inst_velocities:
            print(f"Instrument {index} ({inst.name}): no notes found.")
            continue
        print(f"Instrument {index} ({inst.name}):")
        print(f" Notes count: {len(inst_velocities)}")
        print(f" Velocity min: {min(inst_velocities)}")
        print(f" Velocity max: {max(inst_velocities)}")
        print(f" Velocity mean: {np.mean(inst_velocities):.2f}")

    if not combined:
        print("No notes found in this MIDI.")
        return

    print("\nOverall MIDI velocity stats:")
    print(f" Total notes: {len(combined)}")
    print(f" Velocity min: {min(combined)}")
    print(f" Velocity max: {max(combined)}")
    print(f" Velocity mean: {np.mean(combined):.2f}")
def scale_instrument_velocity(instrument, scale=0.8):
    """
    Scale the velocity of every note of an instrument in place.

    Args:
        instrument: Object whose `notes` each carry a `velocity` attribute.
        scale (float): Multiplier applied to each velocity; the result is
            truncated to an int and clamped to the range 1..127.
    """
    for n in instrument.notes:
        scaled = int(n.velocity * scale)
        n.velocity = min(127, max(1, scaled))
def normalize_loudness(audio_data, sample_rate, target_lufs=-23.0):
    """
    Normalizes the audio data to a target integrated loudness (LUFS).

    This provides more consistent perceived volume than peak normalization.
    If the gained signal would exceed full scale it is peak-normalized
    afterwards. Any failure, as well as audio whose loudness cannot be measured
    (e.g. pure silence), leaves the audio untouched.

    Args:
        audio_data (np.ndarray): The audio signal.
        sample_rate (int): The sample rate of the audio.
        target_lufs (float): The target loudness in LUFS. Defaults to -23.0,
            a common standard for broadcast.

    Returns:
        np.ndarray: The loudness-normalized audio data, or the original
        `audio_data` when normalization is skipped or fails.
    """
    try:
        # 1. Measure the integrated loudness of the input audio
        meter = pyln.Meter(sample_rate)  # create meter
        loudness = meter.integrated_loudness(audio_data)

        # Silent audio measures as -inf LUFS. The resulting infinite gain would
        # turn every zero sample into NaN (0 * inf), so leave such audio alone.
        if not np.isfinite(loudness):
            print(f"Loudness normalization skipped: measured loudness is {loudness} LUFS.")
            return audio_data

        # 2. Calculate the gain needed to reach the target loudness
        # The gain is applied in the linear domain, so we convert from dB
        loudness_gain_db = target_lufs - loudness
        loudness_gain_linear = 10.0 ** (loudness_gain_db / 20.0)

        # 3. Apply the gain (creates a new array; the input is not modified)
        normalized_audio = audio_data * loudness_gain_linear

        # 4. Final safety check: peak normalize to prevent clipping, just in case
        # the loudness normalization results in peaks > 1.0
        peak_val = np.max(np.abs(normalized_audio))
        if peak_val > 1.0:
            normalized_audio /= peak_val
            print("Warning: Loudness normalization resulted in clipping. Audio was peak-normalized as a safeguard.")

        print(f"Audio normalized from {loudness:.2f} LUFS to target {target_lufs} LUFS.")
        return normalized_audio
    except Exception as e:
        # Deliberate best effort: normalization must never break the pipeline
        print(f"Loudness normalization failed: {e}. Falling back to original audio.")
        return audio_data
# =================================================================================================
# === MIDI Merging Function ===
# =================================================================================================
def merge_midis(midi_path_left, midi_path_right, output_path):
    """
    Combine a left-channel and a right-channel MIDI file into one stereo MIDI file.

    Every instrument of both files is carried over, so multi-instrumental
    sources lose no data. Each instrument has its velocities scaled by 0.8, gets
    a "Left - " / "Right - " name prefix and receives a Pan control change
    (controller 10) as its first event: 0 (hard left) for the left file and
    127 (hard right) for the right file.

    Args:
        midi_path_left (str): Path of the MIDI transcribed from the left channel.
        midi_path_right (str): Path of the MIDI transcribed from the right channel.
        output_path (str): Destination path of the merged MIDI file.

    Returns:
        str or None: `output_path` on success. If anything fails, the left MIDI
        path when that file exists, otherwise None.
    """
    try:
        analyze_midi_velocity(midi_path_left)
        analyze_midi_velocity(midi_path_right)
        left_midi = pretty_midi.PrettyMIDI(midi_path_left)
        right_midi = pretty_midi.PrettyMIDI(midi_path_right)
        combined = pretty_midi.PrettyMIDI()

        def _adopt(instruments, name_prefix, pan_value):
            # Attenuate, rename, pan and move each instrument into the merged file
            for inst in instruments:
                scale_instrument_velocity(inst, scale=0.8)
                inst.name = f"{name_prefix}{inst.name if inst.name else 'Instrument'}"
                # The pan event is inserted at index 0 (time 0.0) so it precedes
                # every other control change of the track
                pan_event = pretty_midi.ControlChange(number=10, value=pan_value, time=0.0)
                inst.control_changes.insert(0, pan_event)
                combined.instruments.append(inst)

        # --- All instruments of the left channel MIDI, panned hard left ---
        if left_midi.instruments:
            print(f"Found {len(left_midi.instruments)} instrument(s) in the left channel MIDI.")
            _adopt(left_midi.instruments, "Left - ", 0)

        # --- All instruments of the right channel MIDI, panned hard right ---
        if right_midi.instruments:
            print(f"Found {len(right_midi.instruments)} instrument(s) in the right channel MIDI.")
            _adopt(right_midi.instruments, "Right - ", 127)

        combined.write(output_path)
        print(f"Successfully merged all instruments and panned into '{os.path.basename(output_path)}'")
        analyze_midi_velocity(output_path)
        return output_path
    except Exception as e:
        print(f"Error merging MIDI files: {e}")
        # Fallback: hand back the left channel MIDI alone when it is available
        if not os.path.exists(midi_path_left):
            return None
        print("Fallback: Using only the left channel MIDI.")
        return midi_path_left
# =================================================================================================
# === Stage 1: Audio to MIDI Transcription Functions ===
# =================================================================================================
def TranscribePianoAudio(input_file):
    """
    Transcribes a WAV or MP3 audio file of a SOLO PIANO performance into a MIDI file.
    This uses the ByteDance model.

    The MIDI file is written to 'output/transcribed_piano_/<stem>.mid', where
    <stem> is the input file name without its final extension.

    Args:
        input_file (str): The path to the input audio file.

    Returns:
        str: The file path of the generated MIDI file.
    """
    print('=' * 70)
    print('STAGE 1: Starting Piano-Specific Transcription')
    print('=' * 70)

    # Derive the output name from the input name. splitext removes only the final
    # extension, so dotted names such as "song.v2.wav" keep their full stem
    # ("song.v2") instead of being cut at the first dot.
    fn = os.path.basename(input_file)
    fn1 = os.path.splitext(fn)[0]

    # Use os.path.join to create a platform-independent directory path
    output_dir = os.path.join("output", "transcribed_piano_")
    out_mid_path = os.path.join(output_dir, fn1 + '.mid')

    # Create the output directory if necessary
    os.makedirs(output_dir, exist_ok=True)

    print('-' * 70)
    print(f'Input file name: {fn}')
    print(f'Output MIDI path: {out_mid_path}')
    print('-' * 70)

    # Load audio using the utility function, resampled to the model's sample rate as mono
    print('Loading audio...')
    (audio, _) = utilities.load_audio(input_file, sr=transcription_sample_rate, mono=True)
    print('Audio loaded successfully.')
    print('-' * 70)

    # Initialize the transcription model
    # Use 'cuda' if a GPU is available and configured, otherwise 'cpu'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'Loading transcriptor model... device= {device}')
    transcriptor = PianoTranscription(device=device, checkpoint_path="src/models/CRNN_note_F1=0.9677_pedal_F1=0.9186.pth")
    print('Transcriptor loaded.')
    print('-' * 70)

    # Perform transcription
    print('Transcribing audio to MIDI (Piano-Specific)...')
    # This function call saves the MIDI file to the specified path
    transcriptor.transcribe(audio, out_mid_path)
    print('Piano transcription complete.')
    print('=' * 70)

    # Return the path to the newly created MIDI file
    return out_mid_path
def TranscribeGeneralAudio(input_file, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool):
    """
    Transcribes a general audio file into a MIDI file using basic-pitch.
    This is suitable for various instruments and vocals.

    The MIDI file is written to 'output/transcribed_general_/<stem>.mid', where
    <stem> is the input file name without its final extension.

    Args:
        input_file (str): The path to the input audio file.
        onset_thresh: Passed to basic-pitch as `onset_threshold`.
        frame_thresh: Passed to basic-pitch as `frame_threshold`.
        min_note_len: Passed to basic-pitch as `minimum_note_length`.
        min_freq: Passed to basic-pitch as `minimum_frequency`.
        max_freq: Passed to basic-pitch as `maximum_frequency`.
        infer_onsets_bool: Passed to basic-pitch as `infer_onsets`.
        melodia_trick_bool: Passed to basic-pitch as `melodia_trick`.
        multiple_bends_bool: Passed to basic-pitch as `multiple_pitch_bends`.

    Returns:
        str: The file path of the generated MIDI file.
    """
    print('=' * 70)
    print('STAGE 1: Starting General Purpose Transcription')
    print('=' * 70)

    # splitext removes only the final extension, so dotted names such as
    # "song.v2.wav" keep their full stem instead of being cut at the first dot.
    fn = os.path.basename(input_file)
    fn1 = os.path.splitext(fn)[0]
    output_dir = os.path.join("output", "transcribed_general_")
    out_mid_path = os.path.join(output_dir, fn1 + '.mid')
    os.makedirs(output_dir, exist_ok=True)
    print(f'Input file: {fn}\nOutput MIDI: {out_mid_path}')

    # --- Perform transcription using basic-pitch ---
    print('Transcribing audio to MIDI (General Purpose)...')
    # The predict function handles audio loading internally; only the MIDI
    # object of its three return values is needed here.
    _, midi_data, _ = basic_pitch.inference.predict(
        audio_path=input_file,
        model_or_model_path=ICASSP_2022_MODEL_PATH,
        onset_threshold=onset_thresh,
        frame_threshold=frame_thresh,
        minimum_note_length=min_note_len,
        minimum_frequency=min_freq,
        maximum_frequency=max_freq,
        infer_onsets=infer_onsets_bool,
        melodia_trick=melodia_trick_bool,
        multiple_pitch_bends=multiple_bends_bool
    )

    # --- Save the MIDI file ---
    midi_data.write(out_mid_path)
    print('General transcription complete.')
    print('=' * 70)
    return out_mid_path
# =================================================================================================
# === Stage 2: MIDI Transformation and Rendering Function ===
# =================================================================================================
def Render_MIDI(input_midi_path,
                render_type,
                soundfont_bank,
                render_sample_rate,
                render_with_sustains,
                merge_misaligned_notes,
                custom_render_patch,
                render_align,
                render_transpose_value,
                render_transpose_to_C4,
                render_output_as_solo_piano,
                render_remove_drums,
                # --- 8-bit synth params ---
                s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s,
                s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth,
                s8bit_bass_boost_level, s8bit_smooth_notes, s8bit_continuous_vibrato,
                s8bit_noise_level, s8bit_distortion_level, s8bit_fm_modulation_depth, s8bit_fm_modulation_rate
                ):
    """
    Processes and renders a MIDI file according to user-defined settings.
    Can render using SoundFonts or a custom 8-bit synthesizer.

    The function works in three phases:
      1. Read the MIDI file and convert it to an enhanced score with TMIDIX.
      2. Unless `render_type` is "Render as-is", transform the score (render
         type, patch override, transpose, alignment, solo piano / drum removal)
         and write it to 'output/rendered_midi/<name>_rendered.mid'. For
         "Render as-is" the original bytes are copied to that path instead.
      3. Render that MIDI file to audio, either with the 8-bit synthesizer
         (when `soundfont_bank` equals SYNTH_8_BIT_LABEL) or with the selected
         SoundFont looked up in the module-level `soundfonts_dict`.

    Args:
        input_midi_path (str): The path to the input MIDI file.
        render_type (str): Name of the transformation to apply, or "Render as-is".
        soundfont_bank (str): SoundFont display name, or SYNTH_8_BIT_LABEL.
        render_sample_rate: Output sample rate; converted with int().
        render_with_sustains: Forwarded to TMIDIX as `apply_sustain`.
        merge_misaligned_notes: Merge threshold; notes are merged only when > 0.
        custom_render_patch: Patch forced on all non-drum notes; -1 means no change.
        render_align (str): One of the "Start Times..." alignment modes; any
            other value applies no alignment.
        render_transpose_value: Transposition amount; 0 means no change.
        render_transpose_to_C4: Whether to transpose the score to pitch 60.
        render_output_as_solo_piano: Whether to convert the score to solo piano.
        render_remove_drums: Whether to remove drums from the output.
        s8bit_*: Parameters forwarded to synthesize_8bit_style.

    Returns:
        On success, a 7-tuple for the Gradio UI:
        (md5 of the rendered MIDI, base file name, MIDI metadata as string,
        path of the rendered MIDI, (sample rate, audio), score plot,
        song description).
        A list of seven None values if the input file is missing or the 8-bit
        synthesis fails, and a 7-tuple of placeholder values if the MIDI
        contains no notes.

    Raises:
        gr.Error: If the selected SoundFont is missing and no other SoundFont
            is available as a fallback.
    """
    print('*' * 70)
    print('STAGE 2: Starting MIDI Rendering')
    print('*' * 70)

    # --- File and Settings Setup ---
    fn = os.path.basename(input_midi_path)
    # NOTE(review): splitting at the first dot truncates dotted file names
    # (e.g. "song.v2.mid" -> "song"); confirm whether that is intended.
    fn1 = fn.split('.')[0]

    # Use os.path.join to create a platform-independent directory path
    output_dir = os.path.join("output", "rendered_midi")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Now, join the clean directory path with the filename
    new_fn_path = os.path.join(output_dir, fn1 + '_rendered.mid')

    try:
        with open(input_midi_path, 'rb') as f:
            fdata = f.read()
        input_midi_md5hash = hashlib.md5(fdata).hexdigest()
    except FileNotFoundError:
        # Handle cases where the input file might not exist
        print(f"Error: Input MIDI file not found at {input_midi_path}")
        return [None] * 7 # Return empty values for all outputs

    print('=' * 70)
    print('Requested settings:')
    print(f'Input MIDI file name: {fn}')
    print(f'Input MIDI md5 hash: {input_midi_md5hash}')
    print('-' * 70)
    print(f'Render type: {render_type}')
    print(f'Soundfont bank: {soundfont_bank}')
    print(f'Audio render sample rate: {render_sample_rate}')
    # ... (add other print statements for settings if needed)
    print('=' * 70)

    # --- MIDI Processing using TMIDIX ---
    print('Processing MIDI... Please wait...')
    raw_score = MIDI.midi2single_track_ms_score(fdata)
    # Only the first element of the processor's result is used as the enhanced score
    escore = TMIDIX.advanced_score_processor(raw_score,
                                             return_enhanced_score_notes=True,
                                             apply_sustain=render_with_sustains
                                             )[0]

    # Handle cases where the MIDI might not contain any notes
    if not escore:
        print("Warning: MIDI file contains no processable notes.")
        return ("N/A", fn1, "MIDI file contains no notes.",None, None, None, "No notes found.")

    # NOTE(review): this comparison requires merge_misaligned_notes to be numeric;
    # presumably the caller guarantees that - confirm.
    if merge_misaligned_notes > 0:
        escore = TMIDIX.merge_escore_notes(escore, merge_threshold=merge_misaligned_notes)

    escore = TMIDIX.augment_enhanced_score_notes(escore, timings_divider=1)

    # Metadata = every event before the first note, plus the first and last
    # score notes and the last raw event
    first_note_index = [e[0] for e in raw_score[1]].index('note')
    cscore = TMIDIX.chordify_score([1000, escore])
    meta_data = raw_score[1][:first_note_index] + [escore[0]] + [escore[-1]] + [raw_score[1][-1]]

    aux_escore_notes = TMIDIX.augment_enhanced_score_notes(escore, sort_drums_last=True)
    song_description = TMIDIX.escore_notes_to_text_description(aux_escore_notes)

    print('Done!')
    print('=' * 70)
    print('Input MIDI metadata:', meta_data[:5])
    print('=' * 70)
    print('Input MIDI song description:', song_description)
    print('=' * 70)
    print('Processing...Please wait...')

    # A deep copy of the score to be modified; it stays the output when
    # render_type matches none of the branches below
    output_score = copy.deepcopy(escore)

    # Apply transformations based on render_type (each branch reads `escore`
    # or `cscore` and replaces output_score)
    if render_type == "Extract melody":
        output_score = TMIDIX.add_melody_to_enhanced_score_notes(escore, return_melody=True)
        output_score = TMIDIX.recalculate_score_timings(output_score)
    elif render_type == "Flip":
        output_score = TMIDIX.flip_enhanced_score_notes(escore)
    elif render_type == "Reverse":
        output_score = TMIDIX.reverse_enhanced_score_notes(escore)
    elif render_type == 'Repair Durations':
        output_score = TMIDIX.fix_escore_notes_durations(escore, min_notes_gap=0)
    elif render_type == 'Repair Chords':
        fixed_cscore = TMIDIX.advanced_check_and_fix_chords_in_chordified_score(cscore)[0]
        output_score = TMIDIX.flatten(fixed_cscore)
    elif render_type == 'Remove Duplicate Pitches':
        output_score = TMIDIX.remove_duplicate_pitches_from_escore_notes(escore)
    elif render_type == "Add Drum Track":
        # Drop existing drum-channel (9) notes before adding a new drum track
        nd_escore = [e for e in escore if e[3] != 9]
        nd_escore = TMIDIX.augment_enhanced_score_notes(nd_escore)
        output_score = TMIDIX.advanced_add_drums_to_escore_notes(nd_escore)
        # Scale start times (e[1]) and durations (e[2]) by 16
        # NOTE(review): presumably undoes the default timings divider of
        # augment_enhanced_score_notes - confirm against TMIDIX.
        for e in output_score:
            e[1] *= 16
            e[2] *= 16

    print('MIDI processing complete.')
    print('=' * 70)

    # --- Final Processing and Patching ---
    if render_type != "Render as-is":
        print('Applying final adjustments (transpose, align, patch)...')

        if custom_render_patch != -1: # -1 indicates no change
            for e in output_score:
                if e[3] != 9: # not a drum channel
                    e[6] = custom_render_patch

        if render_transpose_value != 0:
            output_score = TMIDIX.transpose_escore_notes(output_score, render_transpose_value)

        if render_transpose_to_C4:
            output_score = TMIDIX.transpose_escore_notes_to_pitch(output_score, 60) # C4 is MIDI pitch 60

        if render_align == "Start Times":
            output_score = TMIDIX.recalculate_score_timings(output_score)
            output_score = TMIDIX.align_escore_notes_to_bars(output_score)
        elif render_align == "Start Times and Durations":
            output_score = TMIDIX.recalculate_score_timings(output_score)
            output_score = TMIDIX.align_escore_notes_to_bars(output_score, trim_durations=True)
        elif render_align == "Start Times and Split Durations":
            output_score = TMIDIX.recalculate_score_timings(output_score)
            output_score = TMIDIX.align_escore_notes_to_bars(output_score, split_durations=True)

        if render_type == "Longest Repeating Phrase":
            zscore = TMIDIX.recalculate_score_timings(output_score)
            lrno_score = TMIDIX.escore_notes_lrno_pattern_fast(zscore)
            if lrno_score is not None:
                output_score = lrno_score
            else:
                # No pattern returned: fall back to escore_notes_middle(output_score, 50)
                output_score = TMIDIX.recalculate_score_timings(TMIDIX.escore_notes_middle(output_score, 50))

        if render_type == "Multi-Instrumental Summary":
            zscore = TMIDIX.recalculate_score_timings(output_score)
            c_escore_notes = TMIDIX.compress_patches_in_escore_notes_chords(zscore)
            # Scores with 128 or fewer entries are left unchanged
            if len(c_escore_notes) > 128:
                cmatrix = TMIDIX.escore_notes_to_image_matrix(c_escore_notes, filter_out_zero_rows=True, filter_out_duplicate_rows=True)
                smatrix = TPLOTS.square_image_matrix(cmatrix, num_pca_components=max(1, min(5, len(c_escore_notes) // 128)))
                output_score = TMIDIX.image_matrix_to_original_escore_notes(smatrix)
                # Scale start times and durations of the summarized score by 250
                for o in output_score:
                    o[1] *= 250
                    o[2] *= 250

        if render_output_as_solo_piano:
            output_score = TMIDIX.solo_piano_escore_notes(output_score, keep_drums=(not render_remove_drums))

        # With solo piano output, drum removal is already handled via keep_drums above
        if render_remove_drums and not render_output_as_solo_piano:
            output_score = TMIDIX.strip_drums_from_escore_notes(output_score)

        if render_type == "Solo Piano Summary":
            sp_escore_notes = TMIDIX.solo_piano_escore_notes(output_score, keep_drums=False)
            zscore = TMIDIX.recalculate_score_timings(sp_escore_notes)
            # Scores with 128 or fewer notes are left unchanged
            if len(zscore) > 128:
                bmatrix = TMIDIX.escore_notes_to_binary_matrix(zscore)
                cmatrix = TMIDIX.compress_binary_matrix(bmatrix, only_compress_zeros=True)
                smatrix = TPLOTS.square_binary_matrix(cmatrix, interpolation_order=max(1, min(5, len(zscore) // 128)))
                output_score = TMIDIX.binary_matrix_to_original_escore_notes(smatrix)
                # Scale start times and durations of the summarized score by 200
                for o in output_score:
                    o[1] *= 200
                    o[2] *= 200

        print('Final adjustments complete.')
        print('=' * 70)

        # --- Saving Processed MIDI File ---
        # Save the transformed MIDI data
        SONG, patches, _ = TMIDIX.patch_enhanced_score_notes(output_score)

        # The converter appends a '.mid' extension itself (per the original
        # author's note), so the path is passed without the extension.
        path_without_ext = new_fn_path.rsplit('.mid', 1)[0]

        TMIDIX.Tegridy_ms_SONG_to_MIDI_Converter(SONG,
                                                 output_signature = 'Integrated-MIDI-Processor',
                                                 output_file_name = path_without_ext,
                                                 track_name='Processed Track',
                                                 list_of_MIDI_patches=patches
                                                 )
        midi_to_render_path = new_fn_path
    else:
        # If "Render as-is", use the original MIDI data
        with open(new_fn_path, 'wb') as f:
            f.write(fdata)
        midi_to_render_path = new_fn_path

    # --- Audio Rendering ---
    print('Rendering final audio...')

    # Select sample rate
    srate = int(render_sample_rate)

    # --- Conditional Rendering Logic ---
    if soundfont_bank == SYNTH_8_BIT_LABEL:
        print("Using 8-bit style synthesizer...")
        try:
            # Load the MIDI file with pretty_midi for manual synthesis
            midi_data_for_synth = pretty_midi.PrettyMIDI(midi_to_render_path)

            # Synthesize the waveform, forwarding all 8-bit parameters
            audio = synthesize_8bit_style(
                midi_data_for_synth,
                s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s,
                s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth,
                s8bit_bass_boost_level,
                fs=srate,
                smooth_notes=s8bit_smooth_notes,
                continuous_vibrato=s8bit_continuous_vibrato,
                noise_level=s8bit_noise_level,
                distortion_level=s8bit_distortion_level,
                fm_modulation_depth=s8bit_fm_modulation_depth,
                fm_modulation_rate=s8bit_fm_modulation_rate
            )

            # Peak-normalize (skipped for an all-zero signal) and prepare for Gradio
            peak_val = np.max(np.abs(audio))
            if peak_val > 0:
                audio /= peak_val

            # Transpose from (2, N) to (N, 2) and convert to int16 for Gradio
            audio_out = (audio.T * 32767).astype(np.int16)
        except Exception as e:
            # Any synthesis failure is reported and yields empty outputs
            print(f"Error during 8-bit synthesis: {e}")
            return [None] * 7
    else:
        print(f"Using SoundFont: {soundfont_bank}")

        # Get the full path from the global dictionary
        soundfont_path = soundfonts_dict.get(soundfont_bank)

        # Select soundfont
        if not soundfont_path or not os.path.exists(soundfont_path):
            # Error handling in case the selected file is not found
            error_msg = f"SoundFont '{soundfont_bank}' not found!"
            print(f"ERROR: {error_msg}")
            # Fallback to the first available soundfont if possible
            if soundfonts_dict:
                fallback_key = list(soundfonts_dict.keys())[0]
                soundfont_path = soundfonts_dict[fallback_key]
                print(f"Falling back to '{fallback_key}'.")
            else:
                # If no soundfonts are available at all, raise an error
                raise gr.Error("No SoundFonts are available for rendering!")

        with open(midi_to_render_path, 'rb') as f:
            midi_file_content = f.read()

        audio_out = midi_to_colab_audio(midi_file_content,
                                        soundfont_path=soundfont_path, # Use the dynamically found path
                                        sample_rate=srate,
                                        output_for_gradio=True
                                        )

    print('Audio rendering complete.')
    print('=' * 70)

    # --- Preparing Outputs for Gradio ---
    with open(midi_to_render_path, 'rb') as f:
        new_md5_hash = hashlib.md5(f.read()).hexdigest()

    output_plot = TPLOTS.plot_ms_SONG(output_score, plot_title=f"Score of {fn1}", return_plt=True)
    output_midi_summary = str(meta_data)

    return new_md5_hash, fn1, output_midi_summary, midi_to_render_path, (srate, audio_out), output_plot, song_description
# =================================================================================================
# === Main Application Logic ===
# =================================================================================================
def process_and_render_file(input_file,
                            # --- Transcription params ---
                            enable_stereo_processing,
                            transcription_method,
                            onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool,
                            # --- MIDI rendering params ---
                            render_type, soundfont_bank, render_sample_rate,
                            render_with_sustains, merge_misaligned_notes, custom_render_patch, render_align,
                            render_transpose_value, render_transpose_to_C4, render_output_as_solo_piano, render_remove_drums,
                            # --- 8-bit synth params ---
                            s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s,
                            s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth,
                            s8bit_bass_boost_level, s8bit_smooth_notes, s8bit_continuous_vibrato,
                            s8bit_noise_level, s8bit_distortion_level, s8bit_fm_modulation_depth, s8bit_fm_modulation_rate
                            ):
    """
    Main entry point for the "Process and Render" action.

    Decides from the file extension whether the upload is a MIDI file or an
    audio file. MIDI files ('.mid', '.midi', '.kar') go straight to rendering.
    Anything else is loaded as audio, loudness-normalized, written to a
    temporary WAV and transcribed to MIDI (per channel when stereo processing
    is enabled and the audio really has two channels), then rendered.

    Args:
        input_file: Path string of the uploaded file, or None if nothing was uploaded.
        enable_stereo_processing: Transcribe left/right channels separately and merge.
        transcription_method: "General Purpose" selects TranscribeGeneralAudio;
            any other value selects TranscribePianoAudio.
        onset_thresh ... multiple_bends_bool: Forwarded unchanged to TranscribeGeneralAudio.
        render_type ... s8bit_fm_modulation_rate: Forwarded unchanged to Render_MIDI.

    Returns:
        A list of 7 empty `gr.update` objects when no file was supplied,
        otherwise whatever Render_MIDI returns.

    Raises:
        gr.Error: If the audio cannot be loaded or transcription fails.
    """
    start_time = reqtime.time()
    if input_file is None:
        # Return a list of updates to clear all output fields
        return [gr.update(value=None)] * 7

    # gr.Audio(type="filepath") hands us the path as a plain string.
    input_file_path = input_file
    filename = os.path.basename(input_file_path)
    print(f"Processing new file: {filename}")

    def _transcribe(wav_path):
        """Transcribe a single WAV file with the user-selected method."""
        if transcription_method == "General Purpose":
            return TranscribeGeneralAudio(
                wav_path, onset_thresh, frame_thresh, min_note_len,
                min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool
            )
        return TranscribePianoAudio(wav_path)  # Piano-Specific

    # --- Step 1: Check file type and transcribe if necessary ---
    if filename.lower().endswith(('.mid', '.midi', '.kar')):
        # MIDI must NOT be passed through the audio decoder: it is not an
        # audio container, so decoding it would fail before rendering starts.
        print("MIDI file detected. Proceeding directly to rendering.")
        midi_path_for_rendering = input_file_path
    else:  # e.g. '.wav', '.mp3'
        try:
            audio_data, native_sample_rate = librosa.load(input_file_path, sr=None, mono=False)
        except Exception as e:
            raise gr.Error(f"Failed to load audio file: {e}") from e

        print("Audio file detected. Starting transcription...")
        base_name = os.path.splitext(filename)[0]
        temp_dir = "output/temp_normalized"
        os.makedirs(temp_dir, exist_ok=True)

        # === STEREO PROCESSING LOGIC ===
        if enable_stereo_processing:
            if audio_data.ndim != 2 or audio_data.shape[0] != 2:
                print("Warning: Audio is not stereo or could not be loaded as stereo. Falling back to mono transcription.")
                enable_stereo_processing = False  # Disable stereo processing if audio is not stereo

        if enable_stereo_processing:
            print("Stereo processing enabled. Splitting channels...")
            try:
                left_channel = audio_data[0]
                right_channel = audio_data[1]
                normalized_left = normalize_loudness(left_channel, native_sample_rate)
                normalized_right = normalize_loudness(right_channel, native_sample_rate)

                temp_left_wav_path = os.path.join(temp_dir, f"{base_name}_left.wav")
                temp_right_wav_path = os.path.join(temp_dir, f"{base_name}_right.wav")
                sf.write(temp_left_wav_path, normalized_left, native_sample_rate)
                sf.write(temp_right_wav_path, normalized_right, native_sample_rate)
                print(f"Saved left channel to: {temp_left_wav_path}")
                print(f"Saved right channel to: {temp_right_wav_path}")

                print("Transcribing left channel...")
                midi_path_left = _transcribe(temp_left_wav_path)
                print("Transcribing right channel...")
                midi_path_right = _transcribe(temp_right_wav_path)

                # Use whichever channels produced a MIDI file.
                if midi_path_left and midi_path_right:
                    merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid")
                    midi_path_for_rendering = merge_midis(midi_path_left, midi_path_right, merged_midi_path)
                elif midi_path_left:
                    print("Warning: Right channel transcription failed. Using left channel only.")
                    midi_path_for_rendering = midi_path_left
                elif midi_path_right:
                    print("Warning: Left channel transcription failed. Using right channel only.")
                    midi_path_for_rendering = midi_path_right
                else:
                    raise gr.Error("Both left and right channel transcriptions failed.")
            except Exception as e:
                print(f"An error occurred during stereo processing: {e}")
                raise gr.Error(f"Stereo Processing Failed: {e}") from e
        else:
            print("Stereo processing disabled. Using standard mono transcription.")
            # Down-mix multi-channel audio by averaging over the channel axis.
            if audio_data.ndim == 1:
                mono_signal = audio_data
            else:
                mono_signal = np.mean(audio_data, axis=0)

            normalized_mono = normalize_loudness(mono_signal, native_sample_rate)
            temp_mono_wav_path = os.path.join(temp_dir, f"{base_name}_mono.wav")
            sf.write(temp_mono_wav_path, normalized_mono, native_sample_rate)

            try:
                midi_path_for_rendering = _transcribe(temp_mono_wav_path)
                analyze_midi_velocity(midi_path_for_rendering)
            except Exception as e:
                print(f"An error occurred during transcription: {e}")
                raise gr.Error(f"Transcription Failed: {e}") from e

    # --- Step 2: Render the MIDI file with selected options ---
    print(f"Proceeding to render MIDI file: {os.path.basename(midi_path_for_rendering)}")

    # --- Passing new FX parameters to the Render_MIDI function ---
    results = Render_MIDI(midi_path_for_rendering,
                          render_type, soundfont_bank, render_sample_rate,
                          render_with_sustains, merge_misaligned_notes, custom_render_patch, render_align,
                          render_transpose_value, render_transpose_to_C4, render_output_as_solo_piano, render_remove_drums,
                          s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s,
                          s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth, s8bit_bass_boost_level,
                          s8bit_smooth_notes, s8bit_continuous_vibrato,
                          s8bit_noise_level, s8bit_distortion_level, s8bit_fm_modulation_depth, s8bit_fm_modulation_rate
                          )

    print(f'Total processing time: {(reqtime.time() - start_time):.2f} sec')
    print('*' * 70)
    return results
# =================================================================================================
# === Gradio UI Setup ===
# =================================================================================================
def update_ui_visibility(transcription_method, soundfont_choice):
    """
    Compute visibility updates for the two conditional settings panels.

    The general-purpose transcription panel is shown only when the
    "General Purpose" method is selected; the 8-bit synthesizer panel is
    shown only when the 8-bit synth entry is the chosen SoundFont.

    Returns:
        dict mapping each panel component to a `gr.update(visible=...)`.
    """
    show_general_panel = transcription_method == "General Purpose"
    show_synth_panel = soundfont_choice == SYNTH_8_BIT_LABEL

    visibility_updates = {}
    visibility_updates[general_transcription_settings] = gr.update(visible=show_general_panel)
    visibility_updates[synth_8bit_settings] = gr.update(visible=show_synth_panel)
    return visibility_updates
# --- Function to apply 8-bit synthesizer presets ---
def apply_8bit_preset(preset_name):
    """
    Build the UI updates that load an 8-bit synthesizer preset.

    Covers every preset-controlled component, including the advanced FX
    sliders (noise, distortion, FM depth, FM rate), so that choosing a preset
    applies all of the values stored in its S8BIT_PRESETS entry.

    Args:
        preset_name: Key into S8BIT_PRESETS, or "Custom".

    Returns:
        dict mapping each 8-bit synth UI component to a `gr.update`. For
        "Custom", an unknown preset name, or a setting missing from the
        preset, the update is empty and leaves the current value untouched.
    """
    # (UI component, key in the preset dict) for every preset-controlled control.
    component_keys = (
        (s8bit_waveform_type, 'waveform_type'),
        (s8bit_pulse_width, 'pulse_width'),
        (s8bit_envelope_type, 'envelope_type'),
        (s8bit_decay_time_s, 'decay_time_s'),
        (s8bit_vibrato_rate, 'vibrato_rate'),
        (s8bit_vibrato_depth, 'vibrato_depth'),
        (s8bit_smooth_notes, 'smooth_notes'),
        (s8bit_continuous_vibrato, 'continuous_vibrato'),
        (s8bit_bass_boost_level, 'bass_boost_level'),
        (s8bit_noise_level, 'noise_level'),
        (s8bit_distortion_level, 'distortion_level'),
        (s8bit_fm_modulation_depth, 'fm_modulation_depth'),
        (s8bit_fm_modulation_rate, 'fm_modulation_rate'),
    )

    # If the user selects "Custom" or the preset is not found, do not change the values.
    if preset_name == "Custom" or preset_name not in S8BIT_PRESETS:
        return {component: gr.update() for component, _ in component_keys}

    # Get the settings dictionary for the chosen preset.
    settings = S8BIT_PRESETS[preset_name]

    # Map each component to its new value; a key absent from the preset
    # results in a no-op update rather than an error.
    return {
        component: gr.update(value=settings[key]) if key in settings else gr.update()
        for component, key in component_keys
    }
if __name__ == "__main__":
# Initialize the app: download model (if needed) and apply patches
# Set to False if you don't have 'requests' or 'tqdm' installed
initialize_app()
# --- Prepare soundfonts and make the map globally accessible ---
global soundfonts_dict
# On application start, download SoundFonts from Hugging Face Hub if they don't exist.
soundfonts_dict = prepare_soundfonts()
print(f"Found {len(soundfonts_dict)} local SoundFonts.")
if not soundfonts_dict:
print("\nWARNING: No SoundFonts were found or could be downloaded.")
print("Rendering with SoundFonts will fail. Only the 8-bit synthesizer will be available.")
# --- Data structure for 8-bit synthesizer presets ---
# Comprehensive preset dictionary covering action, atmospheric, JRPG, system-impression, SFX and utility styles.
# Every preset also carries the FX parameters (noise, distortion, FM depth and FM rate).
# Note: Vibrato depth is mapped to a representative value on the 0-50 Hz slider.
S8BIT_PRESETS = {
# --- Rhythmic & Action ---
"Rhythm Pop Lead": {
# Description: A clean, round square wave perfect for the snappy, catchy feel of rhythm games.
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18, 'vibrato_rate': 4.5, 'vibrato_depth': 4,
'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
},
"Arcade Brawler Lead": {
# Description: A gritty sawtooth lead with a hard attack, capturing the high-energy feel of classic fighting games.
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15, 'vibrato_rate': 5.0, 'vibrato_depth': 6,
'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.1, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
},
"Mega Man (Rockman)": {
# Description: A thin, sharp square wave lead with fast vibrato, iconic for its driving, heroic melodies.
'waveform_type': 'Square', 'pulse_width': 0.2, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15, 'vibrato_rate': 6.0, 'vibrato_depth': 8,
'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.05, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
},
"Kirby's Bubbly Melody": {
# Description: A soft, round square wave with a bouncy vibrato, creating a cheerful and adorable sound.
'waveform_type': 'Square', 'pulse_width': 0.4, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2, 'vibrato_rate': 6.0, 'vibrato_depth': 4,
'smooth_notes': True, 'continuous_vibrato': False, 'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
},
"Mario (Super Mario Bros)": {
# Description: A bright square wave with a per-note vibrato, producing the classic bouncy platformer sound.
'waveform_type': 'Square', 'pulse_width': 0.3, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.25, 'vibrato_rate': 5.0, 'vibrato_depth': 5,
'smooth_notes': True, 'continuous_vibrato': False, 'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
},
# --- Epic & Atmospheric ---
"Mecha & Tactics Brass": {
# Description: A powerful, sustained sawtooth emulating the bold, heroic synth-brass of strategy and mecha anime themes.
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4, 'vibrato_rate': 3.5, 'vibrato_depth': 5,
'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
},
"Mystic Mana Pad": {
# Description: A warm, ethereal square wave pad with slow vibrato, capturing a feeling of fantasy and wonder.
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5, 'vibrato_rate': 2.5, 'vibrato_depth': 4,
'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
},
"Dragon Quest (Orchestral Feel)": {
# Description: A pure triangle wave with a long decay, mimicking the grand, orchestral feel of a classical flute or string section.
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.6, 'vibrato_rate': 3.0, 'vibrato_depth': 4,
'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
},
"ONI V (Wafu Mystic)": {
# Description: A solemn triangle wave with a slow, expressive vibrato, evoking the mysterious atmosphere of Japanese folklore.
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4, 'vibrato_rate': 3.5, 'vibrato_depth': 3,
'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.4, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
},
"Zelda (NES)": {
# Description: The classic pure triangle wave lead, perfect for heroic and adventurous overworld themes.
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.3, 'vibrato_rate': 4.5, 'vibrato_depth': 4,
'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.15, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
},
# --- JRPG & System Classics ---
"Falcom Ys (Rock Lead)": {
# Description: A powerful sawtooth with slight distortion, emulating the driving rock organ and guitar leads of action JRPGs.
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15, 'vibrato_rate': 5.5, 'vibrato_depth': 6,
'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.15, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
},
"Final Fantasy (Arpeggio)": {
# Description: A perfect, clean square wave with zero vibrato, creating the iconic, crystal-clear arpeggio sound.
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22, 'vibrato_rate': 5.0, 'vibrato_depth': 0,
'smooth_notes': True, 'continuous_vibrato': False, 'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
},
"Castlevania (Akumajō Dracula)": {
# Description: A sharp square wave with dramatic vibrato, ideal for fast, gothic, and baroque-inspired melodies.
'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18, 'vibrato_rate': 6.5, 'vibrato_depth': 6,
'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.35, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
},
"Pokémon (Game Boy Classics)": {
# Description: A full, friendly square wave sound, capturing the cheerful and adventurous spirit of early handheld RPGs.
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22, 'vibrato_rate': 5.0, 'vibrato_depth': 5,
'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
},
# --- Advanced System Impressions ---
"Commodore 64 (SID Feel)": {
# Description: (Impression) Uses high-speed, shallow vibrato to mimic the characteristic "buzzy" texture of the SID chip's PWM.
'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.25, 'vibrato_rate': 8.0, 'vibrato_depth': 4,
'smooth_notes': True, 'continuous_vibrato': False, 'bass_boost_level': 0.2, 'noise_level': 0.05, 'distortion_level': 0.1, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
},
"Megadrive/Genesis (FM Grit)": {
# Description: (Impression) Uses FM, distortion, and noise to capture the gritty, metallic, and aggressive tone of the YM2612 chip.
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18, 'vibrato_rate': 0.0, 'vibrato_depth': 0,
'smooth_notes': False, 'continuous_vibrato': True, 'bass_boost_level': 0.4, 'noise_level': 0.1, 'distortion_level': 0.2, 'fm_modulation_depth': 0.2, 'fm_modulation_rate': 150
},
"PC-98 (Touhou Feel)": {
# Description: (Impression) A very sharp square wave with fast FM, emulating the bright, high-energy leads of Japanese PC games.
'waveform_type': 'Square', 'pulse_width': 0.15, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.12, 'vibrato_rate': 7.5, 'vibrato_depth': 7,
'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.1, 'fm_modulation_rate': 200
},
"Roland SC-88 (GM Vibe)": {
# Description: (Impression) A clean, stable triangle wave with no effects, mimicking the polished, sample-based sounds of General MIDI.
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.35, 'vibrato_rate': 0, 'vibrato_depth': 0,
'smooth_notes': True, 'continuous_vibrato': False, 'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
},
# --- Experimental & Sound FX ---
"Sci-Fi Energy Field": {
# Description: (SFX) High-speed vibrato and noise create a constant, shimmering hum suitable for energy shields or force fields.
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4, 'vibrato_rate': 10.0, 'vibrato_depth': 3,
'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.1, 'noise_level': 0.1, 'distortion_level': 0.0, 'fm_modulation_depth': 0.05, 'fm_modulation_rate': 50
},
"Industrial Alarm": {
# Description: (SFX) Extreme vibrato rate on a sawtooth wave produces a harsh, metallic, dissonant alarm sound.
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2, 'vibrato_rate': 15.0, 'vibrato_depth': 8,
'smooth_notes': False, 'continuous_vibrato': False, 'bass_boost_level': 0.3, 'noise_level': 0.2, 'distortion_level': 0.3, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
},
"Laser Charge-Up": {
# Description: (SFX) Extreme vibrato depth creates a dramatic, rising pitch effect, perfect for sci-fi weapon sounds.
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.3, 'vibrato_rate': 4.0, 'vibrato_depth': 25,
'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
},
"Unstable Machine Core": {
# Description: (SFX) Maximum depth and distortion create a chaotic, atonal noise, simulating a machine on the verge of exploding.
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5, 'vibrato_rate': 1.0, 'vibrato_depth': 50,
'smooth_notes': False, 'continuous_vibrato': True, 'bass_boost_level': 0.5, 'noise_level': 0.3, 'distortion_level': 0.4, 'fm_modulation_depth': 0.5, 'fm_modulation_rate': 10
},
"Hardcore Gabber Kick": {
# Description: (Experimental) Maximum bass boost and distortion create an overwhelmingly powerful, clipped kick drum sound.
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.1, 'vibrato_rate': 0, 'vibrato_depth': 0,
'smooth_notes': False, 'continuous_vibrato': False, 'bass_boost_level': 0.8, 'noise_level': 0.2, 'distortion_level': 0.5, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
},
# --- Utility ---
"Generic Chiptune Loop": {
# Description: A well-balanced, pleasant square wave lead that serves as a great starting point for custom sounds.
'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2, 'vibrato_rate': 5.5, 'vibrato_depth': 4,
'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
},
"Dark/Boss Atmosphere": {
# Description: An aggressive sawtooth with heavy bass and distortion, perfect for tense or menacing background music.
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.35, 'vibrato_rate': 7.0, 'vibrato_depth': 12,
'smooth_notes': False, 'continuous_vibrato': False, 'bass_boost_level': 0.4, 'noise_level': 0.15, 'distortion_level': 0.25, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
}
}
app = gr.Blocks(theme=gr.themes.Base())
with app:
gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>Audio-to-MIDI & Advanced Renderer</h1>")
gr.Markdown(
"**Upload a Audio for transcription-then-rendering, or a MIDI for rendering-only.**\n\n"
"This application combines piano audio transcription with a powerful MIDI transformation and rendering toolkit. "
"Based on the work of [asigalov61](https://github.com/asigalov61)."
)
with gr.Row():
waveform_options = gr.WaveformOptions(show_recording_waveform=False)
with gr.Column(scale=1):
# --- INPUT COLUMN ---
gr.Markdown("## 1. Upload File")
# Changed from gr.File to gr.Audio to allow for audio preview.
# type="filepath" ensures the component returns a string path to the uploaded file.
# The component will show a player for supported audio types (e.g., WAV, MP3).
input_file = gr.Audio(
label="Input Audio or MIDI File",
type="filepath",
sources=["upload"], waveform_options=waveform_options
)
gr.Markdown("## 2. Configure Processing")
# --- Transcription Method Selector ---
transcription_method = gr.Radio(
["General Purpose", "Piano-Specific"],
label="Audio Transcription Method",
value="General Purpose",
info="Choose 'General Purpose' for most music (vocals, etc.). Choose 'Piano-Specific' only for solo piano recordings."
)
# --- Stereo Processing Checkbox ---
enable_stereo_processing = gr.Checkbox(
label="Enable Stereo Transcription",
value=False,
info="If checked, left/right audio channels are transcribed separately and merged. Doubles processing time."
)
with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="On-set Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.")
frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. Higher is stricter.")
minimum_note_length = gr.Slider(10, 500, value=128, step=1, label="Minimum Note Length (ms)", info="Filters out very short, noisy notes.")
minimum_frequency = gr.Slider(0, 500, value=60, step=5, label="Minimum Frequency (Hz)", info="Ignores pitches below this frequency.")
maximum_frequency = gr.Slider(501, 10000, value=4000, step=10, label="Maximum Frequency (Hz)", info="Ignores pitches above this frequency.")
infer_onsets = gr.Checkbox(value=True, label="Infer Onsets (Boost Onsets)")
melodia_trick = gr.Checkbox(value=True, label="Melodia Trick (Contour Optimization)")
multiple_pitch_bends = gr.Checkbox(value=False, label="Allow Multiple Pitch Bends")
# --- Rendering Settings ---
render_type = gr.Radio(
["Render as-is", "Custom render", "Extract melody", "Flip", "Reverse", "Repair Durations", "Repair Chords", "Remove Duplicate Pitches", "Longest Repeating Phrase", "Multi-Instrumental Summary", "Solo Piano Summary", "Add Drum Track"],
label="MIDI Transformation Render Type",
value="Render as-is",
info="Apply transformations to the MIDI before rendering. Select 'Render as-is' for basic rendering or other options for transformations."
)
# --- SoundFont Bank with 8-bit option ---
# --- Dynamically create the list of choices ---
soundfont_choices = [SYNTH_8_BIT_LABEL] + list(soundfonts_dict.keys())
# Set a safe default value
default_sf_choice = "SGM-v2.01-YamahaGrand-Guit-Bass-v2.7" if "SGM-v2.01-YamahaGrand-Guit-Bass-v2.7" in soundfonts_dict else (soundfont_choices[0] if soundfont_choices else "")
soundfont_bank = gr.Dropdown(
soundfont_choices,
label="SoundFont / Synthesizer",
value=default_sf_choice
)
render_sample_rate = gr.Radio(
["16000", "32000", "44100"],
label="Audio Sample Rate",
value="44100"
)
# --- 8-bit Synthesizer Settings ---
#
# =================================================================================
# === 8-Bit Synthesizer Parameter Guide ===
# =================================================================================
#
# --- Basic Tone Shaping ---
#
# Waveform Type: The fundamental timbre of the sound.
# - Square: The classic, bright, somewhat hollow sound of the NES. Its tone is heavily modified by Pulse Width.
# - Sawtooth: Aggressive, buzzy, and rich. Great for intense leads or gritty basslines.
# - Triangle: Soft, pure, and flute-like. Often used for basslines or gentler melodies.
#
# Pulse Width (Square Wave Only): Modifies the character of the Square wave.
# - Low (near 0.1) or High (near 0.9): Creates a thin, sharp, or nasal sound. A common choice for classic leads.
# - Mid (near 0.5): A "perfect" square wave. The sound is full, round, and most robust.
#
# Envelope Type: Shapes the volume of each note over its duration.
# - Plucky (AD): Creates a percussive, short sound that attacks instantly and then fades. Ideal for fast melodies and arpeggios.
# - Sustained (Full Decay): Creates a held-out sound that lasts for the note's full duration. Ideal for pads and atmospheric sounds.
#
# Decay Time (s): Controls how long a note's sound lasts (in the Plucky envelope).
# - Low: Very short, staccato notes.
# - High: Longer, more resonant notes that can bleed into each other.
#
# Bass Boost Level: Mixes in a sub-octave (a square wave one octave lower).
# - Low (or 0): The pure, original waveform.
# - High: Adds significant weight, thickness, and power to the sound.
#
# --- Modulation & Performance ---
#
# Vibrato Rate (Hz): The SPEED of the pitch wobble.
# - Low: A slow, gentle wavering effect.
# - High (8Hz+): A fast, frantic buzzing or trembling effect. Can create "ring-mod" style sounds at extreme values.
#
# Vibrato Depth (Hz): The INTENSITY of the pitch wobble.
# - Low (or 0): A very subtle effect, or no vibrato at all.
# - High: An extreme, dramatic pitch bend. Can sound chaotic or like a siren at extreme values.
#
# Smooth Notes (Checkbox):
# - Enabled: Applies a tiny fade-in/out to reduce clicking artifacts. Makes the sound slightly softer but cleaner.
# - Disabled: More abrupt, harsh note onsets. Can be desirable for an aggressive sound.
#
# Continuous Vibrato (Checkbox):
# - Enabled: The vibrato is smooth and connected across a musical phrase, creating a "singing" or legato effect.
# - Disabled: The vibrato resets on each new note, creating a bouncy, per-note, staccato effect (key for the "Mario" style).
#
# --- FX & Advanced Synthesis ---
#
# Noise Level: Mixes in white noise with the main waveform.
# - Low (or 0): No noise.
# - High: Adds "air," "grit," or a "hissing" quality. Essential for simulating percussion or creating wind-like sound effects.
#
# Distortion Level: Applies a wave-shaping algorithm to make the sound harsher.
# - Low (or 0): The clean, original sound.
# - High: Progressively crushes and saturates the waveform, creating a very aggressive, "fuzzy" or "broken" tone.
#
# FM Depth (Frequency Modulation): Controls the intensity of the frequency modulation.
# - Low (or 0): No FM effect.
# - High: The main frequency is more heavily altered by the FM Rate, creating complex, bell-like, metallic, or dissonant tones.
#
# FM Rate (Frequency Modulation): Controls the speed of the modulating oscillator.
# - Low: Creates a slow, vibrato-like or "wobbling" FM effect.
# - High: Creates fast modulation, resulting in bright, complex, often metallic harmonics and sidebands.
# =================================================================================
#
with gr.Accordion("8-bit Synthesizer Settings", open=False, visible=False) as synth_8bit_settings:
# --- ADDED: Preset selector dropdown ---
s8bit_preset_selector = gr.Dropdown(
choices=["Custom"] + list(S8BIT_PRESETS.keys()),
value="Custom",
label="Style Preset",
info="Select a preset to auto-fill the settings below. Choose 'Custom' for manual control.\nFor reference and entertainment only. These presets are not guaranteed to be perfectly accurate."
)
s8bit_waveform_type = gr.Dropdown(['Square', 'Sawtooth', 'Triangle'], value='Square', label="Waveform Type")
s8bit_pulse_width = gr.Slider(0.01, 0.99, value=0.5, step=0.01, label="Pulse Width (Square Wave Only)")
s8bit_envelope_type = gr.Dropdown(['Plucky (AD Envelope)', 'Sustained (Full Decay)'], value='Plucky (AD Envelope)', label="Envelope Type")
s8bit_decay_time_s = gr.Slider(0.01, 0.6, value=0.1, step=0.01, label="Decay Time (s)")
s8bit_vibrato_rate = gr.Slider(0, 20, value=5, label="Vibrato Rate (Hz)")
s8bit_vibrato_depth = gr.Slider(0, 50, value=0, label="Vibrato Depth (Hz)")
s8bit_bass_boost_level = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="Bass Boost Level", info="Adjusts the volume of the sub-octave. 0 is off.")
s8bit_smooth_notes = gr.Checkbox(value=True, label="Smooth Notes", info="Applies a tiny fade-in/out to notes to reduce clicking.")
s8bit_continuous_vibrato = gr.Checkbox(value=True, label="Continuous Vibrato", info="Prevents vibrato from resetting on each note.")
# --- New accordion for advanced effects ---
with gr.Accordion("Advanced Synthesis & FX", open=False):
s8bit_noise_level = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Noise Level", info="Mixes in white noise. Great for percussion or adding 'air'.")
s8bit_distortion_level = gr.Slider(minimum=0.0, maximum=0.9, value=0.0, step=0.05, label="Distortion Level", info="Applies wave-shaping distortion for a grittier, harsher sound.")
s8bit_fm_modulation_depth = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="FM Depth", info="Depth of Frequency Modulation. Creates complex, metallic, or bell-like tones.")
s8bit_fm_modulation_rate = gr.Slider(minimum=0.0, maximum=500.0, value=0.0, step=1.0, label="FM Rate", info="Rate of Frequency Modulation. Higher values create brighter, more complex harmonics.")
# --- Original Advanced Options (Now tied to Piano-Specific) ---
with gr.Accordion("Advanced MIDI Rendering Options", open=False) as advanced_rendering_options:
render_with_sustains = gr.Checkbox(label="Apply sustain pedal effects (if present)", value=True)
render_output_as_solo_piano = gr.Checkbox(label="Convert to Solo Piano (Grand Piano patch)", value=False)
render_remove_drums = gr.Checkbox(label="Remove drum track", value=False)
render_transpose_to_C4 = gr.Checkbox(label="Transpose entire score to center around C4", value=False)
render_transpose_value = gr.Slider(-12, 12, value=0, step=1, label="Transpose (semitones)")
custom_render_patch = gr.Slider(-1, 127, value=-1, step=1, label="Force MIDI Patch (-1 to disable)")
merge_misaligned_notes = gr.Slider(-1, 127, value=-1, label="Time to merge notes in ms (-1 to disable)")
render_align = gr.Radio(
["Do not align", "Start Times", "Start Times and Durations", "Start Times and Split Durations"],
label="Align notes to musical bars",
value="Do not align"
)
submit_btn = gr.Button("Process and Render", variant="primary")
# Results side of the layout (scale=2). The seven output components created
# here are exactly the ones listed in all_outputs further down.
with gr.Column(scale=2):
# --- OUTPUT COLUMN ---
gr.Markdown("## 3. Results")
# Text fields for the title and a three-line description of the MIDI.
output_midi_title = gr.Textbox(label="MIDI Title")
output_song_description = gr.Textbox(label="MIDI Description", lines=3)
# Rendered audio, requested in WAV format; waveform_options is defined outside this section.
output_audio = gr.Audio(label="Rendered Audio Output", format="wav", waveform_options=waveform_options)
output_plot = gr.Plot(label="MIDI Score Plot")
# Horizontal row for the processed-MIDI download.
with gr.Row():
output_midi = gr.File(label="Download Processed MIDI File", file_types=[".mid"])
output_midi_md5 = gr.Textbox(label="Output MIDI MD5 Hash")
output_midi_summary = gr.Textbox(label="MIDI metadata summary", lines=4)
# Components whose current values are handed to process_and_render_file when
# the submit button is clicked. Gradio passes them positionally in list order,
# so this order has to stay in step with that function's parameters.
# The preset selector is deliberately absent: it only rewrites the s8bit_*
# controls and is not a direct input to the final processing.
all_inputs = [
    # Source file and transcription mode
    input_file,
    enable_stereo_processing,
    transcription_method,
    # Transcription thresholds and options
    onset_threshold,
    frame_threshold,
    minimum_note_length,
    minimum_frequency,
    maximum_frequency,
    infer_onsets,
    melodia_trick,
    multiple_pitch_bends,
    # Renderer selection
    render_type,
    soundfont_bank,
    render_sample_rate,
    # MIDI rendering options
    render_with_sustains,
    merge_misaligned_notes,
    custom_render_patch,
    render_align,
    render_transpose_value,
    render_transpose_to_C4,
    render_output_as_solo_piano,
    render_remove_drums,
    # 8-bit synthesizer: core controls
    s8bit_waveform_type,
    s8bit_envelope_type,
    s8bit_decay_time_s,
    s8bit_pulse_width,
    s8bit_vibrato_rate,
    s8bit_vibrato_depth,
    s8bit_bass_boost_level,
    s8bit_smooth_notes,
    s8bit_continuous_vibrato,
    # 8-bit synthesizer: advanced effects
    s8bit_noise_level,
    s8bit_distortion_level,
    s8bit_fm_modulation_depth,
    s8bit_fm_modulation_rate,
]
# Components updated by the submit handler. Keep this order in sync with what
# process_and_render_file returns.
all_outputs = [
    output_midi_md5,
    output_midi_title,
    output_midi_summary,
    output_midi,
    output_audio,
    output_plot,
    output_song_description,
]
# 8-bit synthesizer controls that the preset updater writes to. Keep this
# order in sync with what apply_8bit_preset returns.
s8bit_updater_outputs = [
    s8bit_waveform_type,
    s8bit_pulse_width,
    s8bit_envelope_type,
    s8bit_decay_time_s,
    s8bit_vibrato_rate,
    s8bit_vibrato_depth,
    s8bit_smooth_notes,
    s8bit_continuous_vibrato,
    s8bit_bass_boost_level,
    s8bit_noise_level,
    s8bit_distortion_level,
    s8bit_fm_modulation_depth,
    s8bit_fm_modulation_rate,
]
# --- Event Handling ---
# Clicking the submit button runs the full pipeline: every component in
# all_inputs is read, and the results are written to all_outputs.
submit_btn.click(fn=process_and_render_file, inputs=all_inputs, outputs=all_outputs)
# --- Listeners for dynamic UI updates ---
# Both selectors use the same wiring: whenever either one changes,
# update_ui_visibility receives the current values of both and updates the
# general-transcription and 8-bit synthesizer settings containers.
_visibility_wiring = dict(
    fn=update_ui_visibility,
    inputs=[transcription_method, soundfont_bank],
    outputs=[general_transcription_settings, synth_8bit_settings],
)
transcription_method.change(**_visibility_wiring)
soundfont_bank.change(**_visibility_wiring)
# --- Event listener for the preset selector ---
# Choosing a preset calls apply_8bit_preset with the selected preset name;
# the values it returns are written to the individual 8-bit setting
# components listed in s8bit_updater_outputs.
s8bit_preset_selector.change(
    apply_8bit_preset,
    inputs=[s8bit_preset_selector],
    outputs=s8bit_updater_outputs,
)
# Launch the Gradio app with the request queue enabled.
# queue() returns the same Blocks object, so enabling the queue and launching
# can be written as two separate statements.
app.queue()
app.launch(inbrowser=True, debug=True)