| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import os |
| import hashlib |
| import time as reqtime |
| import copy |
| import librosa |
| import pyloudnorm as pyln |
| import soundfile as sf |
|
|
| import torch |
| import gradio as gr |
|
|
| from src.piano_transcription.utils import initialize_app |
|
|
| from piano_transcription_inference import PianoTranscription, utilities, sample_rate as transcription_sample_rate |
|
|
| |
| from src import TMIDIX, TPLOTS |
| from src import MIDI |
| from src.midi_to_colab_audio import midi_to_colab_audio |
|
|
| |
| import basic_pitch |
| from basic_pitch.inference import predict |
| from basic_pitch import ICASSP_2022_MODEL_PATH |
|
|
| |
| import pretty_midi |
| import numpy as np |
| from scipy import signal |
|
|
| |
| |
| |
| from huggingface_hub import hf_hub_download |
| import glob |
|
|
| |
# Soundfont-choice label that stands for "no SoundFont": Render_MIDI compares
# the selected soundfont bank against it to pick the built-in 8-bit synthesizer,
# and update_ui_visibility uses it to show the 8-bit settings.
SYNTH_8_BIT_LABEL = "None (8-bit Synthesizer)"
|
|
def prepare_soundfonts():
    """
    Make sure the default SoundFonts exist locally, then index every .sf2 file.

    Each default SoundFont missing from 'src/sf2' is downloaded from the
    Hugging Face Space repository. A failed download is reported and skipped
    so that the SoundFonts which are available can still be used. Afterwards
    'src/sf2' is scanned recursively for all .sf2 files.

    Returns:
        dict: Maps a display name (the path relative to 'src/sf2' without its
            extension, using forward slashes) to the SoundFont file path.
            Default SoundFonts that are present come first, in their listed
            order, followed by every other SoundFont sorted by display name.
    """
    SF2_REPO_ID = "asigalov61/Advanced-MIDI-Renderer"
    SF2_DIR = "src/sf2"

    DEFAULT_SF2_FILENAMES = [
        "SGM-v2.01-YamahaGrand-Guit-Bass-v2.7.sf2",
        "Orpheus_18.06.2020.sf2",
        "Live HQ Natural SoundFont GM.sf2",
        "Nice-Strings-PlusOrchestra-v1.6.sf2",
        "KBH-Real-Choir-V2.5.sf2",
        "SuperGameBoy.sf2",
        "ProtoSquare.sf2",
    ]

    os.makedirs(SF2_DIR, exist_ok=True)

    print("Checking for SoundFont files...")
    for filename in DEFAULT_SF2_FILENAMES:
        if os.path.exists(os.path.join(SF2_DIR, filename)):
            continue

        print(f"Downloading '{filename}' from Hugging Face Hub...")
        try:
            hf_hub_download(
                repo_id=SF2_REPO_ID,
                repo_type='space',
                filename=filename,
                local_dir=SF2_DIR,
            )
        except Exception as e:
            # Best effort: one unavailable SoundFont must not prevent the
            # others from being listed.
            print(f"Error downloading {filename}: {e}")
        else:
            print(f"'{filename}' downloaded successfully.")

    print(f"Scanning '{SF2_DIR}' for all .sf2 files...")
    all_sfs_map = {}
    search_pattern = os.path.join(SF2_DIR, '**', '*.sf2')
    for full_path in glob.glob(search_pattern, recursive=True):
        relative_path = os.path.relpath(full_path, SF2_DIR)
        # Normalise Windows separators so display names look the same everywhere.
        display_name = os.path.splitext(relative_path)[0].replace("\\", "/")
        all_sfs_map[display_name] = full_path

    default_display_names = [os.path.splitext(f)[0] for f in DEFAULT_SF2_FILENAMES]
    default_name_set = set(default_display_names)

    # Defaults first (only those actually found), in their declared order.
    ordered_soundfont_map = {
        name: all_sfs_map[name]
        for name in default_display_names
        if name in all_sfs_map
    }
    # Then everything else, alphabetically.
    for name in sorted(all_sfs_map):
        if name not in default_name_set:
            ordered_soundfont_map[name] = all_sfs_map[name]

    return ordered_soundfont_map
|
|
| |
| |
| |
def synthesize_8bit_style(midi_data, waveform_type, envelope_type, decay_time_s, pulse_width,
                          vibrato_rate, vibrato_depth, bass_boost_level, fs=44100,
                          smooth_notes=False, continuous_vibrato=False, noise_level=0.0,
                          distortion_level=0.0, fm_modulation_depth=0.0, fm_modulation_rate=0.0):
    """
    Synthesize an 8-bit style stereo waveform from a PrettyMIDI-like object.

    Waveforms are generated note by note with NumPy/SciPy instead of a
    synthesizer such as FluidSynth. When the file has two or more instruments,
    the first is panned hard left and the second hard right; every other
    instrument (and a lone instrument) is centred at 0.707 per channel.

    Args:
        midi_data: Object exposing get_end_time() and an `instruments` list
            whose items have `notes` with pitch/start/end/velocity.
        waveform_type (str): 'Square', 'Sawtooth' or 'Triangle'.
        envelope_type (str): 'Plucky (AD Envelope)' or 'Sustained (Full Decay)'.
            Any other value leaves the envelope at zero (silent notes).
        decay_time_s (float): Decay length of the plucky envelope, in seconds.
        pulse_width (float): Duty cycle of the square wave.
        vibrato_rate (float): Vibrato LFO frequency in Hz.
        vibrato_depth (float): Vibrato depth in Hz added to the note frequency.
        bass_boost_level (float): Mix level of the sub-octave square wave;
            0 disables it.
        fs (int): Output sample rate.
        smooth_notes (bool): Apply a short fade-in and release to each note.
        continuous_vibrato (bool): Keep the vibrato LFO phase running across
            notes instead of restarting it on each note.
        noise_level (float): Amount of uniform noise added to each note.
        distortion_level (float): Strength of the power-curve distortion.
        fm_modulation_depth (float): Relative depth of frequency modulation;
            0 disables it.
        fm_modulation_rate (float): FM modulator frequency in Hz.

    Returns:
        np.ndarray: Array of shape (2, samples) holding left and right
            channels. It is not normalised; one extra second of padding
            follows the MIDI end time.

    Raises:
        ValueError: If a note has to be rendered with an unknown waveform_type.
    """
    two_pi = 2 * np.pi

    def _oscillator(phase):
        """Return the selected basic waveform evaluated at `phase`."""
        if waveform_type == 'Square':
            return signal.square(phase, duty=pulse_width)
        if waveform_type == 'Sawtooth':
            return signal.sawtooth(phase)
        if waveform_type == 'Triangle':
            return signal.sawtooth(phase, width=0.5)
        # Previously an unknown type crashed with UnboundLocalError or silently
        # reused the previous note's buffer.
        raise ValueError(f"Unsupported waveform type: {waveform_type!r}")

    total_duration = midi_data.get_end_time()
    # One extra second so the last note always fits.
    waveform = np.zeros((2, int(total_duration * fs) + fs))
    total_samples = waveform.shape[1]

    num_instruments = len(midi_data.instruments)

    # Shared across all notes and instruments when continuous_vibrato is on.
    vibrato_phase = 0.0

    for i, instrument in enumerate(midi_data.instruments):
        pan_l, pan_r = 0.707, 0.707
        if num_instruments >= 2:
            if i == 0:
                pan_l, pan_r = 1.0, 0.0
            elif i == 1:
                pan_l, pan_r = 0.0, 1.0

        # Oscillator phase carried from note to note within one instrument.
        osc_phase = 0.0

        for note in instrument.notes:
            freq = pretty_midi.note_number_to_hz(note.pitch)
            num_samples = int((note.end - note.start) * fs)
            if num_samples <= 0:
                continue

            t = np.arange(num_samples) / fs

            if continuous_vibrato:
                vib_phase_inc = two_pi * vibrato_rate / fs
                vib_phase_array = vibrato_phase + np.arange(num_samples) * vib_phase_inc
                vibrato_phase = (vib_phase_array[-1] + vib_phase_inc) % two_pi
                vibrato_lfo = vibrato_depth * np.sin(vib_phase_array)
            else:
                vibrato_lfo = vibrato_depth * np.sin(two_pi * vibrato_rate * t)

            # FM is folded into the instantaneous frequency *before* the
            # oscillator runs. Re-synthesising the note afterwards (as the
            # code used to) threw away vibrato, bass boost, noise and
            # distortion and advanced the oscillator phase twice.
            instantaneous_freq = freq + vibrato_lfo
            if fm_modulation_depth > 0:
                instantaneous_freq = instantaneous_freq * (
                    1 + fm_modulation_depth * np.sin(two_pi * fm_modulation_rate * t)
                )

            phase = osc_phase + np.cumsum(two_pi * instantaneous_freq / fs)
            osc_phase = phase[-1] % two_pi
            note_waveform = _oscillator(phase)

            if bass_boost_level > 0:
                bass_freq = freq / 2.0
                # Skip sub-octaves at or below 20 Hz.
                if bass_freq > 20:
                    bass_phase_inc = two_pi * bass_freq / fs
                    bass_phase = np.cumsum(np.full(num_samples, bass_phase_inc))
                    bass_sub_waveform = signal.square(bass_phase, duty=0.5)

                    # Duck the main voice as the sub voice is brought in.
                    main_level = 1.0 - (0.5 * bass_boost_level)
                    note_waveform = (note_waveform * main_level) + (bass_sub_waveform * bass_boost_level)

            if noise_level > 0:
                noise_waveform = np.random.uniform(-1, 1, num_samples)
                note_waveform += noise_waveform * noise_level

            if distortion_level > 0:
                note_waveform = np.sign(note_waveform) * np.abs(note_waveform) ** (1.0 - distortion_level)

            start_amp = note.velocity / 127.0
            envelope = np.zeros(num_samples)

            if envelope_type == 'Plucky (AD Envelope)':
                attack_time_s = 0.005
                attack_samples = min(int(attack_time_s * fs), num_samples)
                decay_samples = min(int(decay_time_s * fs), num_samples - attack_samples)

                envelope[:attack_samples] = np.linspace(0, start_amp, attack_samples)
                if decay_samples > 0:
                    envelope[attack_samples:attack_samples + decay_samples] = np.linspace(start_amp, 0, decay_samples)
            elif envelope_type == 'Sustained (Full Decay)':
                envelope = np.linspace(start_amp, 0, num_samples)

            if smooth_notes:
                release_samples = min(int(0.005 * fs), num_samples)
                # envelope[-0:] is the whole array, so a zero-length release
                # must be skipped rather than multiplied by an empty ramp.
                if release_samples > 0:
                    envelope[-release_samples:] *= np.linspace(1, 0, release_samples)
                fade_in_samples = min(10, num_samples)
                envelope[:fade_in_samples] *= np.linspace(0.5, 1, fade_in_samples)

            note_waveform *= envelope

            start_sample = int(note.start * fs)
            if start_sample >= total_samples:
                continue
            end_sample = min(start_sample + num_samples, total_samples)
            note_waveform = note_waveform[:end_sample - start_sample]

            waveform[0, start_sample:end_sample] += note_waveform * pan_l
            waveform[1, start_sample:end_sample] += note_waveform * pan_r

    return waveform
|
|
|
|
def analyze_midi_velocity(midi_path):
    """
    Print note-velocity statistics for a MIDI file.

    Reports the note count and the min, max and mean velocity for every
    instrument, followed by the same figures for all notes combined.
    Nothing is returned.

    Args:
        midi_path (str): Path of the MIDI file to load with pretty_midi.
    """
    def _print_velocity_range(values):
        print(f" Velocity min: {min(values)}")
        print(f" Velocity max: {max(values)}")
        print(f" Velocity mean: {np.mean(values):.2f}")

    midi = pretty_midi.PrettyMIDI(midi_path)
    print(f"Analyzing velocity for MIDI: {midi_path}")

    pooled = []
    for index, instrument in enumerate(midi.instruments):
        velocities = [note.velocity for note in instrument.notes]
        pooled += velocities

        if not velocities:
            print(f"Instrument {index} ({instrument.name}): no notes found.")
            continue

        print(f"Instrument {index} ({instrument.name}):")
        print(f" Notes count: {len(velocities)}")
        _print_velocity_range(velocities)

    if not pooled:
        print("No notes found in this MIDI.")
        return

    print("\nOverall MIDI velocity stats:")
    print(f" Total notes: {len(pooled)}")
    _print_velocity_range(pooled)
|
|
|
|
def scale_instrument_velocity(instrument, scale=0.8):
    """
    Scale the velocity of every note of `instrument` in place.

    Each velocity is multiplied by `scale`, truncated to an integer and
    clamped to the range 1..127.

    Args:
        instrument: Object with a `notes` list whose items have `velocity`.
        scale (float): Multiplier applied to each velocity.
    """
    for note in instrument.notes:
        scaled = int(note.velocity * scale)
        if scaled > 127:
            scaled = 127
        if scaled < 1:
            scaled = 1
        note.velocity = scaled
|
|
|
|
def normalize_loudness(audio_data, sample_rate, target_lufs=-23.0):
    """
    Normalize audio to a target integrated loudness (LUFS).

    The integrated loudness is measured with pyloudnorm and a linear gain is
    applied to reach `target_lufs`. If that gain pushes the peak above 1.0,
    the result is peak-normalized instead so it does not clip.

    Empty or all-zero input, and input whose measured loudness is not finite,
    is returned unchanged: applying the resulting infinite gain would turn
    the signal into NaN/inf samples.

    Args:
        audio_data (np.ndarray): The audio signal.
        sample_rate (int): The sample rate of the audio.
        target_lufs (float): The target loudness in LUFS. Defaults to -23.0.

    Returns:
        np.ndarray: The loudness-normalized audio, or the original
            `audio_data` when normalization is skipped or fails.
    """
    # Digital silence has no measurable loudness; there is nothing to scale.
    if np.size(audio_data) == 0 or not np.any(audio_data):
        print("Loudness normalization skipped: audio is empty or silent.")
        return audio_data

    try:
        meter = pyln.Meter(sample_rate)
        loudness = meter.integrated_loudness(audio_data)

        # Very quiet material can still measure as -inf (or NaN); the gain
        # computed from it would be infinite.
        if not np.isfinite(loudness):
            print(f"Loudness normalization skipped: measured loudness is {loudness}.")
            return audio_data

        loudness_gain_db = target_lufs - loudness
        loudness_gain_linear = 10.0 ** (loudness_gain_db / 20.0)

        normalized_audio = audio_data * loudness_gain_linear

        # Safeguard against clipping introduced by a large positive gain.
        peak_val = np.max(np.abs(normalized_audio))
        if peak_val > 1.0:
            normalized_audio /= peak_val
            print("Warning: Loudness normalization resulted in clipping. Audio was peak-normalized as a safeguard.")

        print(f"Audio normalized from {loudness:.2f} LUFS to target {target_lufs} LUFS.")
        return normalized_audio

    except Exception as e:
        # Best effort: transcription can still proceed on the original audio.
        print(f"Loudness normalization failed: {e}. Falling back to original audio.")
        return audio_data
|
|
|
|
| |
| |
| |
def merge_midis(midi_path_left, midi_path_right, output_path):
    """
    Merge the left-channel and right-channel MIDI files into one MIDI file.

    Every instrument of both files is carried over. Each instrument has its
    velocities scaled by 0.8, its name prefixed with "Left - " or "Right - ",
    and a pan control change (CC 10) inserted at time 0: value 0 for the left
    file, value 127 for the right file. Velocity statistics are printed for
    both inputs and for the merged result.

    Args:
        midi_path_left (str): MIDI file transcribed from the left channel.
        midi_path_right (str): MIDI file transcribed from the right channel.
        output_path (str): Where the merged MIDI file is written.

    Returns:
        str | None: `output_path` on success. If anything fails,
            `midi_path_left` when that file exists, otherwise None.
    """
    try:
        for source_path in (midi_path_left, midi_path_right):
            analyze_midi_velocity(source_path)

        left_source = pretty_midi.PrettyMIDI(midi_path_left)
        right_source = pretty_midi.PrettyMIDI(midi_path_right)
        combined = pretty_midi.PrettyMIDI()

        # (source MIDI, side name used in logs, track-name prefix, CC 10 value)
        channel_plan = (
            (left_source, "left", "Left", 0),
            (right_source, "right", "Right", 127),
        )

        for source, side, prefix, pan_value in channel_plan:
            tracks = source.instruments
            if not tracks:
                continue

            print(f"Found {len(tracks)} instrument(s) in the {side} channel MIDI.")
            for track in tracks:
                scale_instrument_velocity(track, scale=0.8)
                track.name = f"{prefix} - {track.name if track.name else 'Instrument'}"

                # Put the pan event first so it precedes the track's other
                # control changes.
                pan_event = pretty_midi.ControlChange(number=10, value=pan_value, time=0.0)
                track.control_changes.insert(0, pan_event)

                combined.instruments.append(track)

        combined.write(output_path)
        print(f"Successfully merged all instruments and panned into '{os.path.basename(output_path)}'")
        analyze_midi_velocity(output_path)
        return output_path

    except Exception as e:
        print(f"Error merging MIDI files: {e}")

        if not os.path.exists(midi_path_left):
            return None
        print("Fallback: Using only the left channel MIDI.")
        return midi_path_left
|
|
|
|
| |
| |
| |
|
|
def TranscribePianoAudio(input_file):
    """
    Transcribe an audio file of a solo piano performance into a MIDI file.

    The audio is loaded as mono at the transcription model's sample rate and
    transcribed with `PianoTranscription` (piano_transcription_inference),
    using CUDA when available. The model is loaded on every call.

    Args:
        input_file (str): The path to the input audio file.

    Returns:
        str: The path of the generated MIDI file,
            'output/transcribed_piano_/<input name without extension>.mid'.
    """
    print('=' * 70)
    print('STAGE 1: Starting Piano-Specific Transcription')
    print('=' * 70)

    fn = os.path.basename(input_file)
    # Strip only the final extension. Splitting on the first dot mapped e.g.
    # 'take.1_left.wav' and 'take.1_right.wav' to the same 'take.mid', so the
    # second transcription overwrote the first.
    fn1 = os.path.splitext(fn)[0]

    output_dir = os.path.join("output", "transcribed_piano_")
    out_mid_path = os.path.join(output_dir, fn1 + '.mid')

    os.makedirs(output_dir, exist_ok=True)

    print('-' * 70)
    print(f'Input file name: {fn}')
    print(f'Output MIDI path: {out_mid_path}')
    print('-' * 70)

    print('Loading audio...')
    (audio, _) = utilities.load_audio(input_file, sr=transcription_sample_rate, mono=True)
    print('Audio loaded successfully.')
    print('-' * 70)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'Loading transcriptor model... device= {device}')
    transcriptor = PianoTranscription(device=device, checkpoint_path="src/models/CRNN_note_F1=0.9677_pedal_F1=0.9186.pth")
    print('Transcriptor loaded.')
    print('-' * 70)

    print('Transcribing audio to MIDI (Piano-Specific)...')
    # transcribe() writes the MIDI file to out_mid_path itself.
    transcriptor.transcribe(audio, out_mid_path)
    print('Piano transcription complete.')
    print('=' * 70)

    return out_mid_path
|
|
def TranscribeGeneralAudio(input_file, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool):
    """
    Transcribe a general audio file into a MIDI file using basic-pitch.

    All tuning arguments are forwarded unchanged to
    `basic_pitch.inference.predict`.

    Args:
        input_file (str): The path to the input audio file.
        onset_thresh: Passed as `onset_threshold`.
        frame_thresh: Passed as `frame_threshold`.
        min_note_len: Passed as `minimum_note_length`.
        min_freq: Passed as `minimum_frequency`.
        max_freq: Passed as `maximum_frequency`.
        infer_onsets_bool: Passed as `infer_onsets`.
        melodia_trick_bool: Passed as `melodia_trick`.
        multiple_bends_bool: Passed as `multiple_pitch_bends`.

    Returns:
        str: The path of the generated MIDI file,
            'output/transcribed_general_/<input name without extension>.mid'.
    """
    print('=' * 70)
    print('STAGE 1: Starting General Purpose Transcription')
    print('=' * 70)

    fn = os.path.basename(input_file)
    # Strip only the final extension. Splitting on the first dot mapped e.g.
    # 'take.1_left.wav' and 'take.1_right.wav' to the same 'take.mid', so the
    # second transcription overwrote the first.
    fn1 = os.path.splitext(fn)[0]
    output_dir = os.path.join("output", "transcribed_general_")
    out_mid_path = os.path.join(output_dir, fn1 + '.mid')
    os.makedirs(output_dir, exist_ok=True)

    print(f'Input file: {fn}\nOutput MIDI: {out_mid_path}')

    print('Transcribing audio to MIDI (General Purpose)...')
    # Only the MIDI object of the returned triple is needed here.
    _, midi_data, _ = basic_pitch.inference.predict(
        audio_path=input_file,
        model_or_model_path=ICASSP_2022_MODEL_PATH,
        onset_threshold=onset_thresh,
        frame_threshold=frame_thresh,
        minimum_note_length=min_note_len,
        minimum_frequency=min_freq,
        maximum_frequency=max_freq,
        infer_onsets=infer_onsets_bool,
        melodia_trick=melodia_trick_bool,
        multiple_pitch_bends=multiple_bends_bool
    )

    midi_data.write(out_mid_path)
    print('General transcription complete.')
    print('=' * 70)

    return out_mid_path
|
|
| |
| |
| |
|
|
def Render_MIDI(input_midi_path,
                render_type,
                soundfont_bank,
                render_sample_rate,
                render_with_sustains,
                merge_misaligned_notes,
                custom_render_patch,
                render_align,
                render_transpose_value,
                render_transpose_to_C4,
                render_output_as_solo_piano,
                render_remove_drums,
                s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s,
                s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth,
                s8bit_bass_boost_level, s8bit_smooth_notes, s8bit_continuous_vibrato,
                s8bit_noise_level, s8bit_distortion_level, s8bit_fm_modulation_depth, s8bit_fm_modulation_rate
                ):
    """
    Process a MIDI file according to the UI settings and render it to audio.

    The MIDI is parsed into an enhanced score, optionally transformed
    (`render_type`, patch override, transpose, alignment, summaries, drum
    handling), written to 'output/rendered_midi/<name>_rendered.mid', and
    rendered either with the built-in 8-bit synthesizer (when
    `soundfont_bank` equals SYNTH_8_BIT_LABEL) or with a SoundFont looked up
    in the module-level `soundfonts_dict`.

    With render_type "Render as-is" the input bytes are copied unchanged to
    the output MIDI and none of the adjustment options are applied.

    Args:
        input_midi_path (str): The path to the input MIDI file.
        render_type (str): Name of the processing mode.
        soundfont_bank (str): SoundFont display name, or SYNTH_8_BIT_LABEL.
        render_sample_rate: Audio sample rate; converted with int().
        render_with_sustains: Forwarded as `apply_sustain` to the score
            processor.
        merge_misaligned_notes: Merge threshold; values <= 0 disable merging.
        custom_render_patch (int): Patch forced on all non-drum notes, or -1
            to keep the original patches.
        render_align (str): "Start Times", "Start Times and Durations" or
            "Start Times and Split Durations"; other values skip alignment.
        render_transpose_value (int): Transposition amount; 0 disables it.
        render_transpose_to_C4 (bool): Transpose the score to pitch 60.
        render_output_as_solo_piano (bool): Convert the score to solo piano.
        render_remove_drums (bool): Remove drums from the output.
        s8bit_*: Forwarded to synthesize_8bit_style.

    Returns:
        On success a 7-tuple: (md5 of the rendered MIDI, input name without
        extension, str of the MIDI metadata, rendered MIDI path,
        (sample_rate, audio), score plot, song description).
        If the input file is missing or 8-bit synthesis fails, a list of
        seven None values. If the MIDI has no processable notes, a 7-tuple of
        placeholder values.

    Raises:
        gr.Error: If a SoundFont is required but none is available.
    """
    print('*' * 70)
    print('STAGE 2: Starting MIDI Rendering')
    print('*' * 70)

    fn = os.path.basename(input_midi_path)
    # Strip only the final extension so dotted names such as 'song.v2.mid'
    # keep their full stem (consistent with process_and_render_file) and do
    # not collide in the output directory.
    fn1 = os.path.splitext(fn)[0]

    output_dir = os.path.join("output", "rendered_midi")
    os.makedirs(output_dir, exist_ok=True)

    new_fn_path = os.path.join(output_dir, fn1 + '_rendered.mid')

    try:
        with open(input_midi_path, 'rb') as f:
            fdata = f.read()
        input_midi_md5hash = hashlib.md5(fdata).hexdigest()
    except FileNotFoundError:
        print(f"Error: Input MIDI file not found at {input_midi_path}")
        return [None] * 7

    print('=' * 70)
    print('Requested settings:')
    print(f'Input MIDI file name: {fn}')
    print(f'Input MIDI md5 hash: {input_midi_md5hash}')
    print('-' * 70)
    print(f'Render type: {render_type}')
    print(f'Soundfont bank: {soundfont_bank}')
    print(f'Audio render sample rate: {render_sample_rate}')
    print('=' * 70)

    print('Processing MIDI... Please wait...')
    raw_score = MIDI.midi2single_track_ms_score(fdata)
    escore = TMIDIX.advanced_score_processor(raw_score,
                                             return_enhanced_score_notes=True,
                                             apply_sustain=render_with_sustains
                                             )[0]

    # Nothing to render: return placeholders in the same 7-slot layout.
    if not escore:
        print("Warning: MIDI file contains no processable notes.")
        return ("N/A", fn1, "MIDI file contains no notes.", None, None, None, "No notes found.")

    if merge_misaligned_notes > 0:
        escore = TMIDIX.merge_escore_notes(escore, merge_threshold=merge_misaligned_notes)

    escore = TMIDIX.augment_enhanced_score_notes(escore, timings_divider=1)

    first_note_index = [e[0] for e in raw_score[1]].index('note')
    cscore = TMIDIX.chordify_score([1000, escore])

    # Events before the first note, the first and last notes, and the last
    # raw event make up the summary shown in the UI.
    meta_data = raw_score[1][:first_note_index] + [escore[0]] + [escore[-1]] + [raw_score[1][-1]]

    aux_escore_notes = TMIDIX.augment_enhanced_score_notes(escore, sort_drums_last=True)
    song_description = TMIDIX.escore_notes_to_text_description(aux_escore_notes)

    print('Done!')
    print('=' * 70)
    print('Input MIDI metadata:', meta_data[:5])
    print('=' * 70)
    print('Input MIDI song description:', song_description)
    print('=' * 70)
    print('Processing...Please wait...')

    # Default for render types without a dedicated transformation below.
    output_score = copy.deepcopy(escore)

    if render_type == "Extract melody":
        output_score = TMIDIX.add_melody_to_enhanced_score_notes(escore, return_melody=True)
        output_score = TMIDIX.recalculate_score_timings(output_score)
    elif render_type == "Flip":
        output_score = TMIDIX.flip_enhanced_score_notes(escore)
    elif render_type == "Reverse":
        output_score = TMIDIX.reverse_enhanced_score_notes(escore)
    elif render_type == 'Repair Durations':
        output_score = TMIDIX.fix_escore_notes_durations(escore, min_notes_gap=0)
    elif render_type == 'Repair Chords':
        fixed_cscore = TMIDIX.advanced_check_and_fix_chords_in_chordified_score(cscore)[0]
        output_score = TMIDIX.flatten(fixed_cscore)
    elif render_type == 'Remove Duplicate Pitches':
        output_score = TMIDIX.remove_duplicate_pitches_from_escore_notes(escore)
    elif render_type == "Add Drum Track":
        nd_escore = [e for e in escore if e[3] != 9]
        nd_escore = TMIDIX.augment_enhanced_score_notes(nd_escore)
        output_score = TMIDIX.advanced_add_drums_to_escore_notes(nd_escore)

        # NOTE(review): presumably undoes the default timings divider applied
        # by augment_enhanced_score_notes() just above - confirm against TMIDIX.
        for e in output_score:
            e[1] *= 16
            e[2] *= 16

    print('MIDI processing complete.')
    print('=' * 70)

    if render_type != "Render as-is":
        print('Applying final adjustments (transpose, align, patch)...')
        if custom_render_patch != -1:
            for e in output_score:
                # Channel 9 holds drums; leave their patch alone.
                if e[3] != 9:
                    e[6] = custom_render_patch

        if render_transpose_value != 0:
            output_score = TMIDIX.transpose_escore_notes(output_score, render_transpose_value)

        if render_transpose_to_C4:
            output_score = TMIDIX.transpose_escore_notes_to_pitch(output_score, 60)

        if render_align == "Start Times":
            output_score = TMIDIX.recalculate_score_timings(output_score)
            output_score = TMIDIX.align_escore_notes_to_bars(output_score)

        elif render_align == "Start Times and Durations":
            output_score = TMIDIX.recalculate_score_timings(output_score)
            output_score = TMIDIX.align_escore_notes_to_bars(output_score, trim_durations=True)

        elif render_align == "Start Times and Split Durations":
            output_score = TMIDIX.recalculate_score_timings(output_score)
            output_score = TMIDIX.align_escore_notes_to_bars(output_score, split_durations=True)

        if render_type == "Longest Repeating Phrase":
            zscore = TMIDIX.recalculate_score_timings(output_score)
            lrno_score = TMIDIX.escore_notes_lrno_pattern_fast(zscore)

            if lrno_score is not None:
                output_score = lrno_score
            else:
                # No repeating phrase found: fall back to the middle of the score.
                output_score = TMIDIX.recalculate_score_timings(TMIDIX.escore_notes_middle(output_score, 50))

        if render_type == "Multi-Instrumental Summary":
            zscore = TMIDIX.recalculate_score_timings(output_score)
            c_escore_notes = TMIDIX.compress_patches_in_escore_notes_chords(zscore)

            # Short scores are left as they are.
            if len(c_escore_notes) > 128:
                cmatrix = TMIDIX.escore_notes_to_image_matrix(c_escore_notes, filter_out_zero_rows=True, filter_out_duplicate_rows=True)
                smatrix = TPLOTS.square_image_matrix(cmatrix, num_pca_components=max(1, min(5, len(c_escore_notes) // 128)))
                output_score = TMIDIX.image_matrix_to_original_escore_notes(smatrix)

                # NOTE(review): presumably converts matrix steps back to
                # millisecond timings - confirm the factor against TMIDIX.
                for o in output_score:
                    o[1] *= 250
                    o[2] *= 250

        if render_output_as_solo_piano:
            output_score = TMIDIX.solo_piano_escore_notes(output_score, keep_drums=(not render_remove_drums))

        if render_remove_drums and not render_output_as_solo_piano:
            output_score = TMIDIX.strip_drums_from_escore_notes(output_score)

        if render_type == "Solo Piano Summary":
            sp_escore_notes = TMIDIX.solo_piano_escore_notes(output_score, keep_drums=False)
            zscore = TMIDIX.recalculate_score_timings(sp_escore_notes)

            # Short scores are left as they are.
            if len(zscore) > 128:
                bmatrix = TMIDIX.escore_notes_to_binary_matrix(zscore)
                cmatrix = TMIDIX.compress_binary_matrix(bmatrix, only_compress_zeros=True)
                smatrix = TPLOTS.square_binary_matrix(cmatrix, interpolation_order=max(1, min(5, len(zscore) // 128)))
                output_score = TMIDIX.binary_matrix_to_original_escore_notes(smatrix)

                # NOTE(review): presumably converts matrix steps back to
                # millisecond timings - confirm the factor against TMIDIX.
                for o in output_score:
                    o[1] *= 200
                    o[2] *= 200

        print('Final adjustments complete.')
        print('=' * 70)

        SONG, patches, _ = TMIDIX.patch_enhanced_score_notes(output_score)

        # The converter is given the path without the '.mid' suffix.
        path_without_ext = new_fn_path.rsplit('.mid', 1)[0]

        TMIDIX.Tegridy_ms_SONG_to_MIDI_Converter(SONG,
                                                 output_signature='Integrated-MIDI-Processor',
                                                 output_file_name=path_without_ext,
                                                 track_name='Processed Track',
                                                 list_of_MIDI_patches=patches
                                                 )
        midi_to_render_path = new_fn_path
    else:
        # "Render as-is": copy the original bytes untouched.
        with open(new_fn_path, 'wb') as f:
            f.write(fdata)
        midi_to_render_path = new_fn_path

    print('Rendering final audio...')

    srate = int(render_sample_rate)

    if soundfont_bank == SYNTH_8_BIT_LABEL:
        print("Using 8-bit style synthesizer...")
        try:
            midi_data_for_synth = pretty_midi.PrettyMIDI(midi_to_render_path)

            audio = synthesize_8bit_style(
                midi_data_for_synth,
                s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s,
                s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth,
                s8bit_bass_boost_level,
                fs=srate,
                smooth_notes=s8bit_smooth_notes,
                continuous_vibrato=s8bit_continuous_vibrato,
                noise_level=s8bit_noise_level,
                distortion_level=s8bit_distortion_level,
                fm_modulation_depth=s8bit_fm_modulation_depth,
                fm_modulation_rate=s8bit_fm_modulation_rate
            )

            # Peak-normalize, then convert (2, n) floats to (n, 2) int16.
            peak_val = np.max(np.abs(audio))
            if peak_val > 0:
                audio /= peak_val

            audio_out = (audio.T * 32767).astype(np.int16)
        except Exception as e:
            print(f"Error during 8-bit synthesis: {e}")
            return [None] * 7
    else:
        print(f"Using SoundFont: {soundfont_bank}")

        # soundfonts_dict is a module-level global populated at startup.
        soundfont_path = soundfonts_dict.get(soundfont_bank)

        if not soundfont_path or not os.path.exists(soundfont_path):
            error_msg = f"SoundFont '{soundfont_bank}' not found!"
            print(f"ERROR: {error_msg}")

            if soundfonts_dict:
                # Fall back to the first available SoundFont.
                fallback_key = list(soundfonts_dict.keys())[0]
                soundfont_path = soundfonts_dict[fallback_key]
                print(f"Falling back to '{fallback_key}'.")
            else:
                raise gr.Error("No SoundFonts are available for rendering!")

        with open(midi_to_render_path, 'rb') as f:
            midi_file_content = f.read()

        audio_out = midi_to_colab_audio(midi_file_content,
                                        soundfont_path=soundfont_path,
                                        sample_rate=srate,
                                        output_for_gradio=True
                                        )

    print('Audio rendering complete.')
    print('=' * 70)

    with open(midi_to_render_path, 'rb') as f:
        new_md5_hash = hashlib.md5(f.read()).hexdigest()
    output_plot = TPLOTS.plot_ms_SONG(output_score, plot_title=f"Score of {fn1}", return_plt=True)

    output_midi_summary = str(meta_data)

    return new_md5_hash, fn1, output_midi_summary, midi_to_render_path, (srate, audio_out), output_plot, song_description
|
|
| |
| |
| |
|
|
def process_and_render_file(input_file,
                            enable_stereo_processing,
                            transcription_method,
                            onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool,
                            render_type, soundfont_bank, render_sample_rate,
                            render_with_sustains, merge_misaligned_notes, custom_render_patch, render_align,
                            render_transpose_value, render_transpose_to_C4, render_output_as_solo_piano, render_remove_drums,
                            s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s,
                            s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth,
                            s8bit_bass_boost_level, s8bit_smooth_notes, s8bit_continuous_vibrato,
                            s8bit_noise_level, s8bit_distortion_level, s8bit_fm_modulation_depth, s8bit_fm_modulation_rate
                            ):
    """
    Transcribe (when needed) and render an uploaded file.

    Files ending in .mid, .midi or .kar go straight to Render_MIDI. Any other
    file is treated as audio: it is loaded, loudness-normalized, written to
    'output/temp_normalized', transcribed to MIDI with the selected method
    and then rendered. With stereo processing enabled and a two-channel
    input, each channel is transcribed separately and the two MIDI files are
    merged with merge_midis; otherwise the audio is mixed down to mono.

    Args:
        input_file (str | None): Path of the uploaded file.
        enable_stereo_processing (bool): Transcribe left and right channels
            separately when the audio is stereo.
        transcription_method (str): "General Purpose" selects
            TranscribeGeneralAudio; any other value selects
            TranscribePianoAudio.
        onset_thresh ... multiple_bends_bool: Forwarded to
            TranscribeGeneralAudio.
        render_type ... s8bit_fm_modulation_rate: Forwarded to Render_MIDI.

    Returns:
        The value returned by Render_MIDI, or a list of seven
        `gr.update(value=None)` when `input_file` is None.

    Raises:
        gr.Error: If the audio cannot be loaded or transcription fails.
    """
    start_time = reqtime.time()
    if input_file is None:
        return [gr.update(value=None)] * 7

    input_file_path = input_file
    filename = os.path.basename(input_file_path)
    print(f"Processing new file: {filename}")

    def _transcribe(audio_path):
        """Run the selected transcription method on one audio file."""
        if transcription_method == "General Purpose":
            return TranscribeGeneralAudio(
                audio_path, onset_thresh, frame_thresh, min_note_len,
                min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool
            )
        return TranscribePianoAudio(audio_path)

    # Decide on the file type BEFORE touching the audio loader: loading a
    # MIDI file as audio fails, which used to make this branch unreachable.
    if filename.lower().endswith(('.mid', '.midi', '.kar')):
        print("MIDI file detected. Proceeding directly to rendering.")
        midi_path_for_rendering = input_file_path
    else:
        print("Audio file detected. Starting transcription...")

        try:
            # sr=None keeps the native sample rate; mono=False keeps channels.
            audio_data, native_sample_rate = librosa.load(input_file_path, sr=None, mono=False)
        except Exception as e:
            raise gr.Error(f"Failed to load audio file: {e}") from e

        base_name = os.path.splitext(filename)[0]
        temp_dir = "output/temp_normalized"
        os.makedirs(temp_dir, exist_ok=True)

        if enable_stereo_processing:
            if audio_data.ndim != 2 or audio_data.shape[0] != 2:
                print("Warning: Audio is not stereo or could not be loaded as stereo. Falling back to mono transcription.")
                enable_stereo_processing = False

        if enable_stereo_processing:
            print("Stereo processing enabled. Splitting channels...")
            try:
                left_channel = audio_data[0]
                right_channel = audio_data[1]

                normalized_left = normalize_loudness(left_channel, native_sample_rate)
                normalized_right = normalize_loudness(right_channel, native_sample_rate)

                temp_left_wav_path = os.path.join(temp_dir, f"{base_name}_left.wav")
                temp_right_wav_path = os.path.join(temp_dir, f"{base_name}_right.wav")

                sf.write(temp_left_wav_path, normalized_left, native_sample_rate)
                sf.write(temp_right_wav_path, normalized_right, native_sample_rate)

                print(f"Saved left channel to: {temp_left_wav_path}")
                print(f"Saved right channel to: {temp_right_wav_path}")

                print("Transcribing left channel...")
                midi_path_left = _transcribe(temp_left_wav_path)

                print("Transcribing right channel...")
                midi_path_right = _transcribe(temp_right_wav_path)

                if midi_path_left and midi_path_right:
                    merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid")
                    midi_path_for_rendering = merge_midis(midi_path_left, midi_path_right, merged_midi_path)
                    # merge_midis returns None when merging fails and the left
                    # MIDI is missing; stop here instead of crashing later.
                    if not midi_path_for_rendering:
                        raise gr.Error("Merging the left and right channel transcriptions failed.")
                elif midi_path_left:
                    print("Warning: Right channel transcription failed. Using left channel only.")
                    midi_path_for_rendering = midi_path_left
                elif midi_path_right:
                    print("Warning: Left channel transcription failed. Using right channel only.")
                    midi_path_for_rendering = midi_path_right
                else:
                    raise gr.Error("Both left and right channel transcriptions failed.")

            except Exception as e:
                print(f"An error occurred during stereo processing: {e}")
                raise gr.Error(f"Stereo Processing Failed: {e}") from e
        else:
            print("Stereo processing disabled. Using standard mono transcription.")
            if audio_data.ndim == 1:
                mono_signal = audio_data
            else:
                # Average the channels into a single mono signal.
                mono_signal = np.mean(audio_data, axis=0)

            normalized_mono = normalize_loudness(mono_signal, native_sample_rate)

            temp_mono_wav_path = os.path.join(temp_dir, f"{base_name}_mono.wav")
            sf.write(temp_mono_wav_path, normalized_mono, native_sample_rate)

            try:
                midi_path_for_rendering = _transcribe(temp_mono_wav_path)
                analyze_midi_velocity(midi_path_for_rendering)
            except Exception as e:
                print(f"An error occurred during transcription: {e}")
                raise gr.Error(f"Transcription Failed: {e}") from e

    print(f"Proceeding to render MIDI file: {os.path.basename(midi_path_for_rendering)}")

    results = Render_MIDI(midi_path_for_rendering,
                          render_type, soundfont_bank, render_sample_rate,
                          render_with_sustains, merge_misaligned_notes, custom_render_patch, render_align,
                          render_transpose_value, render_transpose_to_C4, render_output_as_solo_piano, render_remove_drums,
                          s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s,
                          s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth, s8bit_bass_boost_level,
                          s8bit_smooth_notes, s8bit_continuous_vibrato,
                          s8bit_noise_level, s8bit_distortion_level, s8bit_fm_modulation_depth, s8bit_fm_modulation_rate
                          )

    print(f'Total processing time: {(reqtime.time() - start_time):.2f} sec')
    print('*' * 70)

    return results
|
|
| |
| |
| |
|
|
def update_ui_visibility(transcription_method, soundfont_choice):
    """
    Decide which optional settings panels should be shown.

    Args:
        transcription_method: Current value of the transcription-method radio.
        soundfont_choice: Current value of the SoundFont / synthesizer dropdown.

    Returns:
        dict keyed by the two settings panels, each mapped to a gr.update
        carrying the panel's new ``visible`` flag.
    """
    # The general-purpose panel is only relevant for the "General Purpose" method.
    show_general_panel = transcription_method == "General Purpose"
    # The 8-bit panel is only relevant when the built-in synthesizer is selected.
    show_8bit_panel = soundfont_choice == SYNTH_8_BIT_LABEL

    visibility_updates = {}
    visibility_updates[general_transcription_settings] = gr.update(visible=show_general_panel)
    visibility_updates[synth_8bit_settings] = gr.update(visible=show_8bit_panel)
    return visibility_updates
|
|
| |
def apply_8bit_preset(preset_name):
    """
    Build the Gradio updates that load an 8-bit synthesizer preset into the UI.

    Args:
        preset_name: Value of the "Style Preset" dropdown; either "Custom" or
            a key of S8BIT_PRESETS.

    Returns:
        dict mapping every 8-bit synthesizer control to a gr.update object.
        For "Custom" (or an unknown name) every update is a no-op, so the
        user's manual values are left untouched. For a known preset each
        control receives the preset's value for it, including the noise,
        distortion and FM controls that the presets also define.
    """
    # Preset key -> UI component it drives. Keeping the mapping in one place
    # guarantees the "no change" and "apply preset" branches cover the same
    # set of controls.
    preset_targets = {
        'waveform_type': s8bit_waveform_type,
        'pulse_width': s8bit_pulse_width,
        'envelope_type': s8bit_envelope_type,
        'decay_time_s': s8bit_decay_time_s,
        'vibrato_rate': s8bit_vibrato_rate,
        'vibrato_depth': s8bit_vibrato_depth,
        'smooth_notes': s8bit_smooth_notes,
        'continuous_vibrato': s8bit_continuous_vibrato,
        'bass_boost_level': s8bit_bass_boost_level,
        'noise_level': s8bit_noise_level,
        'distortion_level': s8bit_distortion_level,
        'fm_modulation_depth': s8bit_fm_modulation_depth,
        'fm_modulation_rate': s8bit_fm_modulation_rate,
    }

    # "Custom" means manual control: emit empty updates so nothing changes.
    if preset_name == "Custom" or preset_name not in S8BIT_PRESETS:
        return {component: gr.update() for component in preset_targets.values()}

    settings = S8BIT_PRESETS[preset_name]

    # A preset that omits a key leaves the corresponding control as it is
    # instead of raising.
    return {
        component: gr.update(value=settings[key]) if key in settings else gr.update()
        for key, component in preset_targets.items()
    }
|
|
if __name__ == "__main__":
    # Script entry point: prepare resources, build the Gradio UI, wire the
    # event handlers and launch the app. Names assigned here (components,
    # S8BIT_PRESETS, soundfonts_dict) are module-level globals; the handler
    # functions defined earlier in this file look them up by name.

    # Project-provided start-up hook (imported from src.piano_transcription.utils).
    initialize_app()

    # Mapping of display name -> .sf2 path, per prepare_soundfonts' docstring.
    soundfonts_dict = prepare_soundfonts()
    print(f"Found {len(soundfonts_dict)} local SoundFonts.")

    if not soundfonts_dict:
        print("\nWARNING: No SoundFonts were found or could be downloaded.")
        print("Rendering with SoundFonts will fail. Only the 8-bit synthesizer will be available.")

    # Presets for the 8-bit synthesizer. Every entry carries the same keys,
    # one per synthesizer control in the "8-bit Synthesizer Settings" panel.
    S8BIT_PRESETS = {
        "Rhythm Pop Lead": {
            'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18, 'vibrato_rate': 4.5, 'vibrato_depth': 4,
            'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        },
        "Arcade Brawler Lead": {
            'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15, 'vibrato_rate': 5.0, 'vibrato_depth': 6,
            'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.1, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        },
        "Mega Man (Rockman)": {
            'waveform_type': 'Square', 'pulse_width': 0.2, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15, 'vibrato_rate': 6.0, 'vibrato_depth': 8,
            'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.05, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        },
        "Kirby's Bubbly Melody": {
            'waveform_type': 'Square', 'pulse_width': 0.4, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2, 'vibrato_rate': 6.0, 'vibrato_depth': 4,
            'smooth_notes': True, 'continuous_vibrato': False, 'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        },
        "Mario (Super Mario Bros)": {
            'waveform_type': 'Square', 'pulse_width': 0.3, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.25, 'vibrato_rate': 5.0, 'vibrato_depth': 5,
            'smooth_notes': True, 'continuous_vibrato': False, 'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        },

        "Mecha & Tactics Brass": {
            'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4, 'vibrato_rate': 3.5, 'vibrato_depth': 5,
            'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        },
        "Mystic Mana Pad": {
            'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5, 'vibrato_rate': 2.5, 'vibrato_depth': 4,
            'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        },
        "Dragon Quest (Orchestral Feel)": {
            'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.6, 'vibrato_rate': 3.0, 'vibrato_depth': 4,
            'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        },
        "ONI V (Wafu Mystic)": {
            'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4, 'vibrato_rate': 3.5, 'vibrato_depth': 3,
            'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.4, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        },
        "Zelda (NES)": {
            'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.3, 'vibrato_rate': 4.5, 'vibrato_depth': 4,
            'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.15, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        },

        "Falcom Ys (Rock Lead)": {
            'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15, 'vibrato_rate': 5.5, 'vibrato_depth': 6,
            'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.15, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        },
        "Final Fantasy (Arpeggio)": {
            'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22, 'vibrato_rate': 5.0, 'vibrato_depth': 0,
            'smooth_notes': True, 'continuous_vibrato': False, 'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        },
        "Castlevania (Akumajō Dracula)": {
            'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18, 'vibrato_rate': 6.5, 'vibrato_depth': 6,
            'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.35, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        },
        "Pokémon (Game Boy Classics)": {
            'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22, 'vibrato_rate': 5.0, 'vibrato_depth': 5,
            'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        },

        "Commodore 64 (SID Feel)": {
            'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.25, 'vibrato_rate': 8.0, 'vibrato_depth': 4,
            'smooth_notes': True, 'continuous_vibrato': False, 'bass_boost_level': 0.2, 'noise_level': 0.05, 'distortion_level': 0.1, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        },
        "Megadrive/Genesis (FM Grit)": {
            'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18, 'vibrato_rate': 0.0, 'vibrato_depth': 0,
            'smooth_notes': False, 'continuous_vibrato': True, 'bass_boost_level': 0.4, 'noise_level': 0.1, 'distortion_level': 0.2, 'fm_modulation_depth': 0.2, 'fm_modulation_rate': 150
        },
        "PC-98 (Touhou Feel)": {
            'waveform_type': 'Square', 'pulse_width': 0.15, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.12, 'vibrato_rate': 7.5, 'vibrato_depth': 7,
            'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.1, 'fm_modulation_rate': 200
        },
        "Roland SC-88 (GM Vibe)": {
            'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.35, 'vibrato_rate': 0, 'vibrato_depth': 0,
            'smooth_notes': True, 'continuous_vibrato': False, 'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        },

        "Sci-Fi Energy Field": {
            'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4, 'vibrato_rate': 10.0, 'vibrato_depth': 3,
            'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.1, 'noise_level': 0.1, 'distortion_level': 0.0, 'fm_modulation_depth': 0.05, 'fm_modulation_rate': 50
        },
        "Industrial Alarm": {
            'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2, 'vibrato_rate': 15.0, 'vibrato_depth': 8,
            'smooth_notes': False, 'continuous_vibrato': False, 'bass_boost_level': 0.3, 'noise_level': 0.2, 'distortion_level': 0.3, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        },
        "Laser Charge-Up": {
            'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.3, 'vibrato_rate': 4.0, 'vibrato_depth': 25,
            'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        },
        "Unstable Machine Core": {
            'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5, 'vibrato_rate': 1.0, 'vibrato_depth': 50,
            'smooth_notes': False, 'continuous_vibrato': True, 'bass_boost_level': 0.5, 'noise_level': 0.3, 'distortion_level': 0.4, 'fm_modulation_depth': 0.5, 'fm_modulation_rate': 10
        },
        "Hardcore Gabber Kick": {
            'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.1, 'vibrato_rate': 0, 'vibrato_depth': 0,
            'smooth_notes': False, 'continuous_vibrato': False, 'bass_boost_level': 0.8, 'noise_level': 0.2, 'distortion_level': 0.5, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        },

        "Generic Chiptune Loop": {
            'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2, 'vibrato_rate': 5.5, 'vibrato_depth': 4,
            'smooth_notes': True, 'continuous_vibrato': True, 'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        },
        "Dark/Boss Atmosphere": {
            'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.35, 'vibrato_rate': 7.0, 'vibrato_depth': 12,
            'smooth_notes': False, 'continuous_vibrato': False, 'bass_boost_level': 0.4, 'noise_level': 0.15, 'distortion_level': 0.25, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
        }
    }

    app = gr.Blocks(theme=gr.themes.Base())

    with app:
        gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>Audio-to-MIDI & Advanced Renderer</h1>")
        gr.Markdown(
            "**Upload a Audio for transcription-then-rendering, or a MIDI for rendering-only.**\n\n"
            "This application combines piano audio transcription with a powerful MIDI transformation and rendering toolkit. "
            "Based on the work of [asigalov61](https://github.com/asigalov61)."
        )

        with gr.Row():
            # Shared by the input and output audio widgets.
            waveform_options = gr.WaveformOptions(show_recording_waveform=False)

            # --- Left column: input file and all processing options ---
            with gr.Column(scale=1):
                gr.Markdown("## 1. Upload File")

                input_file = gr.Audio(
                    label="Input Audio or MIDI File",
                    type="filepath",
                    sources=["upload"], waveform_options=waveform_options
                )

                gr.Markdown("## 2. Configure Processing")

                transcription_method = gr.Radio(
                    ["General Purpose", "Piano-Specific"],
                    label="Audio Transcription Method",
                    value="General Purpose",
                    info="Choose 'General Purpose' for most music (vocals, etc.). Choose 'Piano-Specific' only for solo piano recordings."
                )

                enable_stereo_processing = gr.Checkbox(
                    label="Enable Stereo Transcription",
                    value=False,
                    info="If checked, left/right audio channels are transcribed separately and merged. Doubles processing time."
                )

                # Shown by default because the default method above is "General Purpose".
                with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
                    onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="On-set Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.")
                    frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. Higher is stricter.")
                    minimum_note_length = gr.Slider(10, 500, value=128, step=1, label="Minimum Note Length (ms)", info="Filters out very short, noisy notes.")
                    minimum_frequency = gr.Slider(0, 500, value=60, step=5, label="Minimum Frequency (Hz)", info="Ignores pitches below this frequency.")
                    maximum_frequency = gr.Slider(501, 10000, value=4000, step=10, label="Maximum Frequency (Hz)", info="Ignores pitches above this frequency.")
                    infer_onsets = gr.Checkbox(value=True, label="Infer Onsets (Boost Onsets)")
                    melodia_trick = gr.Checkbox(value=True, label="Melodia Trick (Contour Optimization)")
                    multiple_pitch_bends = gr.Checkbox(value=False, label="Allow Multiple Pitch Bends")

                render_type = gr.Radio(
                    ["Render as-is", "Custom render", "Extract melody", "Flip", "Reverse", "Repair Durations", "Repair Chords", "Remove Duplicate Pitches", "Longest Repeating Phrase", "Multi-Instrumental Summary", "Solo Piano Summary", "Add Drum Track"],
                    label="MIDI Transformation Render Type",
                    value="Render as-is",
                    info="Apply transformations to the MIDI before rendering. Select 'Render as-is' for basic rendering or other options for transformations."
                )

                # The built-in synthesizer is always offered, followed by every
                # SoundFont that was found.
                soundfont_choices = [SYNTH_8_BIT_LABEL] + list(soundfonts_dict.keys())

                # Prefer the SGM SoundFont; otherwise fall back to the first
                # choice, which is always the 8-bit synthesizer label.
                preferred_sf_name = "SGM-v2.01-YamahaGrand-Guit-Bass-v2.7"
                default_sf_choice = preferred_sf_name if preferred_sf_name in soundfonts_dict else soundfont_choices[0]

                soundfont_bank = gr.Dropdown(
                    soundfont_choices,
                    label="SoundFont / Synthesizer",
                    value=default_sf_choice
                )

                render_sample_rate = gr.Radio(
                    ["16000", "32000", "44100"],
                    label="Audio Sample Rate",
                    value="44100"
                )

                # Initial visibility follows the default dropdown value: when the
                # fallback above selects the 8-bit synthesizer, its settings must
                # be reachable without first toggling the dropdown.
                with gr.Accordion("8-bit Synthesizer Settings", open=False, visible=(default_sf_choice == SYNTH_8_BIT_LABEL)) as synth_8bit_settings:
                    s8bit_preset_selector = gr.Dropdown(
                        choices=["Custom"] + list(S8BIT_PRESETS.keys()),
                        value="Custom",
                        label="Style Preset",
                        info="Select a preset to auto-fill the settings below. Choose 'Custom' for manual control.\nFor reference and entertainment only. These presets are not guaranteed to be perfectly accurate."
                    )

                    s8bit_waveform_type = gr.Dropdown(['Square', 'Sawtooth', 'Triangle'], value='Square', label="Waveform Type")
                    s8bit_pulse_width = gr.Slider(0.01, 0.99, value=0.5, step=0.01, label="Pulse Width (Square Wave Only)")
                    s8bit_envelope_type = gr.Dropdown(['Plucky (AD Envelope)', 'Sustained (Full Decay)'], value='Plucky (AD Envelope)', label="Envelope Type")
                    s8bit_decay_time_s = gr.Slider(0.01, 0.6, value=0.1, step=0.01, label="Decay Time (s)")
                    s8bit_vibrato_rate = gr.Slider(0, 20, value=5, label="Vibrato Rate (Hz)")
                    s8bit_vibrato_depth = gr.Slider(0, 50, value=0, label="Vibrato Depth (Hz)")
                    s8bit_bass_boost_level = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="Bass Boost Level", info="Adjusts the volume of the sub-octave. 0 is off.")
                    s8bit_smooth_notes = gr.Checkbox(value=True, label="Smooth Notes", info="Applies a tiny fade-in/out to notes to reduce clicking.")
                    s8bit_continuous_vibrato = gr.Checkbox(value=True, label="Continuous Vibrato", info="Prevents vibrato from resetting on each note.")

                    # NOTE(review): assumed to be nested inside the 8-bit accordion
                    # (all controls are s8bit_* parameters) - confirm layout.
                    with gr.Accordion("Advanced Synthesis & FX", open=False):
                        s8bit_noise_level = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Noise Level", info="Mixes in white noise. Great for percussion or adding 'air'.")
                        s8bit_distortion_level = gr.Slider(minimum=0.0, maximum=0.9, value=0.0, step=0.05, label="Distortion Level", info="Applies wave-shaping distortion for a grittier, harsher sound.")
                        s8bit_fm_modulation_depth = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="FM Depth", info="Depth of Frequency Modulation. Creates complex, metallic, or bell-like tones.")
                        s8bit_fm_modulation_rate = gr.Slider(minimum=0.0, maximum=500.0, value=0.0, step=1.0, label="FM Rate", info="Rate of Frequency Modulation. Higher values create brighter, more complex harmonics.")

                with gr.Accordion("Advanced MIDI Rendering Options", open=False) as advanced_rendering_options:
                    render_with_sustains = gr.Checkbox(label="Apply sustain pedal effects (if present)", value=True)
                    render_output_as_solo_piano = gr.Checkbox(label="Convert to Solo Piano (Grand Piano patch)", value=False)
                    render_remove_drums = gr.Checkbox(label="Remove drum track", value=False)
                    render_transpose_to_C4 = gr.Checkbox(label="Transpose entire score to center around C4", value=False)
                    render_transpose_value = gr.Slider(-12, 12, value=0, step=1, label="Transpose (semitones)")
                    custom_render_patch = gr.Slider(-1, 127, value=-1, step=1, label="Force MIDI Patch (-1 to disable)")
                    merge_misaligned_notes = gr.Slider(-1, 127, value=-1, label="Time to merge notes in ms (-1 to disable)")
                    render_align = gr.Radio(
                        ["Do not align", "Start Times", "Start Times and Durations", "Start Times and Split Durations"],
                        label="Align notes to musical bars",
                        value="Do not align"
                    )

                submit_btn = gr.Button("Process and Render", variant="primary")

            # --- Right column: results ---
            with gr.Column(scale=2):
                gr.Markdown("## 3. Results")
                output_midi_title = gr.Textbox(label="MIDI Title")
                output_song_description = gr.Textbox(label="MIDI Description", lines=3)
                output_audio = gr.Audio(label="Rendered Audio Output", format="wav", waveform_options=waveform_options)
                output_plot = gr.Plot(label="MIDI Score Plot")
                with gr.Row():
                    output_midi = gr.File(label="Download Processed MIDI File", file_types=[".mid"])
                    output_midi_md5 = gr.Textbox(label="Output MIDI MD5 Hash")
                output_midi_summary = gr.Textbox(label="MIDI metadata summary", lines=4)

        # Inputs are passed to process_and_render_file positionally, so this
        # order has to stay in step with that function's parameter list.
        all_inputs = [
            input_file,
            enable_stereo_processing,
            transcription_method,
            onset_threshold, frame_threshold, minimum_note_length, minimum_frequency, maximum_frequency,
            infer_onsets, melodia_trick, multiple_pitch_bends,
            render_type, soundfont_bank, render_sample_rate,
            render_with_sustains, merge_misaligned_notes, custom_render_patch, render_align,
            render_transpose_value, render_transpose_to_C4, render_output_as_solo_piano, render_remove_drums,
            s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s,
            s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth, s8bit_bass_boost_level,
            s8bit_smooth_notes, s8bit_continuous_vibrato,
            s8bit_noise_level, s8bit_distortion_level, s8bit_fm_modulation_depth, s8bit_fm_modulation_rate
        ]
        all_outputs = [
            output_midi_md5, output_midi_title, output_midi_summary,
            output_midi, output_audio, output_plot, output_song_description
        ]

        # Every control a preset may change; apply_8bit_preset returns a dict
        # keyed by (a subset of) these components.
        s8bit_updater_outputs = [
            s8bit_waveform_type, s8bit_pulse_width, s8bit_envelope_type,
            s8bit_decay_time_s, s8bit_vibrato_rate, s8bit_vibrato_depth,
            s8bit_smooth_notes, s8bit_continuous_vibrato, s8bit_bass_boost_level,
            s8bit_noise_level, s8bit_distortion_level, s8bit_fm_modulation_depth, s8bit_fm_modulation_rate
        ]

        submit_btn.click(
            process_and_render_file,
            inputs=all_inputs,
            outputs=all_outputs
        )

        # Either selector can change which settings panels are relevant, so
        # both trigger the same visibility refresh.
        for visibility_trigger in (transcription_method, soundfont_bank):
            visibility_trigger.change(
                fn=update_ui_visibility,
                inputs=[transcription_method, soundfont_bank],
                outputs=[general_transcription_settings, synth_8bit_settings]
            )

        s8bit_preset_selector.change(
            fn=apply_8bit_preset,
            inputs=[s8bit_preset_selector],
            outputs=s8bit_updater_outputs
        )

    app.queue().launch(inbrowser=True, debug=True)
|
|