Audio-To-MIDI-And-Advanced-Renderer

Running

avans06 committed on Aug 11, 2025

Commit

22dd15a

1 Parent(s): 58195d3

Add experimental vocal separation and selective transcription features

- Added **Separate Vocals** option (experimental): Splits input audio into vocals and music stems before processing.
- Added **Transcription Target** setting: Allows choosing which stem (vocals or music) to transcribe to MIDI.
- Added option to **Re-merge Other Part with Rendered Audio**: After rendering, merges the non-transcribed stem (e.g., original vocals) back with the new music.

Files changed (2) hide show

app.py +364 -160
requirements.txt +6 -2

app.py CHANGED Viewed

@@ -50,8 +50,13 @@ import soundfile as sf
 import torch
 import gradio as gr
-from src.piano_transcription.utils import initialize_app
 from piano_transcription_inference  import PianoTranscription, utilities, sample_rate as transcription_sample_rate
 # --- Import core transcription and MIDI processing libraries ---
@@ -1042,6 +1047,9 @@ def recommend_8bit_params(midi_data, default_preset):
 def process_and_render_file(input_file,
                             # --- Pass the preset selector value ---
                             s8bit_preset_selector,
                             # --- Transcription params ---
                             enable_stereo_processing,
                             transcription_method,
@@ -1071,14 +1079,9 @@ def process_and_render_file(input_file,
     filename = os.path.basename(input_file_path)
     print(f"Processing new file: {filename}")
-    try:
-        # Mono=False is required to correctly detect stereo channels
-        audio_data, native_sample_rate = librosa.load(input_file_path, sr=None, mono=False)
-    except Exception as e:
-        # If loading fails, it might be a MIDI file, which librosa cannot handle.
-        # We will proceed, assuming it's a MIDI, and let pretty_midi handle it later.
-        print(f"Could not load as audio: {e}. Assuming it is a MIDI file.")
-        pass
     # --- Step 1: Check file type and transcribe if necessary ---
     if filename.lower().endswith(('.mid', '.midi', '.kar')):
@@ -1086,42 +1089,95 @@ def process_and_render_file(input_file,
         midi_path_for_rendering = input_file_path
     else: #if filename.lower().endswith(('.wav', '.mp3'))
         print("Audio file detected. Starting transcription...")
         base_name = os.path.splitext(filename)[0]
-        temp_dir = "output/temp_normalized"
         os.makedirs(temp_dir, exist_ok=True)
         # === STEREO PROCESSING LOGIC ===
         if enable_stereo_processing:
-            if 'audio_data' not in locals() or audio_data.ndim != 2 or audio_data.shape[0] != 2:
                 print("Warning: Audio is not stereo or could not be loaded as stereo. Falling back to mono transcription.")
                 enable_stereo_processing = False # Disable stereo processing if audio is not stereo
         if enable_stereo_processing:
-            print("Stereo processing enabled. Splitting channels...")
             try:
-                left_channel = audio_data[0]
-                right_channel = audio_data[1]
-                normalized_left = normalize_loudness(left_channel, native_sample_rate)
-                normalized_right = normalize_loudness(right_channel, native_sample_rate)
-                temp_left_wav_path = os.path.join(temp_dir, f"{base_name}_left.wav")
-                temp_right_wav_path = os.path.join(temp_dir, f"{base_name}_right.wav")
-                sf.write(temp_left_wav_path, normalized_left, native_sample_rate)
-                sf.write(temp_right_wav_path, normalized_right, native_sample_rate)
-                print(f"Saved left channel to: {temp_left_wav_path}")
-                print(f"Saved right channel to: {temp_right_wav_path}")
                 print("Transcribing left and right channel...")
                 if transcription_method == "General Purpose":
-                    midi_path_left = TranscribeGeneralAudio(temp_left_wav_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
-                    midi_path_right = TranscribeGeneralAudio(temp_right_wav_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
-                else:
-                    midi_path_left = TranscribePianoAudio(temp_left_wav_path)
-                    midi_path_right = TranscribePianoAudio(temp_right_wav_path)
                 if midi_path_left and midi_path_right:
                     merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid")
@@ -1139,24 +1195,22 @@ def process_and_render_file(input_file,
                 print(f"An error occurred during stereo processing: {e}")
                 raise gr.Error(f"Stereo Processing Failed: {e}")
         else: # Standard mono transcription
-            print("Stereo processing disabled. Using standard mono transcription.")
-            if 'audio_data' in locals():
-                if audio_data.ndim == 1:
-                    mono_signal = audio_data
-                else:
-                    mono_signal = np.mean(audio_data, axis=0)
-                normalized_mono = normalize_loudness(mono_signal, native_sample_rate)
-                temp_mono_wav_path = os.path.join(temp_dir, f"{base_name}_mono.wav")
-                sf.write(temp_mono_wav_path, normalized_mono, native_sample_rate)
-                input_file_path = temp_mono_wav_path # Use the normalized mono file for transcription
             try:
                 if transcription_method == "General Purpose":
-                    midi_path_for_rendering = TranscribeGeneralAudio(input_file_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
                 else: # Piano-Specific
-                    midi_path_for_rendering = TranscribePianoAudio(input_file_path)
             except Exception as e:
                 print(f"An error occurred during transcription: {e}")
                 raise gr.Error(f"Transcription Failed: {e}")
@@ -1216,7 +1270,43 @@ def process_and_render_file(input_file,
                           synth_params['fm_modulation_depth'],
                           synth_params['fm_modulation_rate']
                           )
     print(f'Total processing time: {(reqtime.time() - start_time):.2f} sec')
     print('*' * 70)
@@ -1308,7 +1398,18 @@ if __name__ == "__main__":
     if not soundfonts_dict:
         print("\nWARNING: No SoundFonts were found or could be downloaded.")
         print("Rendering with SoundFonts will fail. Only the 8-bit synthesizer will be available.")
     # --- Define a constant for the fallback preset name ---
     # This prevents errors if the preset name is changed in the dictionary.
     FALLBACK_PRESET_NAME = "Generic Chiptune Loop"
@@ -1318,43 +1419,7 @@ if __name__ == "__main__":
     # Comprehensive preset dictionary including new JRPG and Handheld classics
     # Note: Vibrato depth is mapped to a representative value on the 0-50 Hz slider.
     S8BIT_PRESETS = {
-        # --- Rhythmic & Action ---
-        "Rhythm Pop Lead (Rhythm Tengoku / リズム天国)": {
-            # Description: A clean, round square wave perfect for the snappy, catchy feel of rhythm games.
-            'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18,
-            'vibrato_rate': 4.5, 'vibrato_depth': 4,
-            'smooth_notes_level': 0.9,          # Formerly True -> 1.0; slightly reduced for a bit more attack.
-            'continuous_vibrato_level': 0.8,    # Formerly True -> 1.0; slightly weakened for more defined note transitions.
-            'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
-            'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
-        },
-        "Arcade Brawler Lead (Street Fighter / ストリートファイター)": {
-            # Description: A gritty sawtooth lead with a hard attack, capturing the high-energy feel of classic fighting games.
-            'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
-            'vibrato_rate': 5.0, 'vibrato_depth': 6,
-            'smooth_notes_level': 0.8,
-            'continuous_vibrato_level': 0.7,
-            'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.1,
-            'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
-        },
-        "Mega Man (Rockman / ロックマン)": {
-            # Description: A thin, sharp square wave lead with fast vibrato, iconic for its driving, heroic melodies.
-            'waveform_type': 'Square', 'pulse_width': 0.2, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
-            'vibrato_rate': 6.0, 'vibrato_depth': 8,
-            'smooth_notes_level': 0.9,
-            'continuous_vibrato_level': 0.85,
-            'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.05,
-            'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
-        },
-        "Kirby's Bubbly Melody (Hoshi no Kirby / 星のカービィ)": {
-            # Description: A soft, round square wave with a bouncy vibrato, creating a cheerful and adorable sound.
-            'waveform_type': 'Square', 'pulse_width': 0.4, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2,
-            'vibrato_rate': 6.0, 'vibrato_depth': 4,
-            'smooth_notes_level': 0.85,
-            'continuous_vibrato_level': 0.3,   # Formerly False (0.0); adds a hint of continuity for more liveliness.
-            'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0,
-            'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
-        },
         "Mario (Super Mario Bros / スーパーマリオブラザーズ)": {
             # Description: A bright square wave with a per-note vibrato, producing the classic bouncy platformer sound.
             'waveform_type': 'Square', 'pulse_width': 0.3, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.25,
@@ -1364,41 +1429,13 @@ if __name__ == "__main__":
             'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0,
             'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
         },
-        # --- Epic & Atmospheric ---
-        "Mecha & Tactics Brass (Super Robot Wars / スーパーロボット大戦)": {
-            # Description: A powerful, sustained sawtooth emulating the bold, heroic synth-brass of strategy and mecha anime themes.
-            'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
-            'vibrato_rate': 3.5, 'vibrato_depth': 5,
-            'smooth_notes_level': 0.95,
-            'continuous_vibrato_level': 0.9,
-            'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15,
-            'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
-        },
-        "Mystic Mana Pad (Secret of Mana / 聖剣伝説2)": {
-             # Description: A warm, ethereal square wave pad with slow vibrato, capturing a feeling of fantasy and wonder.
-            'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5,
-            'vibrato_rate': 2.5, 'vibrato_depth': 4,
-            'smooth_notes_level': 1.0,
-            'continuous_vibrato_level': 0.95,
-            'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
-            'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
-        },
-        "Dragon Quest (ドラゴンクエスト)": {
-            # Description: A pure triangle wave with a long decay, mimicking the grand, orchestral feel of a classical flute or string section.
-           'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.6,
-            'vibrato_rate': 3.0, 'vibrato_depth': 4,
-            'smooth_notes_level': 0.9,
-            'continuous_vibrato_level': 0.9,
-            'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
-            'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
-        },
-        "ONI V (Wafu Mystic / ONI V 隠忍を継ぐ者)": {
-            # Description: A solemn triangle wave with a slow, expressive vibrato, evoking the mysterious atmosphere of Japanese folklore.
-            'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
-            'vibrato_rate': 3.5, 'vibrato_depth': 3,
             'smooth_notes_level': 0.9,
             'continuous_vibrato_level': 0.85,
-            'bass_boost_level': 0.4, 'noise_level': 0.0, 'distortion_level': 0.0,
             'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
         },
         "Zelda (The Legend of Zelda / ゼルダの伝説)": {
@@ -1410,23 +1447,22 @@ if __name__ == "__main__":
             'bass_boost_level': 0.15, 'noise_level': 0.0, 'distortion_level': 0.0,
             'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
         },
-        # --- JRPG & System Classics ---
-        "Falcom Ys (Ys / イース)": {
-            # Description: A powerful sawtooth with slight distortion, emulating the driving rock organ and guitar leads of action JRPGs.
-            'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
-            'vibrato_rate': 5.5, 'vibrato_depth': 6,
             'smooth_notes_level': 0.85,
-            'continuous_vibrato_level': 0.8,
-            'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.15,
             'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
         },
-        "Final Fantasy (ファイナルファンタジー)": {
-            # Description: A perfect, clean square wave with zero vibrato, creating the iconic, crystal-clear arpeggio sound.
             'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22,
-            'vibrato_rate': 5.0, 'vibrato_depth': 0,
             'smooth_notes_level': 0.9,
-            'continuous_vibrato_level': 0.2,
-            'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0,
             'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
         },
         "Castlevania (Akumajō Dracula / 悪魔城ドラキュラ)": {
@@ -1438,13 +1474,22 @@ if __name__ == "__main__":
             'bass_boost_level': 0.35, 'noise_level': 0.0, 'distortion_level': 0.0,
             'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
         },
-        "Pokémon (Game Boy Classics / ポケットモンスター)": {
-            # Description: A full, friendly square wave sound, capturing the cheerful and adventurous spirit of early handheld RPGs.
             'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22,
-            'vibrato_rate': 5.0, 'vibrato_depth': 5,
             'smooth_notes_level': 0.9,
-            'continuous_vibrato_level': 0.9,
-            'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0,
             'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
         },
         # --- Advanced System Impressions ---
@@ -1484,7 +1529,155 @@ if __name__ == "__main__":
             'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0,
             'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
         },
-        # --- Experimental & Sound FX ---
         "Sci-Fi Energy Field": {
             # Description: (SFX) High-speed vibrato and noise create a constant, shimmering hum suitable for energy shields or force fields.
             'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
@@ -1530,7 +1723,7 @@ if __name__ == "__main__":
             'bass_boost_level': 0.8, 'noise_level': 0.2, 'distortion_level': 0.5,
             'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
         },
-        # --- Utility ---
         "Generic Chiptune Loop": {
             # Description: A well-balanced, pleasant square wave lead that serves as a great starting point for custom sounds.
             'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2,
@@ -1540,35 +1733,14 @@ if __name__ == "__main__":
             'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0,
             'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
         },
-        "Dark/Boss Atmosphere (Shin Megami Tensei / 真・女神転生)": {
-            # Description: An aggressive sawtooth, inspired by the dark, rock-infused themes of SMT.
-            'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.35,
-            'vibrato_rate': 7.0, 'vibrato_depth': 12,
-            'smooth_notes_level': 0.1,
-            'continuous_vibrato_level': 0.0,
-            'bass_boost_level': 0.4, 'noise_level': 0.15, 'distortion_level': 0.25,
-            'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
-        },
-        "Modern JRPG Pad (Persona / ペルソナ)": {
-            # Description: A warm, stylish square wave pad, capturing the modern, pop/jazz-infused feel of the Persona series.
-            'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5,
-            'vibrato_rate': 2.5, 'vibrato_depth': 4,
-            'smooth_notes_level': 1.0,
-            'continuous_vibrato_level': 0.95,
-            'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
-            'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
-        },
-        "Tactical Brass (Fire Emblem / ファイアーエムブレム)": {
-            # Description: A powerful, sustained sawtooth emulating the bold, heroic synth-brass of Fire Emblem's tactical themes.
-            'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
-            'vibrato_rate': 3.5, 'vibrato_depth': 5,
-            'smooth_notes_level': 0.95,
-            'continuous_vibrato_level': 0.9,
-            'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15,
-            'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
-        }
     }
     app = gr.Blocks(theme=gr.themes.Base())
     with app:
@@ -1611,6 +1783,27 @@ if __name__ == "__main__":
                     info="If checked, left/right audio channels are transcribed separately and merged. Doubles processing time."
                 )
                 with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
                     onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="On-set Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.")
                     frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. Higher is stricter.")
@@ -1727,7 +1920,7 @@ if __name__ == "__main__":
                     s8bit_waveform_type = gr.Dropdown(['Square', 'Sawtooth', 'Triangle'], value='Square', label="Waveform Type")
                     s8bit_pulse_width = gr.Slider(0.01, 0.99, value=0.5, step=0.01, label="Pulse Width (Square Wave Only)")
                     s8bit_envelope_type = gr.Dropdown(['Plucky (AD Envelope)', 'Sustained (Full Decay)'], value='Plucky (AD Envelope)', label="Envelope Type")
-                    s8bit_decay_time_s = gr.Slider(0.01, 0.6, value=0.1, step=0.01, label="Decay Time (s)") # Increased max to 0.6 for DQ style
                     s8bit_vibrato_rate = gr.Slider(0, 20, value=5, label="Vibrato Rate (Hz)")
                     s8bit_vibrato_depth = gr.Slider(0, 50, value=0, label="Vibrato Depth (Hz)")
                     s8bit_bass_boost_level = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Bass Boost Level", info="Adjusts the volume of the sub-octave. 0 is off.")
@@ -1774,7 +1967,11 @@ if __name__ == "__main__":
         # all_inputs now includes the preset selector itself
         # Inputs for the main processing function
         all_inputs = [
-            input_file, s8bit_preset_selector, enable_stereo_processing,
             transcription_method, onset_threshold, frame_threshold, minimum_note_length,
             minimum_frequency, maximum_frequency, infer_onsets, melodia_trick, multiple_pitch_bends,
             render_type, soundfont_bank, render_sample_rate, render_with_sustains,
@@ -1810,6 +2007,13 @@ if __name__ == "__main__":
             inputs=all_inputs,
             outputs=all_outputs # Pass the combined list
         )
         # --- Listeners for dynamic UI updates ---
         transcription_method.change(

 import torch
 import gradio as gr
+# --- Imports for Vocal Separation ---
+import torchaudio
+from demucs.apply import apply_model
+from demucs.pretrained import get_model
+from demucs.audio import convert_audio
+from src.piano_transcription.utils import initialize_app
 from piano_transcription_inference  import PianoTranscription, utilities, sample_rate as transcription_sample_rate
 # --- Import core transcription and MIDI processing libraries ---
 def process_and_render_file(input_file,
                             # --- Pass the preset selector value ---
                             s8bit_preset_selector,
+                            separate_vocals,
+                            remerge_vocals,
+                            transcription_target,
                             # --- Transcription params ---
                             enable_stereo_processing,
                             transcription_method,
     filename = os.path.basename(input_file_path)
     print(f"Processing new file: {filename}")
+    # This will store the other part if separation is performed
+    other_part_tensor = None
+    other_part_sr = None
     # --- Step 1: Check file type and transcribe if necessary ---
     if filename.lower().endswith(('.mid', '.midi', '.kar')):
         midi_path_for_rendering = input_file_path
     else: #if filename.lower().endswith(('.wav', '.mp3'))
         print("Audio file detected. Starting transcription...")
+        try:
+            # Use torchaudio to load directly into a tensor, as demucs needs it.
+            # This is more efficient than loading with librosa then converting.
+            audio_tensor, native_sample_rate = torchaudio.load(input_file_path)
+        except Exception as e:
+            raise gr.Error(f"Failed to load audio file: {e}")
+        # --- Demucs Vocal Separation Logic, now decides which stem to process ---
+        if separate_vocals:
+            if demucs_model is None:
+                raise gr.Error("Demucs model is not loaded. Cannot separate vocals.")
+            # Convert to a common format (stereo, float32) that demucs expects
+            audio_tensor = convert_audio(audio_tensor, native_sample_rate, demucs_model.samplerate, demucs_model.audio_channels)
+            if torch.cuda.is_available():
+                audio_tensor = audio_tensor.cuda()
+            print("Separating audio with Demucs... This may take some time.")
+            all_stems = apply_model(demucs_model, audio_tensor[None], device='cuda' if torch.cuda.is_available() else 'cpu', progress=True)[0]
+            vocals_idx = demucs_model.sources.index('vocals')
+            # Sum all stems that are NOT vocals to get the accompaniment
+            accompaniment_indices = [i for i, source in enumerate(demucs_model.sources) if source != 'vocals']
+            vocals_tensor = all_stems[vocals_idx]
+            accompaniment_tensor = all_stems[accompaniment_indices].sum(0)
+            # --- The new core branching logic ---
+            if transcription_target == "Transcribe Vocals":
+                print("Target: Transcribing VOCALS.")
+                tensor_to_process = vocals_tensor
+                other_part_tensor = accompaniment_tensor # Save accompaniment for re-merging
+            else: # Default to "Transcribe Music (Accompaniment)"
+                print("Target: Transcribing MUSIC (ACCOMPANIMENT).")
+                tensor_to_process = accompaniment_tensor
+                other_part_tensor = vocals_tensor # Save vocals for re-merging
+            other_part_sr = demucs_model.samplerate
+            audio_tensor = tensor_to_process # The audio to be processed is now the chosen stem
+            native_sample_rate = demucs_model.samplerate # Update sample rate to match demucs output
+            print("Separation complete.")
+        # --- Prepare audio for transcription (saving to a temp file) ---
+        # This part of the logic now works on whichever stem was selected above
         base_name = os.path.splitext(filename)[0]
+        temp_dir = "output/temp_transcribe"
         os.makedirs(temp_dir, exist_ok=True)
+        suffix = f"_{transcription_target.split(' ')[1].lower()}" if separate_vocals else "_original"
+        audio_to_transcribe_path = os.path.join(temp_dir, f"{base_name}{suffix}.wav")
+        torchaudio.save(audio_to_transcribe_path, audio_tensor.cpu(), native_sample_rate)
+        # Convert tensor to numpy array (channels, samples) for librosa/pyloudnorm compatibility
+        # We work with a CPU copy of the tensor.
+        audio_data_np = audio_tensor.cpu().numpy()
         # === STEREO PROCESSING LOGIC ===
         if enable_stereo_processing:
+            if audio_data_np.ndim != 2 or audio_data_np.shape[0] != 2:
                 print("Warning: Audio is not stereo or could not be loaded as stereo. Falling back to mono transcription.")
                 enable_stereo_processing = False # Disable stereo processing if audio is not stereo
         if enable_stereo_processing:
+            print("Stereo processing enabled. Splitting, normalizing, and transcribing channels...")
             try:
+                left_channel_np = audio_data_np[0]
+                right_channel_np = audio_data_np[1]
+                normalized_left = normalize_loudness(left_channel_np, native_sample_rate)
+                normalized_right = normalize_loudness(right_channel_np, native_sample_rate)
+                temp_left_path = os.path.join(temp_dir, f"{base_name}_left.wav")
+                temp_right_path = os.path.join(temp_dir, f"{base_name}_right.wav")
+                sf.write(temp_left_path, normalized_left, native_sample_rate)
+                sf.write(temp_right_path, normalized_right, native_sample_rate)
+                print(f"Saved left channel to: {temp_left_path}")
+                print(f"Saved right channel to: {temp_right_path}")
                 print("Transcribing left and right channel...")
                 if transcription_method == "General Purpose":
+                    midi_path_left = TranscribeGeneralAudio(temp_left_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
+                    midi_path_right = TranscribeGeneralAudio(temp_right_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
+                else: # Piano-Specific
+                    midi_path_left = TranscribePianoAudio(temp_left_path)
+                    midi_path_right = TranscribePianoAudio(temp_right_path)
                 if midi_path_left and midi_path_right:
                     merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid")
                 print(f"An error occurred during stereo processing: {e}")
                 raise gr.Error(f"Stereo Processing Failed: {e}")
         else: # Standard mono transcription
+            print("Mono processing. Normalizing and transcribing audio...")
+            # If the audio is stereo but stereo processing is disabled, convert to mono.
+            if audio_data_np.shape[0] == 2:
+                mono_signal_np = np.mean(audio_data_np, axis=0)
+            else:
+                mono_signal_np = audio_data_np[0]
+            normalized_mono = normalize_loudness(mono_signal_np, native_sample_rate)
+            temp_mono_path = os.path.join(temp_dir, f"{base_name}_mono.wav")
+            sf.write(temp_mono_path, normalized_mono, native_sample_rate)
             try:
                 if transcription_method == "General Purpose":
+                    midi_path_for_rendering = TranscribeGeneralAudio(temp_mono_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
                 else: # Piano-Specific
+                    midi_path_for_rendering = TranscribePianoAudio(temp_mono_path)
             except Exception as e:
                 print(f"An error occurred during transcription: {e}")
                 raise gr.Error(f"Transcription Failed: {e}")
                           synth_params['fm_modulation_depth'],
                           synth_params['fm_modulation_rate']
                           )
+    # --- Vocal Re-merging Logic now uses the generic "other_part" ---
+    if separate_vocals and remerge_vocals and other_part_tensor is not None:
+        print(f"Re-merging the non-transcribed part with newly rendered music...")
+        rendered_srate, rendered_music_int16 = results[4]
+        rendered_music_float = rendered_music_int16.astype(np.float32) / 32767.0
+        rendered_music_tensor = torch.from_numpy(rendered_music_float).T
+        if rendered_srate != other_part_sr:
+            resampler = torchaudio.transforms.Resample(rendered_srate, other_part_sr)
+            rendered_music_tensor = resampler(rendered_music_tensor)
+        len_music = rendered_music_tensor.shape[1]
+        len_other = other_part_tensor.shape[1]
+        if len_music > len_other:
+            padding = len_music - len_other
+            other_part_tensor = torch.nn.functional.pad(other_part_tensor, (0, padding))
+        elif len_other > len_music:
+            padding = len_other - len_music
+            rendered_music_tensor = torch.nn.functional.pad(rendered_music_tensor, (0, padding))
+        merged_audio_tensor = rendered_music_tensor + other_part_tensor.cpu()
+        max_abs = torch.max(torch.abs(merged_audio_tensor))
+        if max_abs > 1.0:
+            merged_audio_tensor /= max_abs
+        merged_audio_int16 = (merged_audio_tensor.T.numpy() * 32767).astype(np.int16)
+        new_results = list(results)
+        new_results[4] = (other_part_sr, merged_audio_int16)
+        results = tuple(new_results)
+        print("Re-merging complete.")
     print(f'Total processing time: {(reqtime.time() - start_time):.2f} sec')
     print('*' * 70)
     if not soundfonts_dict:
         print("\nWARNING: No SoundFonts were found or could be downloaded.")
         print("Rendering with SoundFonts will fail. Only the 8-bit synthesizer will be available.")
+    # --- Pre-load the Demucs model on startup for efficiency ---
+    print("Loading Demucs model (htdemucs_ft), this may take a moment on first run...")
+    try:
+        demucs_model = get_model(name='htdemucs_ft')
+        if torch.cuda.is_available():
+            demucs_model = demucs_model.cuda()
+        print("Demucs model loaded successfully.")
+    except Exception as e:
+        print(f"Warning: Could not load Demucs model. Vocal separation will not be available. Error: {e}")
+        demucs_model = None
     # --- Define a constant for the fallback preset name ---
     # This prevents errors if the preset name is changed in the dictionary.
     FALLBACK_PRESET_NAME = "Generic Chiptune Loop"
     # Comprehensive preset dictionary including new JRPG and Handheld classics
     # Note: Vibrato depth is mapped to a representative value on the 0-50 Hz slider.
     S8BIT_PRESETS = {
+        # --- Classic Chiptune ---
         "Mario (Super Mario Bros / スーパーマリオブラザーズ)": {
             # Description: A bright square wave with a per-note vibrato, producing the classic bouncy platformer sound.
             'waveform_type': 'Square', 'pulse_width': 0.3, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.25,
             'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0,
             'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
         },
+        "Mega Man (Rockman / ロックマン)": {
+            # Description: A thin, sharp square wave lead with fast vibrato, iconic for its driving, heroic melodies.
+            'waveform_type': 'Square', 'pulse_width': 0.2, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
+            'vibrato_rate': 6.0, 'vibrato_depth': 8,
             'smooth_notes_level': 0.9,
             'continuous_vibrato_level': 0.85,
+            'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.05,
             'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
         },
         "Zelda (The Legend of Zelda / ゼルダの伝説)": {
             'bass_boost_level': 0.15, 'noise_level': 0.0, 'distortion_level': 0.0,
             'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
         },
+        "Kirby's Bubbly Melody (Hoshi no Kirby / 星のカービィ)": {
+            # Description: A soft, round square wave with a bouncy vibrato, creating a cheerful and adorable sound.
+            'waveform_type': 'Square', 'pulse_width': 0.4, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2,
+            'vibrato_rate': 6.0, 'vibrato_depth': 4,
             'smooth_notes_level': 0.85,
+            'continuous_vibrato_level': 0.3,   # Formerly False (0.0); adds a hint of continuity for more liveliness.
+            'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0,
             'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
         },
+        "Pokémon (Game Boy Classics / ポケットモンスター)": {
+            # Description: A full, friendly square wave sound, capturing the cheerful and adventurous spirit of early handheld RPGs.
             'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22,
+            'vibrato_rate': 5.0, 'vibrato_depth': 5,
             'smooth_notes_level': 0.9,
+            'continuous_vibrato_level': 0.9,
+            'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0,
             'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
         },
         "Castlevania (Akumajō Dracula / 悪魔城ドラキュラ)": {
             'bass_boost_level': 0.35, 'noise_level': 0.0, 'distortion_level': 0.0,
             'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
         },
+        "Final Fantasy (Arpeggio / ファイナルファンタジー)": {
+            # Description: A perfect, clean square wave with zero vibrato, creating the iconic, crystal-clear arpeggio sound.
             'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22,
+            'vibrato_rate': 5.0, 'vibrato_depth': 0,
             'smooth_notes_level': 0.9,
+            'continuous_vibrato_level': 0.2,
+            'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0,
+            'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
+        },
+        "ONI V (Wafu Mystic / ONI V 隠忍を継ぐ者)": {
+            # Description: A solemn triangle wave with a slow, expressive vibrato, evoking the mysterious atmosphere of Japanese folklore.
+            'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
+            'vibrato_rate': 3.5, 'vibrato_depth': 3,
+            'smooth_notes_level': 0.9,
+            'continuous_vibrato_level': 0.85,
+            'bass_boost_level': 0.4, 'noise_level': 0.0, 'distortion_level': 0.0,
             'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
         },
         # --- Advanced System Impressions ---
             'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0,
             'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
         },
+        # --- Action & Rock Leads ---
+        "Falcom Ys (Rock Lead / イース)": {
+            # Description: A powerful sawtooth with slight distortion, emulating the driving rock organ and guitar leads of action JRPGs.
+            'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
+            'vibrato_rate': 5.5, 'vibrato_depth': 6,
+            'smooth_notes_level': 0.85,
+            'continuous_vibrato_level': 0.8,
+            'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.15,
+            'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
+        },
+        "Arcade Brawler Lead (Street Fighter / ストリートファイター)": {
+            # Description: A gritty sawtooth lead with a hard attack, capturing the high-energy feel of classic fighting games.
+            'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
+            'vibrato_rate': 5.0, 'vibrato_depth': 6,
+            'smooth_notes_level': 0.8,
+            'continuous_vibrato_level': 0.7,
+            'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.1,
+            'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
+        },
+        "Rhythm Pop Lead (Rhythm Tengoku / リズム天国)": {
+            # Description: A clean, round square wave perfect for the snappy, catchy feel of rhythm games.
+            'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18,
+            'vibrato_rate': 4.5, 'vibrato_depth': 4,
+            'smooth_notes_level': 0.9,          # Formerly True -> 1.0; slightly reduced for a bit more attack.
+            'continuous_vibrato_level': 0.8,    # Formerly True -> 1.0; slightly weakened for more defined note transitions.
+            'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
+            'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
+        },
+        # --- Epic & Orchestral Pads ---
+        "Dragon Quest (Orchestral Feel / ドラゴンクエスト)": {
+            # Description: A pure triangle wave with a long decay, mimicking the grand, orchestral feel of a classical flute or string section.
+           'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.6,
+            'vibrato_rate': 3.0, 'vibrato_depth': 4,
+            'smooth_notes_level': 0.9,
+            'continuous_vibrato_level': 0.9,
+            'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
+            'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
+        },
+        "Mystic Mana Pad (Secret of Mana / 聖剣伝説2)": {
+             # Description: A warm, ethereal square wave pad with slow vibrato, capturing a feeling of fantasy and wonder.
+            'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5,
+            'vibrato_rate': 2.5, 'vibrato_depth': 4,
+            'smooth_notes_level': 1.0,
+            'continuous_vibrato_level': 0.95,
+            'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
+            'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
+        },
+        "Modern JRPG Pad (Persona / ペルソナ)": {
+            # Description: A warm, stylish square wave pad, capturing the modern, pop/jazz-infused feel of the Persona series.
+            'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5,
+            'vibrato_rate': 2.5, 'vibrato_depth': 4,
+            'smooth_notes_level': 1.0,
+            'continuous_vibrato_level': 0.95,
+            'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
+            'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
+        },
+        "Tactical Brass (Fire Emblem / ファイアーエムブレム)": {
+            # Description: A powerful, sustained sawtooth emulating the bold, heroic synth-brass of Fire Emblem's tactical themes.
+            'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
+            'vibrato_rate': 3.5, 'vibrato_depth': 5,
+            'smooth_notes_level': 0.95,
+            'continuous_vibrato_level': 0.9,
+            'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15,
+            'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
+        },
+        "Mecha & Tactics Brass (Super Robot Wars / スーパーロボット大戦)": {
+            # Description: A powerful, sustained sawtooth emulating the bold, heroic synth-brass of strategy and mecha anime themes.
+            'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
+            'vibrato_rate': 3.5, 'vibrato_depth': 5,
+            'smooth_notes_level': 0.95,
+            'continuous_vibrato_level': 0.9,
+            'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15,
+            'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
+        },
+        "Dark/Boss Atmosphere (Shin Megami Tensei / 真・女神転生)": {
+            # Description: An aggressive sawtooth, inspired by the dark, rock-infused themes of SMT.
+            'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.35,
+            'vibrato_rate': 7.0, 'vibrato_depth': 12,
+            'smooth_notes_level': 0.1,
+            'continuous_vibrato_level': 0.0,
+            'bass_boost_level': 0.4, 'noise_level': 0.15, 'distortion_level': 0.25,
+            'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
+        },
+        # --- Vocal Synthesis ---
+        "8-Bit Vocal Lead": {
+            # Description: A soft, sustained triangle wave with gentle vibrato to mimic a singing voice.
+            'waveform_type': 'Triangle',
+            'pulse_width': 0.5,
+            'envelope_type': 'Sustained (Full Decay)',
+            'decay_time_s': 0.8,
+            'vibrato_rate': 5.5,
+            'vibrato_depth': 4, # Mapped from the suggested 0.15 range
+            'bass_boost_level': 0.1,
+            'smooth_notes_level': 0.85,
+            'continuous_vibrato_level': 0.9,
+            'noise_level': 0.02,
+            'distortion_level': 0.0,
+            'fm_modulation_depth': 0.05,
+            'fm_modulation_rate': 20
+        },
+        "8-Bit Male Vocal": {
+            # Description: A deeper, fuller triangle wave with more bass and slower vibrato for a masculine feel.
+            'waveform_type': 'Triangle',
+            'pulse_width': 0.5,
+            'envelope_type': 'Sustained (Full Decay)',
+            'decay_time_s': 1.0,
+            'vibrato_rate': 5.0,
+            'vibrato_depth': 3, # Mapped from the suggested 0.12 range
+            'bass_boost_level': 0.3,
+            'smooth_notes_level': 0.9,
+            'continuous_vibrato_level': 0.85,
+            'noise_level': 0.015,
+            'distortion_level': 0.0,
+            'fm_modulation_depth': 0.08,
+            'fm_modulation_rate': 25
+        },
+        "8-Bit Female Vocal": {
+            # Description: A brighter, lighter triangle wave with faster vibrato and less bass for a feminine feel.
+            'waveform_type': 'Triangle',
+            'pulse_width': 0.5,
+            'envelope_type': 'Sustained (Full Decay)',
+            'decay_time_s': 0.7,
+            'vibrato_rate': 6.0,
+            'vibrato_depth': 5, # Mapped from the suggested 0.18 range
+            'bass_boost_level': 0.05,
+            'smooth_notes_level': 0.85,
+            'continuous_vibrato_level': 0.92,
+            'noise_level': 0.025,
+            'distortion_level': 0.0,
+            'fm_modulation_depth': 0.04,
+            'fm_modulation_rate': 30
+        },
+        "Lo-Fi Vocal": {
+            # Description: A gritty, noisy square wave with a short decay to simulate a low-resolution vocal sample.
+            'waveform_type': 'Square',
+            'pulse_width': 0.48,
+            'envelope_type': 'Plucky (AD Envelope)', # "Short" implies a plucky, not sustained, envelope
+            'decay_time_s': 0.4,
+            'vibrato_rate': 4.8,
+            'vibrato_depth': 2, # Mapped from the suggested 0.10 range
+            'bass_boost_level': 0.1,
+            'smooth_notes_level': 0.65,
+            'continuous_vibrato_level': 0.6,
+            'noise_level': 0.05,
+            'distortion_level': 0.05,
+            'fm_modulation_depth': 0.02,
+            'fm_modulation_rate': 20
+        },
+        # --- Sound FX & Experimental ---
         "Sci-Fi Energy Field": {
             # Description: (SFX) High-speed vibrato and noise create a constant, shimmering hum suitable for energy shields or force fields.
             'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
             'bass_boost_level': 0.8, 'noise_level': 0.2, 'distortion_level': 0.5,
             'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
         },
+        # --- Utility & Starting Points ---
         "Generic Chiptune Loop": {
             # Description: A well-balanced, pleasant square wave lead that serves as a great starting point for custom sounds.
             'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2,
             'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0,
             'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
         },
     }
+    # --- Function to control visibility of BOTH new UI elements ---
+    def update_vocal_ui_visibility(separate_vocals):
+        """Shows or hides the separation-related UI controls."""
+        is_visible = gr.update(visible=separate_vocals)
+        return is_visible, is_visible # Return two updates
     app = gr.Blocks(theme=gr.themes.Base())
     with app:
                     info="If checked, left/right audio channels are transcribed separately and merged. Doubles processing time."
                 )
+                # --- Vocal Separation Controls ---
+                # One checkbox enables separation; the radio and the second checkbox
+                # below start hidden and are toggled by the `separate_vocals.change`
+                # listener.
+                with gr.Group():
+                    separate_vocals = gr.Checkbox(
+                        label="Separate Vocals",
+                        value=False,
+                        info="If checked, separates the audio into vocals and music stems before processing."
+                    )
+                    # Selects which separated stem is transcribed to MIDI.
+                    transcription_target = gr.Radio(
+                        ["Transcribe Music (Accompaniment)", "Transcribe Vocals"],
+                        label="Transcription Target",
+                        value="Transcribe Music (Accompaniment)",
+                        info="Choose which part of the separated audio to transcribe to MIDI.",
+                        visible=False # Initially hidden; shown when "Separate Vocals" is checked
+                    )
+                    # Optionally mixes the non-transcribed stem back into the rendered audio.
+                    remerge_vocals = gr.Checkbox(
+                        label="Re-merge Other Part with Rendered Audio",
+                        value=False,
+                        info="After rendering, merges the non-transcribed part (e.g., original vocals) back with the new music.",
+                        visible=False # Initially hidden; shown when "Separate Vocals" is checked
+                    )
                 with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
                     onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="On-set Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.")
                     frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. Higher is stricter.")
                     s8bit_waveform_type = gr.Dropdown(['Square', 'Sawtooth', 'Triangle'], value='Square', label="Waveform Type")
                     s8bit_pulse_width = gr.Slider(0.01, 0.99, value=0.5, step=0.01, label="Pulse Width (Square Wave Only)")
                     s8bit_envelope_type = gr.Dropdown(['Plucky (AD Envelope)', 'Sustained (Full Decay)'], value='Plucky (AD Envelope)', label="Envelope Type")
+                    s8bit_decay_time_s = gr.Slider(0.01, 1.0, value=0.1, step=0.01, label="Decay Time (s)") # Max is 1.0 so long-decay presets fit the slider (e.g. Dragon Quest at 0.6, 8-Bit Male Vocal at 1.0)
                     s8bit_vibrato_rate = gr.Slider(0, 20, value=5, label="Vibrato Rate (Hz)")
                     s8bit_vibrato_depth = gr.Slider(0, 50, value=0, label="Vibrato Depth (Hz)")
                     s8bit_bass_boost_level = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Bass Boost Level", info="Adjusts the volume of the sub-octave. 0 is off.")
         # all_inputs now includes the preset selector itself
         # Inputs for the main processing function
         all_inputs = [
+            input_file, s8bit_preset_selector,
+            separate_vocals,
+            remerge_vocals,
+            transcription_target,
+            enable_stereo_processing,
             transcription_method, onset_threshold, frame_threshold, minimum_note_length,
             minimum_frequency, maximum_frequency, infer_onsets, melodia_trick, multiple_pitch_bends,
             render_type, soundfont_bank, render_sample_rate, render_with_sustains,
             inputs=all_inputs,
             outputs=all_outputs # Pass the combined list
         )
+        # --- The change event now controls TWO components ---
+        separate_vocals.change(
+            fn=update_vocal_ui_visibility,
+            inputs=separate_vocals,
+            outputs=[transcription_target, remerge_vocals] # Update both components
+        )
         # --- Listeners for dynamic UI updates ---
         transcription_method.change(

requirements.txt CHANGED Viewed

@@ -1,8 +1,9 @@
 --extra-index-url https://download.pytorch.org/whl/cu128
 torch
 numpy
-gradio
 mido
 librosa
 torchlibrosa
@@ -18,9 +19,12 @@ psutil
 pretty_midi
 soundfile
 pyloudnorm
 piano_transcription_inference
 basic-pitch @ git+https://github.com/avan06/basic-pitch; sys_platform != 'linux'
 basic-pitch[tf] @ git+https://github.com/avan06/basic-pitch; sys_platform == 'linux'
-git+https://github.com/avan06/pyfluidsynth

 --extra-index-url https://download.pytorch.org/whl/cu128
 torch
+torchaudio
 numpy
+gradio >= 5.42.0
 mido
 librosa
 torchlibrosa
 pretty_midi
 soundfile
 pyloudnorm
 piano_transcription_inference
 basic-pitch @ git+https://github.com/avan06/basic-pitch; sys_platform != 'linux'
 basic-pitch[tf] @ git+https://github.com/avan06/basic-pitch; sys_platform == 'linux'
+git+https://github.com/avan06/pyfluidsynth
+demucs