Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import os | |
| import shutil | |
| import asyncio | |
| import librosa | |
| import librosa.display | |
| import soundfile as sf | |
| import numpy as np | |
| import time | |
| import zipfile | |
| import tempfile | |
| import matplotlib.pyplot as plt | |
| import matplotlib | |
| import struct | |
| from scipy.signal import convolve, butter, lfilter, windows | |
| # Use a non-interactive backend for Matplotlib for UI compatibility | |
| matplotlib.use('Agg') | |
| # --- UTILITY: MIDI FILE WRITING --- | |
def encode_delta_time(time):
    """Encode a tick count as a MIDI variable-length quantity (VLQ).

    A VLQ stores the value in big-endian groups of 7 bits. Every byte except
    the final (least-significant) one has its high bit set to signal that
    more bytes follow.

    Args:
        time: Number of ticks. Negative values are clamped to 0, because a
            delta time cannot be negative and an empty byte string would
            corrupt the surrounding event stream.

    Returns:
        bytes: The encoded quantity, always at least one byte long.
    """
    remaining = max(int(time), 0)
    # The least-significant group is written last and is the only byte
    # without the continuation bit.
    encoded = [remaining & 0x7F]
    remaining >>= 7
    while remaining > 0:
        encoded.insert(0, (remaining & 0x7F) | 0x80)
        remaining >>= 7
    return bytes(encoded)
def freq_to_midi(freq):
    """Map a frequency in Hz to the nearest MIDI note number.

    The conversion uses the equal-tempered reference A4 = 440 Hz = MIDI 69.
    Anything below 40 Hz, which includes zero and negative values, is
    treated as "no pitch" and mapped to 0 to keep low-frequency noise out
    of the result.
    """
    lowest_accepted_hz = 40
    if freq < lowest_accepted_hz:
        return 0
    semitones_from_a4 = 12 * np.log2(freq / 440.0)
    return int(round(69 + semitones_from_a4))
def write_midi_file(notes_list, bpm, output_path):
    """Write a minimal, dependency-free Standard MIDI File (.mid).

    The file contains one track holding a Set Tempo event, a 4/4 Time
    Signature event and one Note On / Note Off pair per note.

    Args:
        notes_list: Sequence of ``(midi_note, start_time_sec, duration_sec)``
            tuples. Notes outside the valid MIDI range 1..127 are skipped
            (0 is used by this file as the "no pitch" marker). The caller's
            list is not modified.
        bpm: Tempo in beats per minute; used for the tempo event and for
            converting seconds to ticks.
        output_path: Destination path of the file.

    Returns:
        None. Nothing is written when ``notes_list`` is empty.
    """
    if not notes_list:
        return

    tempo_us_per_beat = int(60000000 / bpm)
    division = 96  # Ticks per quarter note
    seconds_per_tick = 60.0 / (bpm * division)

    # Every event starts with a variable-length delta time, so a zero delta
    # is the single byte 0x00 (not a 4-byte integer).
    events = [
        # Set Tempo: FF 51 03 + 24-bit microseconds per quarter note.
        b'\x00\xFF\x51\x03' + struct.pack('>L', tempo_us_per_beat)[1:],
        # Time Signature (4/4).
        b'\x00\xFF\x58\x04\x04\x02\x18\x08',
    ]

    current_tick = 0
    for note, start_sec, duration_sec in sorted(notes_list, key=lambda item: item[1]):
        note = int(note)
        if not 0 < note <= 127:
            continue

        # Notes are written back to back (on, then off), so a note that
        # starts before the previous one ended would give a negative delta;
        # clamp it so the stream stays well-formed.
        target_tick = int(start_sec / seconds_per_tick)
        delta_tick = max(target_tick - current_tick, 0)
        duration_ticks = max(int(duration_sec / seconds_per_tick), 0)

        pitch = struct.pack('>B', note)
        # Note On (status 0x90, velocity 100) followed by Note Off (0x80).
        events.append(encode_delta_time(delta_tick) + b'\x90' + pitch + b'\x64')
        events.append(encode_delta_time(duration_ticks) + b'\x80' + pitch + b'\x00')
        current_tick += delta_tick + duration_ticks

    # End of Track is part of the track data and must be counted in the
    # MTrk chunk length.
    events.append(b'\x00\xFF\x2F\x00')
    track_data = b"".join(events)

    # Header chunk: length 6, format 0 (single track), 1 track, division.
    header = b'MThd' + struct.pack('>LHHH', 6, 0, 1, division)
    track_chunk = b'MTrk' + struct.pack('>L', len(track_data)) + track_data

    with open(output_path, 'wb') as f:
        f.write(header + track_chunk)
| # --- CONFIGURATION & UTILITY --- | |
# Mapping from key name ("<root> Maj" / "<root> Min") to Camelot wheel code.
# Canonical spellings are listed first; enharmonic aliases follow so that the
# sharp-spelled keys returned by detect_key (e.g. "C# Maj", "A# Maj") resolve.
KEY_TO_CAMELOT = {
    # Major keys (B ring)
    "C Maj": "8B", "G Maj": "9B", "D Maj": "10B", "A Maj": "11B", "E Maj": "12B",
    "B Maj": "1B", "F# Maj": "2B", "Db Maj": "3B", "Ab Maj": "4B", "Eb Maj": "5B",
    "Bb Maj": "6B", "F Maj": "7B",
    # Minor keys (A ring)
    "A Min": "8A", "E Min": "9A", "B Min": "10A", "F# Min": "11A", "C# Min": "12A",
    "G# Min": "1A", "D# Min": "2A", "Bb Min": "3A", "F Min": "4A", "C Min": "5A",
    "G Min": "6A", "D Min": "7A",
    # Enharmonic aliases
    "Gb Maj": "2B", "Cb Maj": "1B", "C# Maj": "3B", "D# Maj": "5B",
    "G# Maj": "4B", "A# Maj": "6B",
    "A# Min": "3A", "Eb Min": "2A", "Ab Min": "1A", "Db Min": "12A",
}


def get_harmonic_recommendations(key_str):
    """Return the keys that mix harmonically with ``key_str``.

    The compatible keys on the Camelot wheel are the relative major/minor
    (same number, other letter) and the two neighbours on the same ring
    (number +1 and -1, wrapping between 12 and 1).

    Args:
        key_str: Key name such as ``"C Maj"`` or ``"A Min"``.

    Returns:
        str: ``"<key> (<code>) | <key> (<code>) | <key> (<code>)"``, or a
        string starting with ``"N/A"`` when the key is not in
        ``KEY_TO_CAMELOT`` or its code cannot be parsed.
    """
    code = KEY_TO_CAMELOT.get(key_str)
    if code is None:
        return "N/A (Key not recognized or 'Unknown Key' detected.)"

    try:
        num = int(code[:-1])
    except ValueError:
        return "N/A (Error calculating recommendations.)"
    mode = code[-1]

    opposite_mode = 'B' if mode == 'A' else 'A'
    num_plus_one = (num % 12) + 1
    num_minus_one = 12 if num == 1 else num - 1
    recs = [f"{num}{opposite_mode}", f"{num_plus_one}{mode}", f"{num_minus_one}{mode}"]

    # Several spellings share a code; keep the first (canonical) name so an
    # alias never replaces it in the displayed recommendation.
    camelot_to_key = {}
    for name, camelot_code in KEY_TO_CAMELOT.items():
        camelot_to_key.setdefault(camelot_code, name)

    return " | ".join(
        f"{camelot_to_key.get(r_code, f'Code {r_code}')} ({r_code})" for r_code in recs
    )
def detect_key(y, sr):
    """Estimate the musical key of an audio signal.

    The summed chromagram is compared against the 12 rotations of a major
    and a minor key profile. Pearson correlation is used for the comparison
    so the result does not depend on the overall magnitude of either
    profile; the two profiles have different sums, which would otherwise
    bias the major/minor decision.

    Args:
        y: Audio samples passed to ``librosa.feature.chroma_stft``.
        sr: Sample rate of ``y``.

    Returns:
        str: ``"<pitch class> Maj"`` or ``"<pitch class> Min"`` using sharp
        spellings, or ``"Unknown Key"`` when the analysis fails or the
        signal carries no usable pitch information (e.g. silence).
    """
    try:
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        chroma_sums = np.sum(chroma, axis=1)

        total_energy = np.sum(chroma_sums)
        if not np.isfinite(total_energy) or total_energy <= 0:
            # Silent input: normalising would divide by zero.
            return "Unknown Key"
        chroma_norm = chroma_sums / total_energy

        # A perfectly flat chroma vector has no correlation with anything.
        chroma_centered = chroma_norm - np.mean(chroma_norm)
        chroma_scale = np.linalg.norm(chroma_centered)
        if chroma_scale == 0:
            return "Unknown Key"
        chroma_centered = chroma_centered / chroma_scale

        major_template = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
        minor_template = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17])
        pitch_classes = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']

        def rotation_correlations(template):
            """Correlate the chroma vector with every rotation of a profile."""
            centered = template - np.mean(template)
            centered = centered / np.linalg.norm(centered)
            # Rolling by i moves the profile's tonic onto pitch class i.
            return [float(np.dot(chroma_centered, np.roll(centered, i))) for i in range(12)]

        major_correlations = rotation_correlations(major_template)
        minor_correlations = rotation_correlations(minor_template)
        best_major_index = int(np.argmax(major_correlations))
        best_minor_index = int(np.argmax(minor_correlations))

        if major_correlations[best_major_index] > minor_correlations[best_minor_index]:
            return pitch_classes[best_major_index] + " Maj"
        return pitch_classes[best_minor_index] + " Min"
    except Exception as e:
        # Key detection is best-effort; callers fall back to "Unknown Key".
        print(f"Key detection failed: {e}")
        return "Unknown Key"
def reduce_reverb(audio_path, log_history):
    """Attenuate the steady ambient floor of an audio file.

    For every frequency bin, the 10th percentile of the STFT magnitude over
    time is taken as the ambient floor. Half of that floor, weighted by a
    frequency-dependent factor, is subtracted from the magnitude and the
    signal is resynthesised with the original phase.

    Args:
        audio_path: Path of the audio file to process.
        log_history: Log text accumulated so far.

    Returns:
        tuple: ``(path, log_history)``. ``path`` is a new ``*_dry.wav`` file
        next to the input on success, or ``audio_path`` unchanged when
        processing fails. ``log_history`` has one status line appended.
    """
    try:
        y, sr = librosa.load(audio_path, sr=None)
        n_fft = 2048
        hop_length = 512

        D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)
        mag = np.abs(D)
        phase = np.angle(D)

        ambient_floor = np.percentile(mag, 10, axis=1, keepdims=True)
        freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
        # Weight falls linearly from 1.0 at 0 Hz to a floor of 0.2 from
        # 800 Hz upwards.
        dampening_factor = np.clip(1 - (freqs / 1000.0), 0.2, 1.0)[:, np.newaxis]
        reduction_strength = 0.5
        ambient_reduction = ambient_floor * reduction_strength * dampening_factor

        mag_processed = np.maximum(mag - ambient_reduction, 0)
        D_processed = mag_processed * np.exp(1j * phase)
        y_processed = librosa.istft(D_processed, length=len(y), dtype=y.dtype, hop_length=hop_length)

        # Build the output name from the path stem so a non-".wav" input is
        # never overwritten and a ".wav" elsewhere in the path is untouched.
        path_stem, _ = os.path.splitext(audio_path)
        processed_path = path_stem + "_dry.wav"
        sf.write(processed_path, y_processed, sr)

        log_history += "✅ Reverb reduction applied to vocals. Using dry vocal track.\n"
        return processed_path, log_history
    except Exception as e:
        # Best-effort step: fall back to the unprocessed audio.
        log_history += f"⚠️ WARNING: Reverb reduction failed ({e}). Proceeding with wet vocal audio.\n"
        return audio_path, log_history
def apply_crossfade(audio_chunk, sr, fade_ms):
    """Fade a chunk in and out with the two halves of a Hann window.

    The rising half of the window scales the first samples and the falling
    half scales the last ones. The fade length is ``fade_ms`` converted to
    samples, shortened to half the chunk when the chunk is too short.

    Returns the input object itself when no fade is requested, the chunk is
    empty, or it is too short to fade; otherwise a faded copy.
    """
    if fade_ms <= 0 or len(audio_chunk) == 0:
        return audio_chunk

    total = len(audio_chunk)
    ramp_len = int(sr * (fade_ms / 1000.0))
    if total < 2 * ramp_len:
        # Not enough room for both ramps: let each take half the chunk.
        ramp_len = total // 2
        if ramp_len == 0:
            return audio_chunk

    faded = audio_chunk.copy()
    if ramp_len <= 0:
        return faded

    rise, fall = np.split(np.hanning(2 * ramp_len), 2)
    if faded.ndim != 1:
        # Add a channel axis so the ramps broadcast over every channel.
        rise = rise[:, np.newaxis]
        fall = fall[:, np.newaxis]

    faded[:ramp_len] *= rise
    faded[-ramp_len:] *= fall
    return faded
def generate_waveform_preview(y, sr, slice_samples, stem_name, loop_type, temp_dir):
    """Render the waveform with slice markers and save it as a PNG.

    Args:
        y: Audio samples; arrays with more than one dimension are
            transposed and mixed down to mono for display.
        sr: Sample rate of ``y``.
        slice_samples: Sample positions at which a dashed red vertical
            marker is drawn.
        stem_name: Stem name used in the plot title and the file name.
        loop_type: Slice type shown in the plot title.
        temp_dir: Directory in which the image is written.

    Returns:
        str: Path of the saved image.
    """
    img_path = os.path.join(temp_dir, f"{stem_name}_preview_{int(time.time() * 1000)}.png")

    fig = plt.figure(figsize=(10, 1.5))
    try:
        y_display = librosa.to_mono(y.T) if y.ndim > 1 else y
        librosa.display.waveshow(y_display, sr=sr, x_axis='time', color="#4a7098")

        slice_times = librosa.samples_to_time(slice_samples, sr=sr)
        for t in slice_times:
            plt.axvline(x=t, color='red', linestyle='--', linewidth=1, alpha=0.7)

        plt.title(f"{stem_name} Slices ({loop_type})", fontsize=10)
        plt.xlabel("")
        plt.yticks([])
        plt.tight_layout(pad=0)
        plt.savefig(img_path)
    finally:
        # Always release the figure, even when plotting or saving fails;
        # otherwise figures pile up in pyplot's global registry.
        plt.close(fig)

    return img_path
def apply_modulation(y, sr, bpm, rate, pan_depth, level_depth):
    """Apply tempo-synced sine LFOs for auto-pan and tremolo.

    Args:
        y: Audio array. A 1-D array is duplicated into two channels; a 2-D
            array is indexed as ``[sample, channel]`` and must have at
            least two channels. A 0-D array is returned unchanged.
        sr: Sample rate in Hz.
        bpm: Tempo used to derive the LFO frequency.
        rate: One of ``'1/2'``, ``'1/4'``, ``'1/8'``, ``'1/16'``; unknown
            values fall back to the ``'1/4'`` setting.
        pan_depth: Pan LFO depth; values <= 0 disable panning.
        level_depth: Tremolo depth; values <= 0 disable the tremolo.

    Returns:
        The modulated stereo array. The caller's array is never modified.
    """
    if y.ndim == 0:
        return y
    if y.ndim == 1:
        y = np.stack((y, y), axis=-1)
    elif pan_depth > 0 or level_depth > 0:
        # The gains below are applied in place; work on a copy so the
        # caller's buffer is left untouched.
        y = y.copy()

    N = len(y)
    duration_sec = N / sr

    rate_map = {'1/2': 0.5, '1/4': 1, '1/8': 2, '1/16': 4}
    beats_per_measure = rate_map.get(rate, 1)
    lfo_freq_hz = (bpm / 60.0) * (beats_per_measure / 4.0)
    t = np.linspace(0, duration_sec, N, endpoint=False)

    # Panning LFO: the left and right gains always sum to 1.
    if pan_depth > 0:
        pan_lfo = np.sin(2 * np.pi * lfo_freq_hz * t) * pan_depth
        y[:, 0] *= (1 - pan_lfo) / 2.0
        y[:, 1] *= (1 + pan_lfo) / 2.0

    # Level LFO (tremolo): gain swings between (1 - level_depth) and 1.
    if level_depth > 0:
        level_lfo = (np.sin(2 * np.pi * lfo_freq_hz * t) + 1) / 2.0
        gain_multiplier = (1 - level_depth) + (level_depth * level_lfo)
        y[:, 0] *= gain_multiplier
        y[:, 1] *= gain_multiplier

    return y
def apply_normalization_dbfs(y, target_dbfs):
    """Peak-normalise ``y`` so its largest absolute sample hits ``target_dbfs``.

    The input is returned untouched when the target is 0 dBFS or above, or
    when the signal's peak is not above 1e-6 (effectively silent).
    Otherwise a scaled copy, clipped to [-1.0, 1.0], is returned.
    """
    if target_dbfs >= 0:
        return y

    peak = np.max(np.abs(y))
    if not peak > 1e-6:
        return y

    desired_peak = 10 ** (target_dbfs / 20.0)
    scaled = y * (desired_peak / peak)
    return np.clip(scaled, -1.0, 1.0)
| # --- NEW UTILITY: TRANSIENT SHAPING --- | |
def apply_transient_shaping(y, sr, attack_gain, sustain_gain):
    """Reshape transients by blending an attack gain and a sustain gain.

    Two envelopes of the rectified (mono) signal are computed with
    normalised Hann windows: a fast one (5 ms half-width) and a slow one
    (50 ms half-width). Their ratio, clipped to [1, 5] and rescaled to
    [0, 1], tells how transient each sample is; the applied gain moves from
    ``sustain_gain`` (ratio 0) to ``attack_gain`` (ratio 1).

    ``y`` may be 1-D, or 2-D indexed as ``[sample, channel]``; the same gain
    curve is applied to every channel.
    """
    is_mono = y.ndim == 1
    detector = np.abs(y if is_mono else librosa.to_mono(y.T))

    def envelope(half_width_sec):
        """Smooth the detector signal with a unit-area Hann window."""
        kernel = windows.hann(int(sr * half_width_sec) * 2)
        kernel /= np.sum(kernel)
        return convolve(detector, kernel, mode='same')

    fast_envelope = envelope(0.005)  # 5ms
    slow_envelope = envelope(0.05)   # 50ms

    # How strongly the fast envelope exceeds the slow one; the small offset
    # avoids dividing by zero on silence.
    ratio = np.clip(fast_envelope / (slow_envelope + 1e-6), 1.0, 5.0)
    # 4.0 is the width of the clip range (5.0 - 1.0).
    transient_amount = (ratio - 1.0) / 4.0

    gain_envelope = (sustain_gain * (1 - transient_amount)) + (attack_gain * transient_amount)

    if is_mono:
        return y * gain_envelope
    return y * gain_envelope[:, np.newaxis]
| # --- NEW UTILITY: FILTER MODULATION --- | |
def apply_filter_modulation(y, sr, bpm, rate, filter_type, freq, depth):
    """Sweep the cutoff of a 2nd-order Butterworth filter with a synced LFO.

    The cutoff moves between ``freq`` and ``freq + depth`` following a sine
    LFO and is clipped to [20 Hz, sr/2 - 100 Hz]. Coefficients are
    recomputed once per 512-sample frame from the frame's mean cutoff, and
    the filter state is carried from frame to frame within each channel.

    Returns ``y`` itself when ``depth`` is 0; otherwise a new array indexed
    as ``[sample, channel]`` (1-D input is duplicated to two channels).
    """
    if depth == 0:
        return y

    if y.ndim == 1:
        y = np.stack((y, y), axis=-1)

    n_samples = len(y)
    cycles = {'1/2': 0.5, '1/4': 1, '1/8': 2, '1/16': 4}.get(rate, 1)
    lfo_hz = (bpm / 60.0) * (cycles / 4.0)
    timeline = np.linspace(0, n_samples / sr, n_samples, endpoint=False)

    # Sine LFO rescaled to the range 0..1.
    sweep = (np.sin(2 * np.pi * lfo_hz * timeline) + 1) / 2.0
    # Clipping keeps the cutoff inside the range the filter design accepts.
    cutoff_hz = np.clip(freq + (sweep * depth), 20.0, sr / 2.0 - 100)

    band = filter_type.lower().replace('-pass', '')
    block_len = 512

    # The coefficients depend only on the frame, not the channel, so they
    # are designed once here and reused for every channel below.
    frame_plan = []
    for lo in range(0, n_samples, block_len):
        hi = min(lo + block_len, n_samples)
        coeffs = butter(2, np.mean(cutoff_hz[lo:hi]), btype=band, fs=sr)
        frame_plan.append((lo, hi, coeffs))

    filtered = np.zeros_like(y)
    for ch in range(y.shape[1]):
        state = np.zeros(2)  # 2nd-order filter: two state values
        for lo, hi, (b, a) in frame_plan:
            filtered[lo:hi, ch], state = lfilter(b, a, y[lo:hi, ch], zi=state)

    return filtered
| # --- CORE SEPARATION FUNCTION (Truncated for brevity, focus on analysis) --- | |
async def separate_stems(audio_file_path, selected_model, denoise_enabled, reverb_reduction_enabled):
    """Analyse the uploaded track and publish the separated stems.

    This is an async generator: every ``yield`` is a dict mapping module
    level Gradio components/states to their new values.

    The separation step in this version is a placeholder that writes random
    noise to fixed paths under ``separated/htdemucs/input``; only the tempo
    and key analysis operate on the uploaded audio.

    Args:
        audio_file_path: Path of the uploaded audio file.
        selected_model: Not used by the placeholder implementation.
        denoise_enabled: Not used by the placeholder implementation.
        reverb_reduction_enabled: Not used by the placeholder implementation.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if audio_file_path is None:
        raise gr.Error("No audio file uploaded!")

    log_history = "Starting separation...\n"
    yield {status_log: log_history, detected_bpm_key: "", harmonic_recs: "---"}

    # 1. Pre-process and analyze original audio
    detected_bpm = 120
    detected_key = "Unknown Key"
    try:
        y_orig, sr_orig = librosa.load(audio_file_path, sr=None)
        y_mono = librosa.to_mono(y_orig.T) if y_orig.ndim > 1 else y_orig

        tempo, _ = librosa.beat.beat_track(y=y_mono, sr=sr_orig)
        detected_bpm = 120 if tempo is None or tempo == 0 else int(np.round(tempo).item())
        detected_key = detect_key(y_mono, sr_orig)
        harmonic_recommendations = get_harmonic_recommendations(detected_key)

        status_string = f"Detected Tempo: {detected_bpm} BPM. Detected Key: {detected_key}. Proceeding with separation...\n"
        log_history += status_string
        yield {
            status_log: log_history,
            detected_bpm_key: f"{detected_bpm} BPM, {detected_key}",
            harmonic_recs: harmonic_recommendations,
        }
    except Exception as e:
        # Analysis is best-effort; keep the defaults and carry on.
        log_history += f"⚠️ WARNING: Analysis failed ({e}). Defaulting to 120 BPM, Unknown Key.\n"
        harmonic_recommendations = "N/A (Analysis failed)"
        yield {
            status_log: log_history,
            detected_bpm_key: f"{detected_bpm} BPM, {detected_key}",
            harmonic_recs: harmonic_recommendations,
        }

    # --- Truncated Demucs Output Placeholder (For Demonstrating Success) ---
    # Mock file paths and generation for demo purposes
    vocals_path = "separated/htdemucs/input/vocals.wav"
    drums_path = "separated/htdemucs/input/drums.wav"
    bass_path = "separated/htdemucs/input/bass.wav"
    other_path = "separated/htdemucs/input/other.wav"
    guitar_path = None
    piano_path = None

    mock_sr = 44100
    mock_duration = 10
    mock_y = np.random.uniform(low=-0.5, high=0.5, size=(mock_sr * mock_duration, 2)).astype(np.float32)

    os.makedirs(os.path.dirname(vocals_path), exist_ok=True)
    for mock_path in (vocals_path, drums_path, bass_path, other_path):
        sf.write(mock_path, mock_y, mock_sr)
    # --- End Truncated Demucs Output Placeholder ---

    log_history += "✅ Stem separation complete! (Mock files generated for demo)\n"
    # Only components that already exist may be used as keys here: a
    # gr.Textbox(...) constructed at this point is a brand-new component
    # that can never match one of the event's declared outputs.
    yield {
        status_log: log_history,
        vocals_output: gr.update(value=vocals_path, visible=True),
        drums_output: gr.update(value=drums_path, visible=True),
        bass_output: gr.update(value=bass_path, visible=True),
        other_output: gr.update(value=other_path, visible=True),
        guitar_output: gr.update(value=guitar_path, visible=False),
        piano_output: gr.update(value=piano_path, visible=False),
        detected_bpm_key: f"{detected_bpm} BPM, {detected_key}",
        harmonic_recs: harmonic_recommendations,
    }
| # --- CORE SLICING FUNCTION (UPDATED for MIDI and Rich Tagging) --- | |
def slice_stem_real(stem_audio_data, loop_choice, sensitivity, stem_name, manual_bpm, time_signature, crossfade_ms, transpose_semitones, detected_key, pan_depth, level_depth, modulation_rate, target_dbfs, attack_gain, sustain_gain, filter_type, filter_freq, filter_depth):
    """Transform one stem, slice it, and write the slices to a temp directory.

    Processing order: pitch shift, transient shaping (drums only), filter
    LFO, pan/level LFO, peak normalisation, optional MIDI extraction, then
    slicing into bar loops or one-shots, and finally a waveform preview.

    Args:
        stem_audio_data: ``(sample_rate, samples)`` tuple or ``None``.
            ``samples`` is 1-D or indexed as ``[sample, channel]``.
        loop_choice: Slice type label; one containing ``"Bar Loops"`` (with
            a leading bar count) or ``"One-Shots"``.
        sensitivity: ``delta`` passed to librosa's onset detector.
        stem_name: Stem name used for tagging and drum/melodic decisions.
        manual_bpm: Tempo used for loop length, LFO rate and tagging.
        time_signature: ``"4/4"`` or ``"3/4"``.
        crossfade_ms: Fade length applied to one-shot slices.
        transpose_semitones: Pitch shift in semitones (0 disables).
        detected_key: Key name such as ``"C Maj"`` used for tagging.
        pan_depth, level_depth: LFO depths in percent.
        modulation_rate: LFO rate label shared by both LFOs.
        target_dbfs: Peak normalisation target; applied when negative.
        attack_gain, sustain_gain: Transient shaping gains.
        filter_type, filter_freq, filter_depth: Filter LFO settings.

    Returns:
        tuple: ``(items, temp_dir)`` where ``items`` is a list of
        ``(file_path, preview_image_path)`` tuples for every WAV/MIDI file
        written, and ``temp_dir`` is the directory holding them (``None``
        when there was no audio to process). The caller is responsible for
        removing ``temp_dir``.
    """
    if stem_audio_data is None:
        return [], None

    sample_rate, y_int = stem_audio_data
    y_raw = np.asarray(y_int)
    if y_raw.ndim == 0 or y_raw.size == 0:
        return [], None

    # Convert to float while keeping the [sample, channel] layout. Reading
    # the raw buffer instead would flatten stereo data into one interleaved
    # channel.
    if np.issubdtype(y_raw.dtype, np.integer):
        full_scale = float(np.iinfo(y_raw.dtype).max) + 1.0
        y = y_raw.astype(np.float32) / full_scale
    else:
        y = y_raw.astype(np.float32)

    # Analysis (onsets, pitch tracking) runs on the untransformed mono mix.
    y_mono = librosa.to_mono(y.T) if y.ndim > 1 else y

    # --- 1. PITCH SHIFTING (if enabled) ---
    if transpose_semitones != 0:
        if y.ndim > 1:
            # librosa treats the last axis as time, so go channel-first.
            shifted = librosa.effects.pitch_shift(y.T, sr=sample_rate, n_steps=transpose_semitones)
            y = np.ascontiguousarray(shifted.T)
        else:
            y = librosa.effects.pitch_shift(y, sr=sample_rate, n_steps=transpose_semitones)

    # --- 2. TRANSIENT SHAPING (Drums Only) ---
    shape_transients = stem_name == "drums" and (attack_gain != 1.0 or sustain_gain != 1.0)
    if shape_transients:
        y = apply_transient_shaping(y, sample_rate, attack_gain, sustain_gain)

    # --- 3. FILTER MODULATION (LFO 2.0) ---
    if filter_depth > 0:
        y = apply_filter_modulation(y, sample_rate, manual_bpm, modulation_rate, filter_type, filter_freq, filter_depth)

    # --- 4. PAN/LEVEL MODULATION ---
    normalized_pan_depth = pan_depth / 100.0
    normalized_level_depth = level_depth / 100.0
    if normalized_pan_depth > 0 or normalized_level_depth > 0:
        y = apply_modulation(y, sample_rate, manual_bpm, modulation_rate, normalized_pan_depth, normalized_level_depth)

    # Rich tagging: flag files that went through any transformation.
    is_modified = (
        transpose_semitones != 0
        or normalized_pan_depth > 0
        or normalized_level_depth > 0
        or filter_depth > 0
        or shape_transients
    )
    mod_tag = "_MOD" if is_modified else ""

    # --- 5. NORMALIZATION ---
    if target_dbfs < 0:
        y = apply_normalization_dbfs(y, target_dbfs)

    # --- 6. DETERMINE BPM & KEY (FOR RICH TAGGING) ---
    bpm_int = int(manual_bpm)
    bpm_tag = f"{bpm_int}BPM"
    time_sig_tag = time_signature.replace("/", "")
    key_tag = detected_key.replace(" ", "")

    if transpose_semitones != 0:
        root, _, mode = detected_key.partition(" ")
        pitch_classes = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
        # Keep the original tag when the root is not a known pitch class
        # (e.g. "Unknown Key").
        if root in pitch_classes:
            # Sliders may deliver floats; list indices must be integers.
            new_index = (pitch_classes.index(root) + int(transpose_semitones)) % 12
            key_tag = f"{pitch_classes[new_index]}{mode}Shift"

    # --- 7. MIDI GENERATION (Melodic Stems) ---
    output_files = []
    loops_dir = tempfile.mkdtemp()
    is_melodic = stem_name in ["vocals", "bass", "guitar", "piano", "other"]

    if is_melodic and ("Bar Loops" in loop_choice):
        try:
            pitches, magnitudes = librosa.piptrack(y=y_mono, sr=sample_rate)
            # Per frame, keep the pitch of the strongest bin.
            main_pitch_line = np.zeros(pitches.shape[1])
            for t in range(pitches.shape[1]):
                index = magnitudes[:, t].argmax()
                main_pitch_line[t] = pitches[index, t]

            midi_line = [freq_to_midi(f) for f in main_pitch_line]
            notes_list = []
            # Segment into notes: one note per run of identical pitches.
            i = 0
            while i < len(midi_line):
                current_midi = midi_line[i]
                j = i
                while j < len(midi_line) and midi_line[j] == current_midi:
                    j += 1
                duration_frames = j - i
                # Ignore unpitched runs and extremely short notes.
                if current_midi != 0 and duration_frames >= 2:
                    start_sec = librosa.frames_to_time(i, sr=sample_rate, hop_length=512)
                    duration_sec = librosa.frames_to_time(duration_frames, sr=sample_rate, hop_length=512)
                    notes_list.append((current_midi, start_sec, duration_sec))
                i = j

            full_stem_midi_path = os.path.join(loops_dir, f"{stem_name}_MELODY_{key_tag}_{bpm_tag}{mod_tag}.mid")
            write_midi_file(notes_list, manual_bpm, full_stem_midi_path)
            # No file is written when no notes were found; listing a
            # missing path would break later packaging.
            if os.path.exists(full_stem_midi_path):
                output_files.append(full_stem_midi_path)
        except Exception as e:
            # MIDI is optional; audio slicing continues regardless.
            print(f"MIDI generation failed for {stem_name}: {e}")

    # --- 8. CALCULATE TIMING & SLICING ---
    beats_per_bar = 3 if time_signature == "3/4" else 4
    slice_samples = []

    if "Bar Loops" in loop_choice:
        if bpm_int <= 0:
            return [], loops_dir
        bars = int(loop_choice.split(" ")[0])
        loop_type_tag = f"{bars}Bar"
        loop_duration_samples = int((60.0 / bpm_int * beats_per_bar * bars) * sample_rate)
        if loop_duration_samples == 0:
            return [], loops_dir

        num_loops = len(y) // loop_duration_samples
        for i in range(num_loops):
            start_sample = i * loop_duration_samples
            end_sample = start_sample + loop_duration_samples
            slice_data = y[start_sample:end_sample]

            filename = os.path.join(loops_dir, f"{stem_name}_{loop_type_tag}_{i+1:03d}_{key_tag}_{bpm_tag}_{time_sig_tag}{mod_tag}.wav")
            sf.write(filename, slice_data, sample_rate, subtype='PCM_16')
            output_files.append(filename)
            slice_samples.append(start_sample)

    elif "One-Shots" in loop_choice:
        loop_type_tag = "OneShot"
        onset_frames = librosa.onset.onset_detect(
            y=y_mono, sr=sample_rate, delta=sensitivity,
            wait=1, pre_avg=1, post_avg=1, post_max=1, units='frames'
        )
        onset_samples = librosa.frames_to_samples(onset_frames)

        if len(onset_samples) > 0:
            num_onsets = len(onset_samples)
            slice_samples = list(onset_samples)
            for i, start_sample in enumerate(onset_samples):
                # Each one-shot runs until the next onset (or the end).
                end_sample = onset_samples[i + 1] if i + 1 < num_onsets else len(y)
                slice_data = y[start_sample:end_sample]
                if crossfade_ms > 0:
                    slice_data = apply_crossfade(slice_data, sample_rate, crossfade_ms)

                filename = os.path.join(loops_dir, f"{stem_name}_{loop_type_tag}_{i+1:03d}_{key_tag}_{bpm_tag}{mod_tag}.wav")
                sf.write(filename, slice_data, sample_rate, subtype='PCM_16')
                output_files.append(filename)

    if not output_files:
        return [], loops_dir

    # --- 9. VISUALIZATION GENERATION ---
    img_path = generate_waveform_preview(y, sample_rate, slice_samples, stem_name, loop_choice, loops_dir)

    # Pair every generated file with the single slice-map image.
    return [(audio_file, img_path) for audio_file in output_files if audio_file.endswith(('.wav', '.mid'))], loops_dir
| # --- SLICING HANDLERS (UPDATED for NEW Inputs) --- | |
async def slice_all_and_zip_real(vocals, drums, bass, other, guitar, piano, loop_choice, sensitivity, manual_bpm, time_signature, crossfade_ms, transpose_semitones, detected_bpm_key_str, pan_depth, level_depth, modulation_rate, target_dbfs, attack_gain, sustain_gain, filter_type, filter_freq, filter_depth):
    """Slice every available stem and package the results into a ZIP file.

    This is an async generator: every ``yield`` is a dict mapping module
    level Gradio components to their new values (progress log, and finally
    the download file).

    Args:
        vocals, drums, bass, other, guitar, piano: Per-stem
            ``(sample_rate, samples)`` tuples, or ``None`` for missing stems.
        detected_bpm_key_str: Text of the form ``"<bpm> BPM, <key>"``; the
            part after ``", "`` is used as the key, falling back to
            ``"Unknown Key"``.
        Remaining arguments are forwarded unchanged to ``slice_stem_real``.

    Raises:
        gr.Error: If no stem is available.
    """
    log_history = "Starting batch slice...\n"
    yield {status_log: log_history}
    await asyncio.sleep(0.1)

    # The state may still be empty/None if analysis never ran.
    parts = (detected_bpm_key_str or "").split(', ')
    key_str = parts[1] if len(parts) > 1 else "Unknown Key"

    stems_to_process = {
        "vocals": vocals, "drums": drums, "bass": bass,
        "other": other, "guitar": guitar, "piano": piano
    }
    zip_path = "Loop_Architect_Pack.zip"

    if all(data is None for data in stems_to_process.values()):
        raise gr.Error("No stems to process! Please separate stems first.")

    all_temp_dirs = []
    try:
        with zipfile.ZipFile(zip_path, 'w') as zf:
            for name, data in stems_to_process.items():
                if data is None:
                    continue

                log_history += f"--- Slicing {name} stem ---\n"
                yield {status_log: log_history}

                sliced_files_and_viz, temp_dir = slice_stem_real(
                    (data[0], data[1]), loop_choice, sensitivity, name,
                    manual_bpm, time_signature, crossfade_ms, transpose_semitones, key_str,
                    pan_depth, level_depth, modulation_rate, target_dbfs,
                    attack_gain, sustain_gain, filter_type, filter_freq, filter_depth
                )
                # Track the directory even when nothing was sliced so it is
                # still removed in the finally block.
                if temp_dir:
                    all_temp_dirs.append(temp_dir)

                if sliced_files_and_viz:
                    midi_count = sum(1 for f, _ in sliced_files_and_viz if f.endswith('.mid'))
                    wav_count = sum(1 for f, _ in sliced_files_and_viz if f.endswith('.wav'))
                    log_history += f"Generated {wav_count} WAV slices and {midi_count} MIDI files for {name}.\n"

                    for loop_file, _ in sliced_files_and_viz:
                        # WAVs go into a folder per stem, MIDIs into "MIDI".
                        folder = 'MIDI' if loop_file.endswith('.mid') else name
                        arcname = os.path.join(folder, os.path.basename(loop_file))
                        zf.write(loop_file, arcname)
                else:
                    log_history += f"No slices generated for {name}.\n"

                yield {status_log: log_history}

        log_history += "Packaging complete! WAVs and corresponding MIDIs are organized in the ZIP.\n"
        yield {
            status_log: log_history + "✅ Pack ready for download!",
            download_zip_file: gr.update(value=zip_path, visible=True)
        }
    except Exception as e:
        # Top-level boundary for the UI: report the failure in the log.
        print(f"An error occurred during slice all: {e}")
        yield {status_log: log_history + f"❌ ERROR: {e}"}
    finally:
        for d in all_temp_dirs:
            if d and os.path.exists(d):
                shutil.rmtree(d)
| # --- Create the full Gradio Interface --- | |
| with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="red")) as demo: | |
| # State variables | |
| detected_bpm_key = gr.State(value="") | |
| harmonic_recs = gr.State(value="---") | |
| # Define outputs globally | |
| vocals_output = gr.Audio(label="Vocals", scale=4, visible=False) | |
| drums_output = gr.Audio(label="Drums", scale=4, visible=False) | |
| bass_output = gr.Audio(label="Bass", scale=4, visible=False) | |
| other_output = gr.Audio(label="Other / Instrumental", scale=4, visible=False) | |
| guitar_output = gr.Audio(label="Guitar", scale=4, visible=False) | |
| piano_output = gr.Audio(label="Piano", scale=4, visible=False) | |
| download_zip_file = gr.File(label="Download Your Loop Pack", visible=False) | |
| status_log = gr.Textbox(label="Status Log", lines=10, interactive=False) | |
| loop_gallery = gr.Gallery( | |
| label="Generated Loops Preview (Audio + Waveform Slice Map)", | |
| columns=8, object_fit="contain", height="auto", preview=True, | |
| type="numpy" | |
| ) | |
| gr.Markdown("# 🎵 Loop Architect (Pro Edition)") | |
| gr.Markdown("Upload any song to separate it into stems, detect musical attributes, and then slice and tag the stems for instant use in a DAW.") | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| gr.Markdown("### 1. Separate Stems") | |
| audio_input = gr.Audio(type="filepath", label="Upload a Track") | |
| with gr.Row(): | |
| reverb_reduction_option = gr.Checkbox( | |
| label="Dry Vocals", | |
| value=False, | |
| info="Reduce reverb on the vocal stem." | |
| ) | |
| model_selector = gr.Radio( | |
| ["htdemucs (High Quality 4-Stem)", "hdemucs (Faster 4-Stem)", "htdemucs_6s (6-Stem)", "2-Stem (Vocals Only)"], | |
| label="Separation Model Control", | |
| value="htdemucs (High Quality 4-Stem)" | |
| ) | |
| submit_button = gr.Button("Separate & Analyze Stems", variant="primary") | |
| gr.Markdown("### 2. Analysis & Transform") | |
| # Key/BPM Display | |
| gr.Textbox(label="Detected Tempo & Key", value="", interactive=False, elem_id="detected_bpm_key_output", placeholder="Run Separation to Analyze...", render=True, visible=True) | |
| # Harmonic Recommendations Display | |
| gr.Textbox(label="Harmonic Mixing Recommendations (Camelot Wheel)", value="---", interactive=False, elem_id="harmonic_recs_output", render=True, visible=True) | |
| # Transpose Control | |
| transpose_slider = gr.Slider( | |
| minimum=-12, maximum=12, value=0, step=1, | |
| label="Transpose Loops (Semitones)", | |
| info="Shift the pitch of all slices by +/- 1 octave. (Tags the file with `Shift`)" | |
| ) | |
| # --- TRANSIENT SHAPING --- | |
| gr.Markdown("### Transient Shaping (Drums Only)") | |
| with gr.Group(): | |
| attack_gain_slider = gr.Slider( | |
| minimum=0.5, maximum=1.5, value=1.0, step=0.1, | |
| label="Attack Gain Multiplier", | |
| info="Increase (>1.0) for punchier transients." | |
| ) | |
# Sustain gain multiplier: 0.5-1.5 in 0.1 steps, default 1.0. Forwarded to the
# slicing handlers through slice_inputs and the "slice all" input list.
| sustain_gain_slider = gr.Slider( | |
| minimum=0.5, maximum=1.5, value=1.0, step=0.1, | |
| label="Sustain Gain Multiplier", | |
| info="Increase (>1.0) for longer tails/reverb." | |
| ) | |
| # --- MODULATION (PAN/LEVEL) --- | |
| gr.Markdown("### Pan/Level Modulation (LFO 1.0)") | |
| with gr.Group(): | |
# Rate selector shared by the pan and level modulation; the choices are the
# strings '1/2', '1/4', '1/8', '1/16' and the default is '1/4'.
| modulation_rate_radio = gr.Radio( | |
| ['1/2', '1/4', '1/8', '1/16'], | |
| label="Modulation Rate (Tempo Synced)", | |
| value='1/4', | |
| info="The speed of the Pan/Level pulse." | |
| ) | |
# Pan modulation depth as a percentage (0-100, step 5, default 0).
| pan_depth_slider = gr.Slider( | |
| minimum=0, maximum=100, value=0, step=5, | |
| label="Pan Modulation Depth (%)", | |
| info="Creates a stereo auto-pan effect." | |
| ) | |
# Level modulation depth as a percentage (0-100, step 5, default 0).
| level_depth_slider = gr.Slider( | |
| minimum=0, maximum=100, value=0, step=5, | |
| label="Level Modulation Depth (%)", | |
| info="Creates a tempo-synced tremolo (volume pulse)." | |
| ) | |
| # --- FILTER MODULATION --- | |
| gr.Markdown("### Filter Modulation (LFO 2.0)") | |
| with gr.Group(): | |
# Filter mode passed to the slicing handlers as the string 'Low-Pass' or
# 'High-Pass'.
| filter_type_radio = gr.Radio( | |
| ['Low-Pass', 'High-Pass'], | |
| label="Filter Type", | |
| value='Low-Pass' | |
| ) | |
| with gr.Row(): | |
# Base cutoff in Hz (20-10000, step 10, default 2000).
| filter_freq_slider = gr.Slider( | |
| minimum=20, maximum=10000, value=2000, step=10, | |
| label="Base Cutoff Frequency (Hz)", | |
| ) | |
# Modulation depth in Hz (0-5000, step 10, default 0).
| filter_depth_slider = gr.Slider( | |
| minimum=0, maximum=5000, value=0, step=10, | |
| label="Modulation Depth (Hz)", | |
| info="0 = Static filter at Base Cutoff. Modifying any value tags the file with `MOD`." | |
| ) | |
| gr.Markdown("### 3. Slicing Options") | |
| with gr.Group(): | |
| # Normalization Control | |
# NOTE(review): the variable name says LUFS, but the label/info describe a
# peak level in dBFS and the per-stem handler receives it as `target_dbfs`.
| lufs_target_slider = gr.Slider( | |
| minimum=-18.0, maximum=-0.1, value=-3.0, step=0.1, | |
| label="Target Peak Level (dBFS)", | |
| info="Normalizes all exported loops to this peak volume." | |
| ) | |
# Slice mode; the selected string is passed verbatim to the slicing handlers.
| loop_options_radio = gr.Radio( | |
| ["One-Shots (All Transients)", "4 Bar Loops", "8 Bar Loops"], | |
| label="Slice Type", | |
| value="One-Shots (All Transients)", | |
| info="Bar Loops include automatic MIDI generation for melodic stems." | |
| ) | |
| with gr.Row(): | |
# Manual BPM entry, limited to 40-300 with a default of 120.
| bpm_input = gr.Number( | |
| label="Manual BPM", | |
| value=120, | |
| minimum=40, | |
| maximum=300, | |
| info="Overrides auto-detect for loop timing." | |
| ) | |
# Time signature passed to the handlers as the string "4/4" or "3/4".
| time_sig_radio = gr.Radio( | |
| ["4/4", "3/4"], | |
| label="Time Signature", | |
| value="4/4", | |
| info="For correct bar length. (Tags the file with `44` or `34`)" | |
| ) | |
# One-shot detection sensitivity (0.01-0.5, step 0.01, default 0.05).
| sensitivity_slider = gr.Slider( | |
| minimum=0.01, maximum=0.5, value=0.05, step=0.01, | |
| label="One-Shot Sensitivity", | |
| info="Lower values = more slices." | |
| ) | |
# Crossfade length in milliseconds for one-shot slices (0-30, default 10).
| crossfade_ms_slider = gr.Slider( | |
| minimum=0, maximum=30, value=10, step=1, | |
| label="One-Shot Crossfade (ms)", | |
| info="Prevents clicks/pops on transient slices." | |
| ) | |
# Section 4: one button that slices every stem and packages the result as a ZIP.
gr.Markdown("### 4. Create Pack (Rich Tagging & MIDI)")
slice_all_button = gr.Button("Slice, Transform & Tag ALL Stems (Create ZIP)", variant="stop")
# A bare `download_zip_file` expression is a no-op, so the download component
# was never placed in the layout. Render it here, the same way status_log is
# rendered below (presumably both are created before the layout is built --
# confirm against the component definitions earlier in the file).
download_zip_file.render()
gr.Markdown("### Status")
status_log.render()
# Second column: stem previews, per-stem slice buttons and the slice gallery.
| with gr.Column(scale=2): | |
| with gr.Accordion("Separated Stems (Preview & Slice)", open=True): | |
| # Base slice inputs - ALL inputs for slice_stem_real | |
# The order mirrors the parameters of slice_and_display_wrapper after
# `stem_data`. Index 2 is a hidden placeholder: each per-stem click handler
# splices in its own hidden Textbox holding the stem name via
# slice_inputs[:2] + [...] + slice_inputs[3:].
# transpose_slider and detected_bpm_key are defined outside this section.
| slice_inputs = [ | |
| loop_options_radio, sensitivity_slider, gr.Textbox(visible=False), # Placeholder for stem name | |
| bpm_input, time_sig_radio, crossfade_ms_slider, transpose_slider, detected_bpm_key, | |
| pan_depth_slider, level_depth_slider, modulation_rate_radio, | |
| lufs_target_slider, | |
| attack_gain_slider, sustain_gain_slider, | |
| filter_type_radio, filter_freq_slider, filter_depth_slider | |
| ] | |
| # Wrapper function to call slice_stem_real and update the gallery | |
def slice_and_display_wrapper(
    stem_data, loop_choice, sensitivity, stem_name, manual_bpm, time_signature,
    crossfade_ms, transpose_semitones, detected_bpm_key_str,
    pan_depth, level_depth, modulation_rate, target_dbfs,
    attack_gain, sustain_gain, filter_type, filter_freq, filter_depth,
):
    """Slice a single stem with slice_stem_real and refresh the gallery and log.

    The second ', '-separated field of ``detected_bpm_key_str`` is forwarded to
    slice_stem_real as the key; "Unknown Key" is used when there is no second
    field. All other arguments are passed through unchanged.

    Returns:
        A dict keyed by component: ``loop_gallery`` maps to a ``gr.update``
        carrying one ``(wav_path, label, first_image_path)`` tuple per sliced
        WAV, and ``status_log`` maps to a summary message.

    Raises:
        gr.Error: if ``detected_bpm_key_str`` is empty, i.e. no analysis
            result is available yet.
    """
    if not detected_bpm_key_str:
        raise gr.Error("Please run 'Separate & Analyze Stems' first.")

    analysis_fields = detected_bpm_key_str.split(', ')
    key_str = analysis_fields[1] if len(analysis_fields) > 1 else "Unknown Key"

    # temp_dir is returned by the slicer but no cleanup is performed on it here
    # (presumably because the gallery still points at files inside it).
    sliced_files_and_viz, temp_dir = slice_stem_real(
        stem_data, loop_choice, sensitivity, stem_name,
        manual_bpm, time_signature, crossfade_ms, transpose_semitones, key_str,
        pan_depth, level_depth, modulation_rate, target_dbfs,
        attack_gain, sustain_gain, filter_type, filter_freq, filter_depth
    )

    if not sliced_files_and_viz:
        gallery_output = []
        log_msg = f"No slices generated for {stem_name}."
    else:
        # Every gallery entry reuses the visualization of the first result.
        first_image_path = sliced_files_and_viz[0][1]
        wav_paths = [path for path, _ in sliced_files_and_viz if path.endswith('.wav')]
        midi_count = sum(1 for path, _ in sliced_files_and_viz if path.endswith('.mid'))
        gallery_output = [
            (wav_path, os.path.basename(wav_path).rsplit('.', 1)[0], first_image_path)
            for wav_path in wav_paths
        ]
        log_msg = f"✅ Sliced {stem_name} into {len(wav_paths)} WAVs and generated {midi_count} MIDIs. Waveform preview generated."

    return {
        loop_gallery: gr.update(value=gallery_output),
        status_log: log_msg
    }
def update_output_visibility(selected_model):
    """Build the visibility updates for the stem outputs for a given model name.

    Returns a tuple of eight ``gr.update`` objects. As wired to
    ``model_selector.change`` they apply, in order, to: vocals, drums, bass,
    other, guitar, piano, guitar row, piano row.

    - The first and fourth entries are always visible.
    - The second and third are hidden when the name contains "2-Stem".
    - The fourth is relabelled for 2-stem and 6-stem models.
    - The last four are visible only when the name contains "6-Stem".
    """
    six_stem = "6-Stem" in selected_model
    two_stem = "2-Stem" in selected_model

    if two_stem:
        other_label = "Instrumental (No Vocals)"
    elif six_stem:
        other_label = "Other (No Guitar/Piano)"
    else:
        other_label = "Other"

    show_when_not_two_stem = not two_stem
    return (
        gr.update(visible=True),
        gr.update(visible=show_when_not_two_stem),
        gr.update(visible=show_when_not_two_stem),
        gr.update(visible=True, label=other_label),
        # Four updates gated on the 6-stem model.
        *(gr.update(visible=six_stem) for _ in range(4)),
    )
# Each row renders a stem output component that is referenced here but not
# created in this section, and pairs it with a dedicated slice button.
| with gr.Row(): | |
| vocals_output.render() | |
| slice_vocals_btn = gr.Button("Slice Vocals", scale=1) | |
| with gr.Row(): | |
| drums_output.render() | |
| slice_drums_btn = gr.Button("Slice Drums", scale=1) | |
| with gr.Row(): | |
| bass_output.render() | |
| slice_bass_btn = gr.Button("Slice Bass", scale=1) | |
| with gr.Row(): | |
| other_output.render() | |
| slice_other_btn = gr.Button("Slice Other", scale=1) | |
# The guitar and piano rows start hidden; update_output_visibility sends them
# visible=True only when the selected model name contains "6-Stem".
| with gr.Row(visible=False) as guitar_row: | |
| guitar_output.render() | |
| slice_guitar_btn = gr.Button("Slice Guitar", scale=1) | |
| with gr.Row(visible=False) as piano_row: | |
| piano_output.render() | |
| slice_piano_btn = gr.Button("Slice Piano", scale=1) | |
| gr.Markdown("### Sliced Loops / Samples (Preview)") | |
# Gallery updated by slice_and_display_wrapper with the sliced WAV entries.
| loop_gallery.render() | |
| # --- MAIN EVENT LISTENERS --- | |
| # 1. Separation Event | |
# Runs separate_stems when submit_button is clicked. Ten output components
# are listed: six stem outputs, the status log, detected_bpm_key, and two
# Textboxes.
# NOTE(review): the gr.File and gr.Checkbox inputs and the last two Textbox
# outputs are constructed inline here instead of referencing components
# defined elsewhere -- confirm they are the components the user actually
# interacts with / sees.
| submit_button.click( | |
| fn=separate_stems, | |
| inputs=[gr.File(type="filepath"), model_selector, gr.Checkbox(visible=False), reverb_reduction_option], | |
| outputs=[ | |
| vocals_output, drums_output, bass_output, other_output, | |
| guitar_output, piano_output, | |
| status_log, detected_bpm_key, | |
| gr.Textbox(elem_id="detected_bpm_key_output"), | |
| gr.Textbox(elem_id="harmonic_recs_output") | |
| ] | |
| ) | |
| # 2. UI Visibility Event | |
# The eight outputs line up positionally with the eight gr.update values
# returned by update_output_visibility.
| model_selector.change( | |
| fn=update_output_visibility, | |
| inputs=[model_selector], | |
| outputs=[ | |
| vocals_output, drums_output, bass_output, other_output, | |
| guitar_output, piano_output, | |
| guitar_row, piano_row | |
| ] | |
| ) | |
| # --- Single Slice Button Events --- | |
# Wire every per-stem slice button to slice_and_display_wrapper. The inputs are
# the stem's audio component followed by slice_inputs, with the placeholder at
# index 2 replaced by a hidden Textbox whose value is the stem name.
for stem_label, stem_component, stem_button in (
    ("vocals", vocals_output, slice_vocals_btn),
    ("drums", drums_output, slice_drums_btn),
    ("bass", bass_output, slice_bass_btn),
    ("other", other_output, slice_other_btn),
    ("guitar", guitar_output, slice_guitar_btn),
    ("piano", piano_output, slice_piano_btn),
):
    stem_button.click(
        fn=slice_and_display_wrapper,
        inputs=[stem_component] + slice_inputs[:2] + [gr.Textbox(stem_label, visible=False)] + slice_inputs[3:],
        outputs=[loop_gallery, status_log],
    )
| # 3. Slice All Event | |
# Runs slice_all_and_zip_real over all six stem outputs with the same
# slicing/transform controls used by the per-stem buttons (no stem-name
# field), writing to download_zip_file and status_log. The object returned by
# .click() is kept in slice_all_event.
| slice_all_event = slice_all_button.click( | |
| fn=slice_all_and_zip_real, | |
| inputs=[ | |
| vocals_output, drums_output, bass_output, other_output, guitar_output, piano_output, | |
| loop_options_radio, sensitivity_slider, | |
| bpm_input, time_sig_radio, crossfade_ms_slider, transpose_slider, detected_bpm_key, | |
| pan_depth_slider, level_depth_slider, modulation_rate_radio, lufs_target_slider, | |
| attack_gain_slider, sustain_gain_slider, | |
| filter_type_radio, filter_freq_slider, filter_depth_slider | |
| ], | |
| outputs=[download_zip_file, status_log] | |
| ) | |