import gradio as gr
import pretty_midi
import numpy as np
import tempfile
import os
import librosa
import torch
import torchaudio
from pathlib import Path
import soundfile as sf
import io


class MP3ToHumanized:
    """Convert audio files to MIDI and apply "humanizing" timing/velocity jitter.

    Three conversion strategies are offered (onset-based, melody-tracking,
    rhythm-only); the resulting MIDI is then perturbed with small random
    timing/velocity offsets so it sounds less mechanically quantized.
    """

    def __init__(self):
        # Per-instrument randomization parameters:
        #   timing_var   - std-dev (seconds) of the note-start jitter
        #   velocity_var - maximum absolute velocity offset
        #   swing_factor - reserved for future swing support (currently unused)
        self.groove_profiles = {
            "drums": {"timing_var": 0.02, "velocity_var": 15, "swing_factor": 0.1},
            "melody": {"timing_var": 0.01, "velocity_var": 10, "swing_factor": 0.05},
            "bass": {"timing_var": 0.015, "velocity_var": 12, "swing_factor": 0.07},
            "chords": {"timing_var": 0.008, "velocity_var": 8, "swing_factor": 0.03},
            "other": {"timing_var": 0.01, "velocity_var": 10, "swing_factor": 0.05},
        }

    def audio_to_midi(self, audio_path, conversion_method="basic"):
        """Convert audio file to MIDI using different methods.

        Args:
            audio_path: Path to an audio file readable by librosa.
            conversion_method: "basic" (onset + pitch), "melody" (pyin
                melody tracking), or anything else for rhythm-only.

        Returns:
            pretty_midi.PrettyMIDI: the converted MIDI data.

        Raises:
            Exception: wraps any failure during load/conversion.
        """
        try:
            # 22.05 kHz is plenty for pitch tracking and halves the work.
            y, sr = librosa.load(audio_path, sr=22050)

            if conversion_method == "basic":
                return self.basic_audio_to_midi(y, sr)
            elif conversion_method == "melody":
                return self.melody_extraction_to_midi(y, sr)
            else:
                return self.rhythm_based_midi(y, sr)
        except Exception as e:
            # Chain the original exception so the root cause stays visible.
            raise Exception(f"Audio to MIDI conversion failed: {str(e)}") from e

    def basic_audio_to_midi(self, y, sr):
        """Basic onset detection and pitch estimation.

        Detects note onsets, estimates one fundamental frequency per onset
        from a 500 ms window, and emits fixed-length piano notes.
        """
        midi = pretty_midi.PrettyMIDI()
        piano_program = pretty_midi.instrument_name_to_program('Acoustic Grand Piano')
        instrument = pretty_midi.Instrument(program=piano_program)

        # Detect onsets (when notes start).
        onset_frames = librosa.onset.onset_detect(y=y, sr=sr, hop_length=512, backtrack=True)
        onset_times = librosa.frames_to_time(onset_frames, sr=sr, hop_length=512)

        for onset_time in onset_times:
            # Analyze a 500 ms segment starting at the onset.
            start_sample = int(onset_time * sr)
            end_sample = start_sample + int(0.5 * sr)
            if end_sample < len(y):
                segment = y[start_sample:end_sample]
                f0 = self.estimate_pitch(segment, sr)
                if f0 > 0:
                    # Round to the NEAREST semitone; a plain int() cast
                    # truncates and biases every detected pitch flat.
                    midi_note = int(round(69 + 12 * np.log2(f0 / 440.0)))
                    if 0 <= midi_note <= 127:
                        note = pretty_midi.Note(
                            velocity=np.random.randint(60, 100),
                            pitch=midi_note,
                            start=onset_time,
                            end=onset_time + 0.5,  # fixed 500 ms duration
                        )
                        instrument.notes.append(note)

        midi.instruments.append(instrument)
        return midi

    def melody_extraction_to_midi(self, y, sr):
        """Extract melody with pyin and convert it to a monophonic MIDI line."""
        midi = pretty_midi.PrettyMIDI()
        instrument = pretty_midi.Instrument(program=0)  # Piano

        # Frame-wise f0 track; NaN / unvoiced frames end the current note.
        f0, voiced_flag, voiced_probs = librosa.pyin(
            y,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=sr,
        )
        times = librosa.times_like(f0, sr=sr, hop_length=512)

        current_note = None
        note_start = 0
        last_time = 0

        for time, freq, voiced in zip(times, f0, voiced_flag):
            last_time = time
            if voiced and not np.isnan(freq):
                # Round to the nearest semitone (int() alone truncates flat).
                midi_note = int(round(69 + 12 * np.log2(freq / 440.0)))
                if 0 <= midi_note <= 127:
                    if current_note != midi_note:
                        if current_note is not None:
                            # Pitch changed: close the previous note.
                            instrument.notes.append(pretty_midi.Note(
                                velocity=80,
                                pitch=current_note,
                                start=note_start,
                                end=time,
                            ))
                        current_note = midi_note
                        note_start = time
            else:
                if current_note is not None:
                    # Voicing stopped: close the current note.
                    instrument.notes.append(pretty_midi.Note(
                        velocity=80,
                        pitch=current_note,
                        start=note_start,
                        end=time,
                    ))
                    current_note = None

        # Close a note still sounding at the end of the track — the original
        # loop silently dropped it.
        if current_note is not None:
            instrument.notes.append(pretty_midi.Note(
                velocity=80,
                pitch=current_note,
                start=note_start,
                end=last_time,
            ))

        midi.instruments.append(instrument)
        return midi

    def rhythm_based_midi(self, y, sr):
        """Create rhythm-based MIDI: a kick-drum hit on every detected beat."""
        midi = pretty_midi.PrettyMIDI()
        drum_instrument = pretty_midi.Instrument(program=0, is_drum=True)

        # Track beats; the tempo estimate itself is not needed here.
        _, beats = librosa.beat.beat_track(y=y, sr=sr)
        beat_times = librosa.frames_to_time(beats, sr=sr)

        for beat_time in beat_times:
            drum_instrument.notes.append(pretty_midi.Note(
                velocity=100,
                pitch=36,  # General MIDI kick drum
                start=beat_time,
                end=beat_time + 0.1,
            ))

        midi.instruments.append(drum_instrument)
        return midi

    def estimate_pitch(self, segment, sr):
        """Estimate fundamental frequency (Hz) of a segment via autocorrelation.

        Returns 0 when no pitch can be estimated.
        """
        try:
            # Autocorrelation; keep only non-negative lags.
            corr = np.correlate(segment, segment, mode='full')
            corr = corr[len(corr) // 2:]

            # Skip the zero-lag peak: find where the correlation first turns
            # upward, then take the strongest peak after that point.
            d = np.diff(corr)
            start = np.where(d > 0)[0]
            if len(start) > 0:
                start = start[0]
                peak = np.argmax(corr[start:]) + start
                freq = sr / peak if peak > 0 else 0
                return freq
        except Exception:
            # Estimation is best-effort; fall through to the "no pitch" result
            # rather than aborting the whole conversion. (Was a bare except.)
            pass
        return 0

    def humanize_midi(self, midi_data, intensity=0.7, style="organic"):
        """Humanize MIDI in place: jitter note timing, duration and velocity.

        Args:
            midi_data: pretty_midi.PrettyMIDI to modify.
            intensity: 0..1 scale applied to all perturbations.
            style: accepted for API compatibility; not currently used.

        Returns:
            The same (mutated) PrettyMIDI object.
        """
        for instrument in midi_data.instruments:
            inst_type = "drums" if instrument.is_drum else "melody"
            profile = self.groove_profiles[inst_type]

            for note in instrument.notes:
                # Timing: Gaussian jitter, clamped so starts never go negative.
                timing_shift = np.random.normal(0, profile["timing_var"] * intensity)
                note.start = max(0, note.start + timing_shift)

                # Duration: smaller jitter, never for drums, min 100 ms note.
                if not instrument.is_drum:
                    duration_shift = np.random.normal(0, profile["timing_var"] * 0.3 * intensity)
                    note.end = max(note.start + 0.1, note.end + duration_shift)

                # Velocity: uniform shift in [-var, +var]; randint's upper
                # bound is exclusive, so +1 makes +var actually reachable.
                vel_shift = np.random.randint(-profile["velocity_var"],
                                              profile["velocity_var"] + 1)
                new_velocity = note.velocity + int(vel_shift * intensity)
                note.velocity = max(20, min(127, new_velocity))

        return midi_data


def process_audio_files(files, intensity, style, conversion_method):
    """Gradio callback: convert uploads to MIDI, humanize, return results.

    Returns a (file_paths, preview_audio_path, status_message) triple.
    """
    if not files:
        return None, None, "Please upload audio files (MP3, WAV, etc.)"

    converter = MP3ToHumanized()
    processed_files = []
    path = None

    for file in files:
        # gr.File(type="filepath") yields plain path strings; older Gradio
        # versions yield objects exposing .name — support both.
        path = file if isinstance(file, str) else file.name
        try:
            midi_data = converter.audio_to_midi(path, conversion_method)
            humanized_midi = converter.humanize_midi(midi_data, intensity, style)

            # mkstemp instead of the deprecated, race-prone mktemp.
            fd, output_path = tempfile.mkstemp(suffix='_humanized.mid')
            os.close(fd)
            humanized_midi.write(output_path)
            processed_files.append(output_path)
        except Exception as e:
            return None, None, f"Error processing {path}: {str(e)}"

    if processed_files:
        # Best-effort audio preview rendered from the first humanized MIDI.
        preview_audio = None
        try:
            midi_data = pretty_midi.PrettyMIDI(processed_files[0])
            audio_data = midi_data.synthesize()
            fd, preview_path = tempfile.mkstemp(suffix='_preview.wav')
            os.close(fd)
            sf.write(preview_path, audio_data, 44100)
            preview_audio = preview_path
        except Exception:
            # Preview is optional; MIDI downloads still work without it.
            preview_audio = None

        return processed_files, preview_audio, f"✅ Successfully processed {len(processed_files)} files!"
    else:
        return None, None, "❌ No files were processed successfully."


# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), title="MP3 HumanizeBot") as demo:
    gr.Markdown("""
    # đŸŽĩ MP3 HumanizeBot
    **Convert MP3/Audio to MIDI and remove AI traces to sound human-made!**
    Upload audio files from AI music generators, convert to MIDI, and apply natural humanization.
    """)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📁 Upload Audio Files")
            file_input = gr.File(
                file_count="multiple",
                file_types=[".mp3", ".wav", ".ogg", ".m4a", ".flac"],
                label="Upload Audio Files",
                type="filepath"
            )
            conversion_method = gr.Radio(
                ["basic", "melody", "rhythm"],
                value="basic",
                label="đŸŽĩ Conversion Method",
                info="Basic = general purpose, Melody = focus on tunes, Rhythm = focus on beats"
            )
            intensity = gr.Slider(
                0.1, 1.0,
                value=0.7,
                label="đŸŽšī¸ Humanization Intensity"
            )
            style = gr.Radio(
                ["organic", "groovy", "gentle"],
                value="organic",
                label="🎸 Humanization Style"
            )
            process_btn = gr.Button(
                "✨ Convert & Humanize!",
                variant="primary",
                size="lg"
            )

        with gr.Column(scale=1):
            gr.Markdown("### đŸ“Ĩ Download Results")
            file_output = gr.File(
                file_count="multiple",
                label="Download Humanized MIDI Files"
            )
            audio_output = gr.Audio(
                label="MIDI Audio Preview",
                interactive=False
            )
            status = gr.Textbox(
                label="Status",
                interactive=False,
                max_lines=4
            )

    with gr.Accordion("â„šī¸ How It Works", open=False):
        gr.Markdown("""
        **Process:**
        1. **Upload** your AI-generated audio files (MP3, WAV, etc.)
        2. **Convert** to MIDI using pitch and rhythm detection
        3. **Humanize** with timing and velocity variations
        4. **Download** humanized MIDI files

        **Conversion Methods:**
        - **Basic**: General purpose conversion for most music
        - **Melody**: Focuses on extracting melodic content
        - **Rhythm**: Focuses on drum patterns and beats

        **Note**: Audio-to-MIDI conversion is challenging and works best with:
        - Clear melodic lines
        - Good audio quality
        - Not too much reverb/effects
        """)

    process_btn.click(
        fn=process_audio_files,
        inputs=[file_input, intensity, style, conversion_method],
        outputs=[file_output, audio_output, status]
    )

if __name__ == "__main__":
    demo.launch(debug=True)