"""AI Song Humanizer.

Gradio app that post-processes AI-generated songs to reduce telltale
AI-render artifacts and layer in subtle "human" imperfections:
timing drift, pitch fluctuation, room ambiance, analog-style warmth,
and amplitude variation.
"""

import gradio as gr
import numpy as np
import tempfile
import librosa
import soundfile as sf
from scipy import signal
import os


class AIHumanizer:
    """Applies a chain of subtle DSP effects to make audio sound less synthetic."""

    def __init__(self):
        pass

    def humanize_audio(self, audio_path, intensity=0.7):
        """Remove AI artifacts and make audio sound human-made.

        Parameters
        ----------
        audio_path : str
            Path to the input audio file (any format librosa can decode).
        intensity : float
            Effect strength, roughly in [0, 1]; higher = more processing.

        Returns
        -------
        tuple[np.ndarray, int]
            Processed audio as a 2-D ``(channels, samples)`` array plus the
            native sample rate.

        Raises
        ------
        Exception
            Wraps any underlying failure with a "Humanization failed" message.
        """
        try:
            print(f"Loading audio from: {audio_path}")
            # Load the full song at its native sample rate; keep stereo if present.
            y, sr = librosa.load(audio_path, sr=None, mono=False)
            # NOTE: use the last axis for duration — len(y) on a stereo
            # (channels, samples) array is the channel count, not samples.
            print(f"Audio loaded: shape={y.shape}, sr={sr}, duration={y.shape[-1]/sr:.2f}s")

            if len(y.shape) > 1:
                # Stereo (or multi-channel): process each channel independently.
                print("Processing stereo audio...")
                processed_channels = []
                for i, channel in enumerate(y):
                    print(f"Processing channel {i+1}...")
                    processed_channel = self.process_channel(channel, sr, intensity)
                    processed_channels.append(processed_channel)
                y_processed = np.array(processed_channels)
            else:
                print("Processing mono audio...")
                y_processed = self.process_channel(y, sr, intensity)
                y_processed = np.array([y_processed])  # Make it 2D for consistency

            print("Audio processing completed successfully")
            return y_processed, sr
        except Exception as e:
            print(f"Error in humanize_audio: {str(e)}")
            raise Exception(f"Humanization failed: {str(e)}")

    def process_channel(self, y, sr, intensity):
        """Run a single audio channel through the full humanization chain.

        The six stages are applied in a fixed order: artifact filtering,
        timing variation, pitch variation, room ambiance, analog warmth,
        and amplitude de-quantization.
        """
        print(f"Processing channel: {len(y)} samples, intensity={intensity}")
        # 1. Reduce robotic frequencies
        y_processed = self.reduce_ai_artifacts(y, sr, intensity)
        # 2. Add timing variations
        y_processed = self.add_timing_variations(y_processed, sr, intensity)
        # 3. Add pitch variations
        y_processed = self.add_pitch_variations(y_processed, sr, intensity)
        # 4. Add room ambiance
        y_processed = self.add_room_ambiance(y_processed, sr, intensity)
        # 5. Add analog warmth
        y_processed = self.add_analog_warmth(y_processed, sr, intensity)
        # 6. Reduce perfect quantization
        y_processed = self.reduce_perfect_quantization(y_processed, sr, intensity)
        return y_processed

    def reduce_ai_artifacts(self, y, sr, intensity):
        """Attenuate harsh 2-6 kHz content (a common AI-render artifact).

        Applies a 1900-6100 Hz band-stop filter and blends up to 30% of the
        filtered signal back in, scaled by ``intensity``.
        """
        # BUGFIX: butter() raises ValueError unless the upper band edge
        # (6100 Hz) is below Nyquist, so sr must exceed 2 * 6100 — the
        # previous `sr > 4000` guard let sample rates in (4000, 12200]
        # crash the filter design.
        if sr > 2 * 6100:
            sos = signal.butter(4, [1900, 6100], 'bandstop', fs=sr, output='sos')
            y_filtered = signal.sosfilt(sos, y)
            # Blend with original based on intensity
            y_processed = y * (1 - intensity * 0.3) + y_filtered * (intensity * 0.3)
            return y_processed
        return y

    def add_timing_variations(self, y, sr, intensity):
        """Add subtle random speed variations over 2-second segments.

        Each segment is resampled by a tiny random factor, then padded or
        trimmed back to its original length so the total duration is kept.
        """
        if intensity < 0.1:
            return y
        segment_size = int(sr * 2.0)  # 2-second segments
        segments = []
        for i in range(0, len(y), segment_size):
            segment = y[i:i + segment_size]
            if len(segment) > 100:  # Only process if segment is long enough
                # Small speed variation (std dev ~0.4% at full intensity)
                speed_var = 1.0 + np.random.normal(0, 0.004 * intensity)
                new_length = int(len(segment) / speed_var)
                if new_length > 0 and len(segment) > 0:
                    # Simple linear-interp resampling for the timing wobble
                    original_indices = np.arange(len(segment))
                    new_indices = np.linspace(0, len(segment) - 1, new_length)
                    segment_varied = np.interp(new_indices, original_indices, segment)
                    # Restore the original segment length so boundaries line up
                    if len(segment_varied) != len(segment):
                        if len(segment_varied) > len(segment):
                            segment_varied = segment_varied[:len(segment)]
                        else:
                            segment_varied = np.pad(
                                segment_varied,
                                (0, len(segment) - len(segment_varied)))
                    segments.append(segment_varied)
                else:
                    segments.append(segment)
            else:
                segments.append(segment)
        if segments:
            return np.concatenate(segments)
        return y

    def add_pitch_variations(self, y, sr, intensity):
        """Blend in a slightly pitch-shifted copy for natural pitch drift.

        Best-effort: returns the input unchanged if pitch shifting fails.
        """
        if intensity < 0.2:
            return y
        try:
            # Random shift of a few cents (std dev 0.1 semitone at full intensity)
            n_steps = np.random.normal(0, 0.1 * intensity)
            y_shifted = librosa.effects.pitch_shift(
                y, sr=sr, n_steps=n_steps, bins_per_octave=24)
            # Blend with original
            blend_factor = 0.15 * intensity
            return y * (1 - blend_factor) + y_shifted * blend_factor
        except Exception:
            # Deliberate best-effort fallback: skip the stage on failure.
            return y

    def add_room_ambiance(self, y, sr, intensity):
        """Convolve with a synthetic 200 ms impulse response for room feel.

        The impulse has one early reflection at ~10 ms plus an exponentially
        decaying tail; the reverberant signal is blended in at up to 8%.
        """
        if intensity < 0.1:
            return y
        impulse_length = int(0.2 * sr)  # 200ms reverb
        if impulse_length < 10:
            return y
        impulse = np.zeros(impulse_length)
        # Early reflections
        early_reflections = int(0.01 * sr)  # 10ms
        if early_reflections < len(impulse):
            impulse[early_reflections] = 0.6
        # Late reverb tail
        reverb_start = min(early_reflections + 1, len(impulse))
        if reverb_start < len(impulse):
            tail_length = len(impulse) - reverb_start
            decay = np.exp(-np.linspace(0, 8, tail_length))
            impulse[reverb_start:] = decay * 0.3
        # Normalize impulse
        if np.max(np.abs(impulse)) > 0:
            impulse = impulse / np.max(np.abs(impulse))
        # Apply convolution (best-effort: skip stage on failure)
        try:
            y_reverb = signal.convolve(y, impulse, mode='same')
            # Rescale the wet signal to the dry peak to prevent clipping
            if np.max(np.abs(y_reverb)) > 0:
                y_reverb = y_reverb / np.max(np.abs(y_reverb)) * np.max(np.abs(y))
            # Blend with original
            blend_factor = 0.08 * intensity
            return y * (1 - blend_factor) + y_reverb * blend_factor
        except Exception:
            return y

    def add_analog_warmth(self, y, sr, intensity):
        """Add analog-style warmth via soft saturation and gentle EQ."""
        # Soft clipping saturation (tanh keeps peaks bounded)
        saturation_amount = 1.0 + 0.3 * intensity
        y_saturated = np.tanh(y * saturation_amount) / saturation_amount
        try:
            # 80 Hz highpass on the saturated copy to clear sub-bass rumble
            # before blending. (NOTE: this is a highpass — it removes content
            # below 80 Hz; it is not a low-end boost.)
            sos = signal.butter(2, 80, 'highpass', fs=sr, output='sos')
            y_warm = signal.sosfilt(sos, y_saturated)
            # Blend
            blend_factor = 0.1 * intensity
            return y * (1 - blend_factor) + y_warm * blend_factor
        except Exception:
            # Fall back to the plain saturated signal if filter design fails
            # (e.g. sr too low for the 80 Hz corner).
            return y_saturated

    def reduce_perfect_quantization(self, y, sr, intensity):
        """Break up perfectly uniform loudness with amplitude variations.

        Combines a slow sinusoidal LFO (0.3-0.7 Hz) with per-sample random
        micro-variations.
        """
        t = np.linspace(0, len(y) / sr, len(y))
        # Low-frequency amplitude modulation
        lfo_rate = 0.3 + 0.4 * intensity  # Hz
        lfo_depth = 0.03 * intensity
        amplitude_variation = 1.0 + np.sin(2 * np.pi * lfo_rate * t) * lfo_depth
        # Random micro-variations
        random_variation = 1.0 + np.random.normal(0, 0.01 * intensity, len(y))
        # Combine variations
        total_variation = amplitude_variation * random_variation
        return y * total_variation


def humanize_song(input_mp3, intensity):
    """Gradio callback: humanize an uploaded song and return (path, status).

    Returns ``(None, error_message)`` on any failure so the UI shows the
    problem instead of crashing.
    """
    if input_mp3 is None:
        return None, "Please upload an audio file"
    humanizer = AIHumanizer()
    try:
        print("Starting humanization process...")
        # Process the entire song to remove AI artifacts
        audio_data, sr = humanizer.humanize_audio(input_mp3, intensity)
        print(f"Humanization complete. Saving audio: shape={audio_data.shape}, sr={sr}")
        # Save as WAV (more reliable than MP3). mkstemp replaces the
        # deprecated, race-prone tempfile.mktemp.
        fd, output_path = tempfile.mkstemp(suffix='_humanized.wav')
        os.close(fd)  # soundfile reopens the path itself
        # soundfile expects (samples, channels), so transpose our (channels, samples)
        if len(audio_data.shape) > 1:
            audio_data = audio_data.T
        sf.write(output_path, audio_data, sr)
        print(f"Audio saved successfully to: {output_path}")
        return output_path, "✅ Song humanized! AI artifacts removed and human feel added."
    except Exception as e:
        error_msg = f"❌ Error: {str(e)}"
        print(error_msg)
        return None, error_msg


# Simple and reliable interface
with gr.Blocks(theme=gr.themes.Soft(), title="AI Song Humanizer") as demo:
    gr.Markdown("""
    # 🎵 AI Song Humanizer
    **Remove AI Detection - Make Your Songs Sound Human-Made**

    *Upload your AI-generated song → Remove robotic artifacts → Download natural-sounding version*
    """)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 1. Upload AI Song")
            input_audio = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Upload your complete AI-generated song",
                editable=True
            )
            gr.Markdown("### 2. Humanization Strength")
            intensity = gr.Slider(
                0.1, 1.0,
                value=0.7,
                label="How much human feel to add",
                info="Lower = subtle, Higher = more natural/organic"
            )
            process_btn = gr.Button(
                "🎹 Humanize This Song",
                variant="primary",
                size="lg"
            )
        with gr.Column(scale=1):
            gr.Markdown("### 3. Download Result")
            output_audio = gr.Audio(
                label="Your Human-Sounding Song",
                type="filepath",
                interactive=False
            )
            status = gr.Textbox(
                label="Status",
                interactive=False,
                max_lines=3
            )

    with gr.Accordion("💡 How It Works", open=True):
        gr.Markdown("""
        **This tool processes your EXISTING song to remove AI characteristics:**

        ✅ **Keeps Everything Original:**
        - Your complete song structure
        - All vocals and instruments
        - Melody and arrangement
        - Everything you created

        🎛️ **Removes AI Artifacts:**
        - Robotic/metallic frequencies
        - Perfect digital quantization
        - Sterile, artificial sound
        - AI-generated frequency patterns

        🎵 **Adds Human Elements:**
        - Natural timing variations
        - Subtle pitch fluctuations
        - Room ambiance and warmth
        - Analog-style character

        **Result:** Your same song, but it sounds like humans performed it!
        """)

    # Processing function
    process_btn.click(
        fn=humanize_song,
        inputs=[input_audio, intensity],
        outputs=[output_audio, status]
    )

if __name__ == "__main__":
    demo.launch(debug=True)