"""Insert or replace sections of an audio file with crossfaded transitions."""

import os
import tempfile
from typing import Optional, Union

import librosa
import numpy as np
import soundfile as sf


def _load_audio(audio_path: str, mono: bool = False) -> tuple[np.ndarray, int]:
    """Load an audio file at its native sample rate.

    Args:
        audio_path: Path of the file to read.
        mono: When True, ask librosa to downmix to a single channel.

    Returns:
        Tuple of (samples, sample_rate). ``samples`` is 1-D for single-channel
        audio and shaped (frames, channels) otherwise.
    """
    y, sr = librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")

    # librosa lays multichannel audio out as (channels, frames); the rest of
    # this module (and soundfile) works with (frames, channels). Transpose for
    # any channel count, not just stereo.
    if y.ndim > 1:
        y = y.T

    return y, int(sr)


def _seconds_to_samples(seconds: float, sample_rate: int) -> int:
    """Convert a time in seconds to the nearest sample index."""
    # round() rather than int(): avoids losing a sample to float error
    # (e.g. 0.3 * 44100 == 13229.999...).
    return int(round(seconds * sample_rate))


def _fade_length(crossfade_duration: float, sample_rate: int) -> int:
    """Return the crossfade length in samples, rejecting negative durations."""
    if crossfade_duration < 0:
        raise ValueError("Crossfade duration must be non-negative")
    return _seconds_to_samples(crossfade_duration, sample_rate)


def _validate_insert_time(insert_time: float, main_duration: float) -> None:
    """Raise ValueError unless 0 <= insert_time <= main_duration."""
    if insert_time < 0:
        raise ValueError("Insert time must be non-negative")
    if insert_time > main_duration:
        raise ValueError(
            f"Insert time ({insert_time}s) exceeds main audio duration ({main_duration}s)"
        )


def _resample(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Resample 1-D or (frames, channels) audio from orig_sr to target_sr."""
    if orig_sr == target_sr:
        return audio
    if audio.ndim == 1:
        return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)

    # Resample each channel separately, then restore (frames, channels).
    channels = [
        librosa.resample(audio[:, ch], orig_sr=orig_sr, target_sr=target_sr)
        for ch in range(audio.shape[1])
    ]
    return np.stack(channels, axis=1)


def _match_channels(audio: np.ndarray, reference: np.ndarray) -> np.ndarray:
    """Give ``audio`` the same channel layout as ``reference``.

    Multichannel audio is averaged down when the reference is mono; mono audio
    is duplicated across channels when the reference is multichannel. Without
    this, joining a mono clip with a stereo track fails on a shape mismatch.
    """
    if reference.ndim == 1:
        return audio if audio.ndim == 1 else audio.mean(axis=1)

    wanted = reference.shape[1]
    if audio.ndim == 1:
        return np.repeat(audio[:, np.newaxis], wanted, axis=1)
    if audio.shape[1] == wanted:
        return audio

    # Differing multichannel layouts: fold to mono, then fan out.
    return np.repeat(audio.mean(axis=1, keepdims=True), wanted, axis=1)


def _join_with_crossfades(parts: list[np.ndarray], fade_samples: int) -> np.ndarray:
    """Join audio parts end to end, crossfading at every junction.

    At each junction the tail of the left part fades out linearly while the
    head of the right part fades in, and the two are summed, so every junction
    shortens the result by the number of overlapped samples. The overlap is
    limited so that no sample takes part in two crossfades: a part that has a
    neighbour on both sides contributes at most half of its length to each.

    Args:
        parts: Arrays with a common channel layout (all 1-D, or all
            (frames, channels) with equal channel count). Empty parts are
            skipped. Must contain at least one array.
        fade_samples: Requested overlap per junction, in samples.

    Returns:
        A new array; the inputs are never modified.
    """
    pieces = [p for p in parts if len(p) > 0]
    if not pieces:
        return parts[0][:0].copy()

    # Work in a floating dtype so the fade ramps are not truncated.
    dtype = np.result_type(np.float32, *pieces)
    pieces = [p.astype(dtype, copy=False) for p in pieces]

    last = len(pieces) - 1
    room = [
        len(p) if idx in (0, last) else len(p) // 2 for idx, p in enumerate(pieces)
    ]

    chunks = []
    carry = pieces[0]
    for idx, nxt in enumerate(pieces[1:], start=1):
        overlap = min(fade_samples, room[idx - 1], room[idx])
        if overlap <= 0:
            chunks.append(carry)
            carry = nxt
            continue

        ramp = np.linspace(0.0, 1.0, overlap, dtype=dtype)
        if carry.ndim > 1:
            ramp = ramp[:, np.newaxis]  # broadcast across channels
        blended = carry[-overlap:] * (1.0 - ramp) + nxt[:overlap] * ramp

        chunks.append(carry[:-overlap])
        chunks.append(blended)
        carry = nxt[overlap:]

    chunks.append(carry)
    return np.concatenate(chunks, axis=0)


def _write_output(
    audio: np.ndarray,
    sample_rate: int,
    audio_path: str,
    output_path: Optional[str],
    temp_suffix: str,
    name_suffix: str,
    output_format: str,
) -> str:
    """Write ``audio`` next to a name derived from ``audio_path``; return the path.

    When ``output_path`` is None a fresh temporary directory (ending in
    ``temp_suffix``) is created; otherwise ``output_path`` is created if needed.
    """
    if output_path is None:
        output_path = tempfile.mkdtemp(suffix=temp_suffix)
    else:
        os.makedirs(output_path, exist_ok=True)

    main_filename = os.path.splitext(os.path.basename(audio_path))[0]
    output_file = os.path.join(
        output_path, f"{main_filename}{name_suffix}.{output_format}"
    )
    sf.write(output_file, audio, sample_rate)
    return output_file


def detect_crossfade_point(
    insert_position: float, audio_duration: float, crossfade_duration: float = 0.1
) -> tuple[float, float]:
    """
    Calculate the crossfade window centred on an insertion point.

    Args:
        insert_position: Where to insert the section (in seconds)
        audio_duration: Total duration of the target audio (in seconds)
        crossfade_duration: Length of crossfade (in seconds)

    Returns:
        Tuple of (start_time, end_time) for the crossfade region, clamped to
        the range [0, audio_duration]
    """
    half = crossfade_duration / 2
    fade_start = max(0.0, insert_position - half)
    fade_end = min(audio_duration, insert_position + half)
    return float(fade_start), float(fade_end)


def apply_crossfade(
    section: np.ndarray, target: np.ndarray, crossfade_duration: float, sample_rate: int
) -> np.ndarray:
    """
    Join ``section`` and ``target`` with a crossfade between them.

    The whole section is kept. Its tail fades out while the head of the target
    fades in over the crossfade, so the result is shorter than the two inputs
    combined by the overlapped samples. The overlap is capped at the length of
    the shorter input. Neither input array is modified.

    Args:
        section: Audio section that comes first
        target: Audio that follows the section
        crossfade_duration: Length of crossfade in seconds
        sample_rate: Sample rate of audio

    Returns:
        New array containing the section followed by the target

    Raises:
        ValueError: If crossfade_duration is negative
    """
    fade_samples = _fade_length(crossfade_duration, sample_rate)
    section = _match_channels(section, target)
    return _join_with_crossfades([section, target], fade_samples)


def insert_section(
    audio_path: str,
    section_path: str,
    insert_time: float,
    crossfade_duration: float = 0.1,
    output_path: Optional[str] = None,
    output_format: str = "wav",
) -> str:
    """
    Insert a section from one audio track into another at a given time position.

    The main audio is split at ``insert_time``; the section is placed between
    the two halves and crossfaded with each of them.

    Args:
        audio_path: Path to the main audio file
        section_path: Path to the audio section to insert
        insert_time: Position to insert the section (in seconds from start of main audio)
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_path: Optional output directory (default: None, uses temp directory)
        output_format: Output format for the final audio ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to the audio file with the section inserted
        (``<main name>_with_insertion.<output_format>``)

    Raises:
        RuntimeError: On any failure (bad insert time, unreadable file, ...);
            the original exception is attached as ``__cause__``.

    Examples:
        >>> insert_section("main_track.wav", "intro.wav", 5.0, 0.2, "output", "wav")
        # Returns 'output/main_track_with_insertion.wav' with intro inserted at 5 seconds

    Note:
        - Insert time must satisfy 0 <= insert_time <= main audio duration
        - The section is resampled to the main audio's sample rate and
          converted to its channel layout
        - Each crossfade overlaps the two sides, so the output is shorter than
          main + section by up to two crossfade lengths
    """
    try:
        main_audio, main_sr = _load_audio(audio_path, mono=False)
        section_audio, section_sr = _load_audio(section_path, mono=False)

        main_duration = len(main_audio) / main_sr
        _validate_insert_time(insert_time, main_duration)
        fade_samples = _fade_length(crossfade_duration, main_sr)

        section_audio = _match_channels(
            _resample(section_audio, section_sr, main_sr), main_audio
        )

        split = min(_seconds_to_samples(insert_time, main_sr), len(main_audio))
        final_audio = _join_with_crossfades(
            [main_audio[:split], section_audio, main_audio[split:]], fade_samples
        )

        return _write_output(
            final_audio,
            main_sr,
            audio_path,
            output_path,
            "_inserted",
            "_with_insertion",
            output_format,
        )

    except Exception as e:
        raise RuntimeError(f"Error inserting audio section: {e}") from e


def insert_multiple_sections(
    audio_path: str,
    sections: list[Union[tuple[str, float], tuple[str, float, float]]],
    crossfade_duration: float = 0.1,
    output_path: Optional[str] = None,
    output_format: str = "wav",
) -> str:
    """
    Insert multiple sections into an audio track at specified positions.

    All insert times refer to positions in the ORIGINAL main audio; earlier
    insertions do not shift the position of later ones.

    Args:
        audio_path: Path to the main audio file
        sections: List of (section_path, insert_time) tuples
            section_path: Path to audio section to insert
            insert_time: Position to insert section (in seconds)
            A third tuple element is accepted for backward compatibility and
            ignored.
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_path: Optional output directory (default: None, uses temp directory)
        output_format: Output format for the final audio ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to the audio file with all sections inserted
        (``<main name>_with_multiple_insertions.<output_format>``)

    Raises:
        RuntimeError: On any failure (bad insert time, unreadable file, ...);
            the original exception is attached as ``__cause__``.

    Examples:
        >>> insert_multiple_sections("track.wav", [("intro.wav", 0), ("ad1.wav", 30), ("ad2.wav", 180)], 0.2)
        # Returns a path ending in 'track_with_multiple_insertions.wav'

    Note:
        - Sections are inserted in chronological order
        - Sections sharing an insert time are placed back to back, in the
          order given
        - Each section is crossfaded with its neighbours at both ends
    """
    try:
        main_audio, main_sr = _load_audio(audio_path, mono=False)
        main_duration = len(main_audio) / main_sr
        fade_samples = _fade_length(crossfade_duration, main_sr)

        # sorted() is stable, so equal insert times keep the caller's order.
        ordered = sorted(sections, key=lambda entry: entry[1])

        # Interleave slices of the untouched main audio with the sections.
        parts = []
        cursor = 0
        for section_path, insert_time, *_ in ordered:
            _validate_insert_time(insert_time, main_duration)

            section_audio, section_sr = _load_audio(section_path, mono=False)
            section_audio = _match_channels(
                _resample(section_audio, section_sr, main_sr), main_audio
            )

            cut = min(_seconds_to_samples(insert_time, main_sr), len(main_audio))
            parts.append(main_audio[cursor:cut])
            parts.append(section_audio)
            cursor = cut
        parts.append(main_audio[cursor:])

        current_audio = _join_with_crossfades(parts, fade_samples)

        return _write_output(
            current_audio,
            main_sr,
            audio_path,
            output_path,
            "_multi_inserted",
            "_with_multiple_insertions",
            output_format,
        )

    except Exception as e:
        raise RuntimeError(f"Error inserting multiple sections: {e}") from e


def replace_section(
    audio_path: str,
    start_time: float,
    end_time: float,
    replacement_path: str,
    crossfade_duration: float = 0.1,
    output_path: Optional[str] = None,
    output_format: str = "wav",
) -> str:
    """
    Replace a section of an audio track with another audio segment.

    The range [start_time, end_time) of the main audio is replaced. The
    crossfades are taken from inside that range, so audio outside it keeps its
    original timing whenever the replacement is at least as long as the range.

    Args:
        audio_path: Path to the main audio file
        start_time: Start time of section to replace (in seconds)
        end_time: End time of section to replace (in seconds)
        replacement_path: Path to the replacement audio segment
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_path: Optional output directory (default: None, uses temp directory)
        output_format: Output format for the final audio ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to the audio file with the section replaced
        (``<main name>_replaced.<output_format>``)

    Raises:
        RuntimeError: On any failure (bad times, unreadable file, ...);
            the original exception is attached as ``__cause__``.

    Examples:
        >>> replace_section("song.wav", 60.0, 90.0, "new_verse.wav", 0.2, "output", "wav")
        # Returns 'output/song_replaced.wav' with 60-90s section replaced

    Note:
        - Start time must be non-negative, less than end time, and within the
          main audio
        - Replacement is trimmed if longer than end_time - start_time; a
          shorter replacement is not padded, so the output gets shorter
        - The replacement is resampled to the main audio's sample rate and
          converted to its channel layout
    """
    try:
        if start_time >= end_time:
            raise ValueError("Start time must be less than end time")
        if start_time < 0:
            # A negative sample index would silently slice from the end.
            raise ValueError("Start time must be non-negative")

        main_audio, main_sr = _load_audio(audio_path, mono=False)
        replacement_audio, replacement_sr = _load_audio(replacement_path, mono=False)

        main_duration = len(main_audio) / main_sr
        if start_time > main_duration:
            raise ValueError(
                f"Start time ({start_time}s) exceeds main audio duration ({main_duration}s)"
            )
        fade_samples = _fade_length(crossfade_duration, main_sr)

        start_sample = _seconds_to_samples(start_time, main_sr)
        end_sample = _seconds_to_samples(end_time, main_sr)

        replacement_audio = _match_channels(
            _resample(replacement_audio, replacement_sr, main_sr), main_audio
        )
        trimmed_replacement = replacement_audio[: end_sample - start_sample]

        # Keep `overlap` extra samples of main audio on each side of the cut
        # so there is material to blend with the replacement's head and tail.
        # Capping at half the replacement keeps the two fades from colliding.
        overlap = min(fade_samples, len(trimmed_replacement) // 2)
        main_before = main_audio[: start_sample + overlap]
        main_after = main_audio[end_sample - overlap :]

        final_audio = _join_with_crossfades(
            [main_before, trimmed_replacement, main_after], overlap
        )

        return _write_output(
            final_audio,
            main_sr,
            audio_path,
            output_path,
            "_replaced",
            "_replaced",
            output_format,
        )

    except Exception as e:
        raise RuntimeError(f"Error replacing audio section: {e}") from e


def insert_section_wrapper(
    audio_path: str,
    insert_path: str,
    insert_time: float,
    crossfade_duration: float = 0.1,
    output_format: str = "wav",
) -> str:
    """
    Wrapper around insert_section that reports failures as a string, for MCP integration.

    Args:
        audio_path: Path to the main audio file
        insert_path: Path to the audio section to insert
        insert_time: Time to insert the section (in seconds)
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_format: Output format ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to output file (in a new temp directory), or a message starting
        with "Error: " when the insertion failed
    """
    try:
        return insert_section(
            audio_path=audio_path,
            section_path=insert_path,
            insert_time=insert_time,
            crossfade_duration=crossfade_duration,
            output_path=None,
            output_format=output_format,
        )
    except Exception as e:
        return f"Error: {e}"


def replace_section_wrapper(
    audio_path: str,
    start_time: float,
    end_time: float,
    replacement_path: str,
    crossfade_duration: float = 0.1,
    output_format: str = "wav",
) -> str:
    """
    Wrapper around replace_section that reports failures as a string, for MCP integration.

    Args:
        audio_path: Path to the main audio file
        start_time: Start time of section to replace (in seconds)
        end_time: End time of section to replace (in seconds)
        replacement_path: Path to the replacement audio segment
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_format: Output format ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to output file (in a new temp directory), or a message starting
        with "Error: " when the replacement failed
    """
    try:
        return replace_section(
            audio_path=audio_path,
            start_time=start_time,
            end_time=end_time,
            replacement_path=replacement_path,
            crossfade_duration=crossfade_duration,
            output_path=None,
            output_format=output_format,
        )
    except Exception as e:
        return f"Error: {e}"


def _main(argv: Optional[list[str]] = None) -> int:
    """Command-line entry point; returns the process exit code.

    Usage:
        python tools/audio_insertion.py insert main.wav insert.wav 30.0
        python tools/audio_insertion.py replace main.wav 10.0 20.0 replacement.wav
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Insert or replace audio sections",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Insert section at 30 seconds
  python tools/audio_insertion.py insert main.wav insert.wav 30.0

  # Replace section from 10s to 20s
  python tools/audio_insertion.py replace main.wav 10.0 20.0 replacement.wav

  # With custom crossfade
  python tools/audio_insertion.py insert main.wav insert.wav 30.0 --crossfade 0.2
        """,
    )

    subparsers = parser.add_subparsers(dest="command", help="Command to run")

    # Insert command
    insert_parser = subparsers.add_parser("insert", help="Insert audio section")
    insert_parser.add_argument("main", help="Main audio file")
    insert_parser.add_argument("insert", help="Audio section to insert")
    insert_parser.add_argument("time", type=float, help="Insert time in seconds")

    # Replace command
    replace_parser = subparsers.add_parser("replace", help="Replace audio section")
    replace_parser.add_argument("main", help="Main audio file")
    replace_parser.add_argument("start", type=float, help="Start time in seconds")
    replace_parser.add_argument("end", type=float, help="End time in seconds")
    replace_parser.add_argument("replacement", help="Replacement audio section")

    # Options shared by both commands
    for sub in (insert_parser, replace_parser):
        sub.add_argument(
            "--crossfade",
            type=float,
            default=0.1,
            help="Crossfade duration in seconds (default: 0.1)",
        )
        sub.add_argument(
            "--format",
            choices=["wav", "mp3"],
            default="wav",
            help="Output format (default: wav)",
        )

    args = parser.parse_args(argv)

    if not args.command:
        parser.print_help()
        return 1

    print("Audio Insertion Tool")
    print("=" * 25)

    try:
        if args.command == "insert":
            print(f"Main audio: {args.main}")
            print(f"Insert section: {args.insert}")
            print(f"Insert time: {args.time}s")
            print(f"Crossfade: {args.crossfade}s")
            print()

            result = insert_section_wrapper(
                audio_path=args.main,
                insert_path=args.insert,
                insert_time=args.time,
                crossfade_duration=args.crossfade,
                output_format=args.format,
            )
        else:  # "replace" - argparse rejects any other command
            print(f"Main audio: {args.main}")
            print(f"Replace section: {args.start}s - {args.end}s")
            print(f"Replacement: {args.replacement}")
            print(f"Crossfade: {args.crossfade}s")
            print()

            result = replace_section_wrapper(
                audio_path=args.main,
                start_time=args.start,
                end_time=args.end,
                replacement_path=args.replacement,
                crossfade_duration=args.crossfade,
                output_format=args.format,
            )
    except Exception as e:
        print(f"❌ Error: {e}")
        return 1

    # The wrappers report failures as strings rather than raising.
    if result.startswith("Error:"):
        print(f"❌ {result}")
        return 1

    # Spelled out per command: appending "ion" produced "replaceion".
    action = {"insert": "insertion", "replace": "replacement"}[args.command]
    print(f"✅ Audio {action} completed!")
    print(f"Output saved to: {result}")
    return 0


if __name__ == "__main__":
    import sys

    sys.exit(_main())