Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| import os | |
| import tempfile | |
| from typing import Optional | |
| import librosa | |
| import numpy as np | |
| import soundfile as sf | |
def _load_audio(audio_path: str, mono: bool = False) -> tuple[np.ndarray, int]:
    """Load an audio file at its native sample rate.

    Args:
        audio_path: Path of the audio file to read.
        mono: When True, ask librosa to downmix to a single channel.

    Returns:
        Tuple of (samples, sample_rate). Multichannel audio is returned with
        shape (samples, channels) so that ``len(samples)`` is the number of
        frames; single-channel audio stays one-dimensional.
    """
    # sr=None keeps the file's own sample rate instead of resampling on load.
    y, sr = librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")
    # librosa returns multichannel audio channel-first; flip every
    # multichannel layout (not only stereo) to (samples, channels).
    if y.ndim > 1:
        y = y.T
    return y, int(sr)
def detect_crossfade_point(
    insert_position: float, audio_duration: float, crossfade_duration: float = 0.1
) -> tuple[float, float]:
    """
    Calculate the crossfade region centred on an insert position.

    The region spans half of ``crossfade_duration`` on each side of
    ``insert_position`` and is clamped to ``[0, audio_duration]``.

    Args:
        insert_position: Where to insert the section (in seconds)
        audio_duration: Total duration of the target audio (in seconds)
        crossfade_duration: Length of crossfade (in seconds); negative values
            are treated as 0

    Returns:
        Tuple of (start_time, end_time) for crossfade region, both floats
        with ``0 <= start_time <= end_time <= audio_duration``
    """
    # A negative crossfade would put the start after the end, so floor it at 0.
    half_width = max(0.0, float(crossfade_duration)) / 2.0
    upper_bound = max(0.0, float(audio_duration))

    def _clamp(moment: float) -> float:
        # Clamp both edges on both sides so an out-of-range insert position
        # cannot produce a negative or inverted region.
        return min(max(0.0, moment), upper_bound)

    fade_start = _clamp(insert_position - half_width)
    fade_end = _clamp(insert_position + half_width)
    return fade_start, fade_end
def apply_crossfade(
    section: np.ndarray, target: np.ndarray, crossfade_duration: float, sample_rate: int
) -> np.ndarray:
    """
    Join a section onto the front of target audio with a linear crossfade.

    The whole section is kept. Its head is faded in, and its tail is
    overlapped with the head of ``target`` and cross-mixed with complementary
    linear gains, so the result is ``len(section) + len(target) - fade``
    samples long. The inputs are never modified.

    Args:
        section: Audio section to insert, shape (samples,) or (samples, channels)
        target: Target audio that follows the section, same layout convention
        crossfade_duration: Length of crossfade in seconds (values <= 0 disable
            fading and produce a plain concatenation)
        sample_rate: Sample rate of audio

    Returns:
        Section followed by target, in the target's channel layout

    Raises:
        ValueError: If both inputs are multichannel with different channel counts
    """
    section = np.asarray(section)
    target = np.asarray(target)

    # Work on floating-point copies: gains are fractional and the caller's
    # arrays must stay untouched.
    dtype = np.result_type(section.dtype, target.dtype)
    if not np.issubdtype(dtype, np.floating):
        dtype = np.float64
    section = section.astype(dtype, copy=True)
    target = target.astype(dtype, copy=True)

    # Conform the section to the target's channel layout so the pieces can be
    # mixed and concatenated.
    if section.ndim != target.ndim:
        if target.ndim > 1:
            section = np.repeat(section[:, np.newaxis], target.shape[1], axis=1)
        else:
            section = section.mean(axis=1)
    elif section.ndim > 1 and section.shape[1] != target.shape[1]:
        raise ValueError(
            f"Channel count mismatch: section has {section.shape[1]}, "
            f"target has {target.shape[1]}"
        )

    # The fade can never be longer than either piece it is applied to.
    fade_samples = max(0, int(crossfade_duration * sample_rate))
    fade_samples = min(fade_samples, len(section), len(target))
    if fade_samples == 0:
        return np.concatenate([section, target], axis=0)

    fade_in = np.linspace(0.0, 1.0, fade_samples, dtype=dtype)
    fade_out = 1.0 - fade_in  # complementary gains always sum to 1
    if section.ndim > 1:
        # Column vectors broadcast the envelope across all channels.
        fade_in = fade_in[:, np.newaxis]
        fade_out = fade_out[:, np.newaxis]

    # Soften the seam with whatever precedes the section.
    section[:fade_samples] *= fade_in

    # Cross-mix the section tail with the target head.
    overlap = section[-fade_samples:] * fade_out + target[:fade_samples] * fade_in

    return np.concatenate(
        [section[:-fade_samples], overlap, target[fade_samples:]], axis=0
    )
def insert_section(
    audio_path: str,
    section_path: str,
    insert_time: float,
    crossfade_duration: float = 0.1,
    output_path: Optional[str] = None,
    output_format: str = "wav",
) -> str:
    """
    Insert a section from one audio track into another at a given time position.

    The main audio is split around ``insert_time``, the section is resampled to
    the main audio's sample rate when the rates differ, and the pieces are
    joined through ``apply_crossfade``.

    Args:
        audio_path: Path to the main audio file
        section_path: Path to the audio section to insert
        insert_time: Position to insert the section (in seconds from start of
            main audio); must lie within ``[0, main duration]``
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_path: Optional output directory (default: None, uses a fresh
            temp directory); created if it does not exist
        output_format: File extension used for the output file (default: 'wav')

    Returns:
        Path to the written file, named
        ``<main file stem>_with_insertion.<output_format>``

    Raises:
        RuntimeError: On any failure (invalid arguments, unreadable input,
            write error). The original exception is chained as ``__cause__``.

    Examples:
        >>> insert_section("main_track.wav", "intro.wav", 5.0, 0.2, "output", "wav")
        # Returns 'output/main_track_with_insertion.wav'

    Note:
        - Insert position is measured from the start of the main audio
        - The main audio between the two crossfade boundaries returned by
          ``detect_crossfade_point`` is not carried into the output
        - The output is written at the main audio's sample rate
        - NOTE(review): soundfile chooses the container from the file
          extension; whether 'mp3' works depends on the installed libsndfile
    """
    try:
        # Reject bad arguments before doing any expensive decoding.
        if insert_time < 0:
            raise ValueError("Insert time must be non-negative")
        if crossfade_duration < 0:
            raise ValueError("Crossfade duration must be non-negative")

        main_audio, main_sr = _load_audio(audio_path, mono=False)
        main_duration = len(main_audio) / main_sr
        if insert_time > main_duration:
            raise ValueError(
                f"Insert time ({insert_time}s) exceeds main audio duration ({main_duration}s)"
            )

        section_audio, section_sr = _load_audio(section_path, mono=False)
        if main_sr != section_sr:
            # Audio is (samples,) or (samples, channels): axis 0 is time in
            # both layouts, so one call resamples every channel.
            section_audio = librosa.resample(
                section_audio, orig_sr=section_sr, target_sr=main_sr, axis=0
            )

        # Split the main audio around the crossfade region.
        fade_start, fade_end = detect_crossfade_point(
            insert_time, main_duration, crossfade_duration
        )
        main_before = main_audio[: int(fade_start * main_sr)]
        main_after = main_audio[int(fade_end * main_sr) :]

        # Join the section with the remainder, then prepend the untouched head.
        result = apply_crossfade(section_audio, main_after, crossfade_duration, main_sr)
        final_audio = np.concatenate([main_before, result])

        # Resolve the output directory.
        if output_path is None:
            output_path = tempfile.mkdtemp(suffix="_inserted")
        else:
            os.makedirs(output_path, exist_ok=True)

        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
        output_filename = f"{main_filename}_with_insertion.{output_format}"
        output_file = os.path.join(output_path, output_filename)

        sf.write(output_file, final_audio, main_sr)
        return output_file
    except Exception as e:
        # Keep the public RuntimeError contract but preserve the root cause.
        raise RuntimeError(f"Error inserting audio section: {str(e)}") from e
def insert_multiple_sections(
    audio_path: str,
    sections: list[tuple],
    crossfade_duration: float = 0.1,
    output_path: Optional[str] = None,
    output_format: str = "wav",
) -> str:
    """
    Insert multiple sections into an audio track at specified positions.

    Every insert time refers to the timeline of the ORIGINAL main audio.
    Sections are spliced in from the latest position to the earliest so that
    an insertion never shifts the position of one still to be made.

    Args:
        audio_path: Path to the main audio file
        sections: List of ``(section_path, insert_time)`` tuples. Extra
            trailing elements (e.g. a third value) are accepted and ignored.
            section_path: Path to audio section to insert
            insert_time: Position to insert section (in seconds), within
                ``[0, main duration]``
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_path: Optional output directory (default: None, uses a fresh
            temp directory); created if it does not exist
        output_format: File extension used for the output file (default: 'wav')

    Returns:
        Path to the written file, named
        ``<main file stem>_with_multiple_insertions.<output_format>``

    Raises:
        RuntimeError: On any failure (invalid arguments, unreadable input,
            write error). The original exception is chained as ``__cause__``.

    Examples:
        >>> insert_multiple_sections("track.wav", [("intro.wav", 0), ("ad1.wav", 30), ("ad2.wav", 180)], 0.2)
        # Returns '<temp dir>/track_with_multiple_insertions.wav'

    Note:
        - Each splice is performed by ``apply_crossfade``
        - The main audio between the two crossfade boundaries of each insert
          position is not carried into the output
        - Works best with insertion times that are further apart than the
          crossfade duration
    """
    try:
        if crossfade_duration < 0:
            raise ValueError("Crossfade duration must be non-negative")

        main_audio, main_sr = _load_audio(audio_path, mono=False)
        original_duration = len(main_audio) / main_sr

        # Normalise entries to (path, time) and validate them up front, before
        # any section is decoded.
        placements = []
        for entry in sections:
            section_path, insert_time, *_ = entry
            if insert_time < 0 or insert_time > original_duration:
                raise ValueError(
                    f"Insert time ({insert_time}s) for '{section_path}' is outside "
                    f"the main audio (0-{original_duration}s)"
                )
            placements.append((section_path, insert_time))

        # Latest first: audio before a splice point is never moved by the
        # splice, so earlier insert times stay valid.
        placements.sort(key=lambda item: item[1], reverse=True)

        current_audio = main_audio
        for section_path, insert_time in placements:
            section_audio, section_sr = _load_audio(section_path, mono=False)

            if section_sr != main_sr:
                # axis 0 is time for both mono and (samples, channels) audio.
                section_audio = librosa.resample(
                    section_audio, orig_sr=section_sr, target_sr=main_sr, axis=0
                )

            current_duration = len(current_audio) / main_sr
            fade_start, fade_end = detect_crossfade_point(
                insert_time, current_duration, crossfade_duration
            )

            current_before = current_audio[: int(fade_start * main_sr)]
            current_after = current_audio[int(fade_end * main_sr) :]

            section_with_fade = apply_crossfade(
                section_audio, current_after, crossfade_duration, main_sr
            )
            current_audio = np.concatenate([current_before, section_with_fade])

        # Resolve the output directory.
        if output_path is None:
            output_path = tempfile.mkdtemp(suffix="_multi_inserted")
        else:
            os.makedirs(output_path, exist_ok=True)

        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
        output_filename = f"{main_filename}_with_multiple_insertions.{output_format}"
        output_file = os.path.join(output_path, output_filename)

        sf.write(output_file, current_audio, main_sr)
        return output_file
    except Exception as e:
        # Keep the public RuntimeError contract but preserve the root cause.
        raise RuntimeError(f"Error inserting multiple sections: {str(e)}") from e
def replace_section(
    audio_path: str,
    start_time: float,
    end_time: float,
    replacement_path: str,
    crossfade_duration: float = 0.1,
    output_path: Optional[str] = None,
    output_format: str = "wav",
) -> str:
    """
    Replace a section of an audio track with another audio segment.

    The range ``[start_time, end_time)`` is cut out of the main audio and the
    replacement, faded in at its start and faded out at its end, is placed in
    the gap.

    Args:
        audio_path: Path to the main audio file
        start_time: Start time of section to replace (in seconds, >= 0)
        end_time: End time of section to replace (in seconds, > start_time)
        replacement_path: Path to the replacement audio segment
        crossfade_duration: Length of each fade in seconds (default: 0.1);
            0 disables fading
        output_path: Optional output directory (default: None, uses a fresh
            temp directory); created if it does not exist
        output_format: File extension used for the output file (default: 'wav')

    Returns:
        Path to the written file, named
        ``<main file stem>_replaced.<output_format>``

    Raises:
        RuntimeError: On any failure (invalid arguments, unreadable input,
            write error). The original exception is chained as ``__cause__``.

    Examples:
        >>> replace_section("song.wav", 60.0, 90.0, "new_verse.wav", 0.2, "output", "wav")
        # Returns 'output/song_replaced.wav' with the 60-90s range replaced

    Note:
        - Replacement is trimmed if longer than ``end_time - start_time``;
          a shorter replacement is used as-is, so the output gets shorter
        - The replacement is resampled to the main audio's sample rate
        - The output is written at the main audio's sample rate
    """
    try:
        # Validate timing before decoding anything. A negative start would
        # otherwise be read as an index counted from the end of the audio.
        if start_time < 0:
            raise ValueError("Start time must be non-negative")
        if start_time >= end_time:
            raise ValueError("Start time must be less than end time")
        if crossfade_duration < 0:
            raise ValueError("Crossfade duration must be non-negative")

        main_audio, main_sr = _load_audio(audio_path, mono=False)
        replacement_audio, replacement_sr = _load_audio(replacement_path, mono=False)

        # Keep everything outside [start, end).
        start_sample = int(start_time * main_sr)
        end_sample = int(end_time * main_sr)
        main_before = main_audio[:start_sample]
        main_after = main_audio[end_sample:]

        if replacement_sr != main_sr:
            # axis 0 is time for both mono and (samples, channels) audio.
            replacement_audio = librosa.resample(
                replacement_audio, orig_sr=replacement_sr, target_sr=main_sr, axis=0
            )

        # Trim the replacement to the length of the gap (no-op when shorter).
        replacement_samples = int((end_time - start_time) * main_sr)
        trimmed_replacement = replacement_audio[:replacement_samples]

        # The fade may not exceed the replacement itself. With zero samples
        # the fades are skipped, because `[-0:]` would select the whole array.
        fade_samples = min(int(crossfade_duration * main_sr), len(trimmed_replacement))
        if fade_samples > 0:
            fade_in = np.linspace(0, 1, fade_samples)
            fade_out = np.linspace(1, 0, fade_samples)
            if trimmed_replacement.ndim > 1:
                # Column vectors broadcast the envelope across channels.
                fade_in = fade_in[:, np.newaxis]
                fade_out = fade_out[:, np.newaxis]
            trimmed_replacement[:fade_samples] *= fade_in
            trimmed_replacement[-fade_samples:] *= fade_out

        final_audio = np.concatenate([main_before, trimmed_replacement, main_after])

        # Resolve the output directory.
        if output_path is None:
            output_path = tempfile.mkdtemp(suffix="_replaced")
        else:
            os.makedirs(output_path, exist_ok=True)

        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
        output_filename = f"{main_filename}_replaced.{output_format}"
        output_file = os.path.join(output_path, output_filename)

        sf.write(output_file, final_audio, main_sr)
        return output_file
    except Exception as e:
        # Keep the public RuntimeError contract but preserve the root cause.
        raise RuntimeError(f"Error replacing audio section: {str(e)}") from e
def insert_section_wrapper(
    audio_path: str,
    insert_path: str,
    insert_time: float,
    crossfade_duration: float = 0.1,
    output_format: str = "wav",
) -> str:
    """
    Call ``insert_section`` and report failures as a string (for MCP integration).

    The output always goes to a temp directory (``output_path=None``).

    Args:
        audio_path: Path to the main audio file
        insert_path: Path to the audio section to insert
        insert_time: Time to insert the section (in seconds)
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_format: Output format ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to output file, or ``"Error: <message>"`` if anything raised
    """
    call_kwargs = {
        "audio_path": audio_path,
        "section_path": insert_path,
        "insert_time": insert_time,
        "crossfade_duration": crossfade_duration,
        "output_path": None,
        "output_format": output_format,
    }
    try:
        produced_file = insert_section(**call_kwargs)
    except Exception as exc:
        # Never propagate: the caller expects a plain string either way.
        return f"Error: {str(exc)}"
    return produced_file
def replace_section_wrapper(
    audio_path: str,
    start_time: float,
    end_time: float,
    replacement_path: str,
    crossfade_duration: float = 0.1,
    output_format: str = "wav",
) -> str:
    """
    Call ``replace_section`` and report failures as a string (for MCP integration).

    The output always goes to a temp directory (``output_path=None``).

    Args:
        audio_path: Path to the main audio file
        start_time: Start time of section to replace (in seconds)
        end_time: End time of section to replace (in seconds)
        replacement_path: Path to the replacement audio segment
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_format: Output format ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to output file, or ``"Error: <message>"`` if anything raised
    """
    call_kwargs = {
        "audio_path": audio_path,
        "start_time": start_time,
        "end_time": end_time,
        "replacement_path": replacement_path,
        "crossfade_duration": crossfade_duration,
        "output_path": None,
        "output_format": output_format,
    }
    try:
        produced_file = replace_section(**call_kwargs)
    except Exception as exc:
        # Never propagate: the caller expects a plain string either way.
        return f"Error: {str(exc)}"
    return produced_file
| if __name__ == "__main__": | |
| """ | |
| Script section for running audio insertion/replacement locally. | |
| Usage: | |
| python tools/audio_insertion.py insert main.wav insert.wav 30.0 | |
| python tools/audio_insertion.py replace main.wav 10.0 20.0 replacement.wav | |
| """ | |
| import argparse | |
| import sys | |
| parser = argparse.ArgumentParser( | |
| description="Insert or replace audio sections", | |
| formatter_class=argparse.RawDescriptionHelpFormatter, | |
| epilog=""" | |
| Examples: | |
| # Insert section at 30 seconds | |
| python tools/audio_insertion.py insert main.wav insert.wav 30.0 | |
| # Replace section from 10s to 20s | |
| python tools/audio_insertion.py replace main.wav 10.0 20.0 replacement.wav | |
| # With custom crossfade | |
| python tools/audio_insertion.py insert main.wav insert.wav 30.0 --crossfade 0.2 | |
| """, | |
| ) | |
| subparsers = parser.add_subparsers(dest="command", help="Command to run") | |
| # Insert command | |
| insert_parser = subparsers.add_parser("insert", help="Insert audio section") | |
| insert_parser.add_argument("main", help="Main audio file") | |
| insert_parser.add_argument("insert", help="Audio section to insert") | |
| insert_parser.add_argument("time", type=float, help="Insert time in seconds") | |
| insert_parser.add_argument( | |
| "--crossfade", | |
| type=float, | |
| default=0.1, | |
| help="Crossfade duration in seconds (default: 0.1)", | |
| ) | |
| insert_parser.add_argument( | |
| "--format", | |
| choices=["wav", "mp3"], | |
| default="wav", | |
| help="Output format (default: wav)", | |
| ) | |
| # Replace command | |
| replace_parser = subparsers.add_parser("replace", help="Replace audio section") | |
| replace_parser.add_argument("main", help="Main audio file") | |
| replace_parser.add_argument("start", type=float, help="Start time in seconds") | |
| replace_parser.add_argument("end", type=float, help="End time in seconds") | |
| replace_parser.add_argument("replacement", help="Replacement audio section") | |
| replace_parser.add_argument( | |
| "--crossfade", | |
| type=float, | |
| default=0.1, | |
| help="Crossfade duration in seconds (default: 0.1)", | |
| ) | |
| replace_parser.add_argument( | |
| "--format", | |
| choices=["wav", "mp3"], | |
| default="wav", | |
| help="Output format (default: wav)", | |
| ) | |
| args = parser.parse_args() | |
| if not args.command: | |
| parser.print_help() | |
| sys.exit(1) | |
| print("Audio Insertion Tool") | |
| print("=" * 25) | |
| try: | |
| result = None | |
| if args.command == "insert": | |
| print(f"Main audio: {args.main}") | |
| print(f"Insert section: {args.insert}") | |
| print(f"Insert time: {args.time}s") | |
| print(f"Crossfade: {args.crossfade}s") | |
| print() | |
| result = insert_section_wrapper( | |
| audio_path=args.main, | |
| insert_path=args.insert, | |
| insert_time=args.time, | |
| crossfade_duration=args.crossfade, | |
| output_format=args.format, | |
| ) | |
| elif args.command == "replace": | |
| print(f"Main audio: {args.main}") | |
| print(f"Replace section: {args.start}s - {args.end}s") | |
| print(f"Replacement: {args.replacement}") | |
| print(f"Crossfade: {args.crossfade}s") | |
| print() | |
| result = replace_section_wrapper( | |
| audio_path=args.main, | |
| start_time=args.start, | |
| end_time=args.end, | |
| replacement_path=args.replacement, | |
| crossfade_duration=args.crossfade, | |
| output_format=args.format, | |
| ) | |
| if result is None: | |
| print("β No command executed") | |
| sys.exit(1) | |
| elif result.startswith("Error:"): | |
| print(f"β {result}") | |
| sys.exit(1) | |
| else: | |
| print(f"β Audio {args.command}ion completed!") | |
| print(f"Output saved to: {result}") | |
| except Exception as e: | |
| print(f"β Error: {e}") | |
| sys.exit(1) | |