# music-mcp/tools/audio_insertion.py
# Last commit: "fixing tools" (f62bfdb) by frascuchon (HF Staff)
import os
import tempfile
from typing import Optional
import librosa
import numpy as np
import soundfile as sf
def _load_audio(audio_path: str, mono: bool = False) -> tuple[np.ndarray, int]:
    """
    Load an audio file at its native sample rate in time-major layout.

    Args:
        audio_path: Path handed to ``librosa.load``
        mono: When True, ask librosa to down-mix to a single channel

    Returns:
        Tuple of (audio, sample_rate). Multi-channel audio is returned as
        (samples, channels); single-channel audio stays one-dimensional.
    """
    y, sr = librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")
    # librosa yields multi-channel audio as (channels, samples). The rest of this
    # module slices axis 0 as time, so transpose for ANY channel count, not only
    # stereo.
    if y.ndim > 1:
        y = y.T
    return y, int(sr)
def detect_crossfade_point(
    insert_position: float, audio_duration: float, crossfade_duration: float = 0.1
) -> tuple[float, float]:
    """
    Compute the crossfade window centred on an insertion point.

    The window spans half the crossfade length on each side of
    ``insert_position`` and is clamped to the range [0, audio_duration].

    Args:
        insert_position: Where the section will be inserted (in seconds)
        audio_duration: Total duration of the target audio (in seconds)
        crossfade_duration: Length of crossfade (in seconds)

    Returns:
        Tuple of (start_time, end_time) for the crossfade region
    """
    half_window = crossfade_duration / 2
    return (
        max(0, insert_position - half_window),
        min(audio_duration, insert_position + half_window),
    )
def apply_crossfade(
    section: np.ndarray, target: np.ndarray, crossfade_duration: float, sample_rate: int
) -> np.ndarray:
    """
    Join ``section`` onto the front of ``target`` using linear fades.

    The whole section is kept. Its first samples are faded in from silence and
    its last samples are overlapped with the first samples of ``target`` using
    complementary linear ramps, so the two signals blend instead of butting
    together. Neither input array is modified.

    Args:
        section: Audio to insert, shaped (samples,) or (samples, channels)
        target: Audio that follows the section, shaped (samples,) or (samples, channels)
        crossfade_duration: Length of each fade in seconds; values <= 0 disable fading
        sample_rate: Sample rate shared by both arrays, in Hz

    Returns:
        New array holding the section followed by the target. Its length is
        len(section) + len(target) minus the number of overlapped samples.

    Raises:
        ValueError: If both inputs are multi-channel with different channel counts
    """
    section = np.asarray(section)
    target = np.asarray(target)

    # Make the section follow the target's channel layout so the join is valid.
    if section.ndim != target.ndim:
        if section.ndim == 1:
            # Mono section into multi-channel target: duplicate across channels.
            section = np.repeat(section[:, np.newaxis], target.shape[1], axis=1)
        else:
            # Multi-channel section into mono target: down-mix by averaging.
            section = section.mean(axis=1)
    elif section.ndim > 1 and section.shape[1] != target.shape[1]:
        raise ValueError(
            f"Channel count mismatch: section has {section.shape[1]}, "
            f"target has {target.shape[1]}"
        )

    # Work in a floating dtype on private copies; callers keep their arrays intact.
    dtype = np.result_type(section.dtype, target.dtype)
    if not np.issubdtype(dtype, np.floating):
        dtype = np.float64
    faded = section.astype(dtype, copy=True)
    tail = target.astype(dtype, copy=True)

    # A fade can never be longer than the section itself.
    fade_samples = min(max(int(crossfade_duration * sample_rate), 0), len(faded))
    if fade_samples > 0:
        ramp_up = np.linspace(0.0, 1.0, fade_samples, dtype=dtype)
        if faded.ndim > 1:
            ramp_up = ramp_up[:, np.newaxis]
        faded[:fade_samples] *= ramp_up
        faded[-fade_samples:] *= ramp_up[::-1]

    # Overlap the faded-out section tail with the faded-in head of the target.
    overlap = min(fade_samples, len(tail))
    if overlap > 0:
        lead_in = np.linspace(0.0, 1.0, overlap, dtype=dtype)
        if tail.ndim > 1:
            lead_in = lead_in[:, np.newaxis]
        faded[-overlap:] += tail[:overlap] * lead_in
        tail = tail[overlap:]

    return np.concatenate([faded, tail], axis=0)
def insert_section(
    audio_path: str,
    section_path: str,
    insert_time: float,
    crossfade_duration: float = 0.1,
    output_path: Optional[str] = None,
    output_format: str = "wav",
) -> str:
    """
    Insert one audio file into another around a given time position.

    The main audio is cut around ``insert_time`` using the window returned by
    ``detect_crossfade_point`` (the main-audio samples inside that window are
    dropped). The section is resampled to the main audio's sample rate when the
    rates differ, joined to the audio after the cut by ``apply_crossfade``, and
    the result is appended to the audio before the cut.

    Args:
        audio_path: Path to the main audio file
        section_path: Path to the audio section to insert
        insert_time: Position to insert the section (in seconds from start of main audio)
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_path: Optional output directory (default: None, a new temp directory is created)
        output_format: File extension for the output ('wav' or 'mp3', default: 'wav');
            the encoder is chosen by ``soundfile`` from this extension

    Returns:
        Path to the written file, named ``<main name>_with_insertion.<output_format>``

    Raises:
        RuntimeError: For any failure; the original exception is attached as
            ``__cause__`` (e.g. a ValueError for an out-of-range ``insert_time``)

    Examples:
        >>> insert_section("main_track.wav", "intro.wav", 5.0, 0.2, "output", "wav")
        # Returns 'output/main_track_with_insertion.wav'
        >>> insert_section("podcast.mp3", "advertisement.mp3", 180.0, 0.5, "output", "mp3")
        # Returns 'output/podcast_with_insertion.mp3'

    Note:
        - Insert position is measured from the start of the main audio
        - The output keeps the main audio's sample rate
        - Works with mono or stereo audio files
    """
    try:
        # Reject impossible arguments before paying for any decoding.
        if insert_time < 0:
            raise ValueError("Insert time must be positive")
        if crossfade_duration < 0:
            raise ValueError("Crossfade duration must be non-negative")

        # Load both audio files
        main_audio, main_sr = _load_audio(audio_path, mono=False)
        section_audio, section_sr = _load_audio(section_path, mono=False)

        # Bring the section to the main audio's sample rate
        if main_sr != section_sr:
            if section_audio.ndim > 1:
                # Audio is (samples, channels): resample one channel at a time
                section_audio = np.stack(
                    [
                        librosa.resample(
                            section_audio[:, ch], orig_sr=section_sr, target_sr=main_sr
                        )
                        for ch in range(section_audio.shape[1])
                    ],
                    axis=1,
                )
            else:
                section_audio = librosa.resample(
                    section_audio, orig_sr=section_sr, target_sr=main_sr
                )

        # The upper bound can only be checked once the duration is known
        main_duration = len(main_audio) / main_sr
        if insert_time > main_duration:
            raise ValueError(
                f"Insert time ({insert_time}s) exceeds main audio duration ({main_duration}s)"
            )

        # Cut the main audio around the insertion point
        fade_start, fade_end = detect_crossfade_point(
            insert_time, main_duration, crossfade_duration
        )
        main_before = main_audio[: int(fade_start * main_sr)]
        main_after = main_audio[int(fade_end * main_sr) :]

        # Join the section with the audio that follows the cut, then prepend the rest
        result = apply_crossfade(section_audio, main_after, crossfade_duration, main_sr)
        final_audio = np.concatenate([main_before, result])

        # Resolve the output directory
        if output_path is None:
            output_path = tempfile.mkdtemp(suffix="_inserted")
        else:
            os.makedirs(output_path, exist_ok=True)

        # Output name is derived from the main file's base name
        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
        output_filename = f"{main_filename}_with_insertion.{output_format}"
        output_file = os.path.join(output_path, output_filename)

        sf.write(output_file, final_audio, main_sr)
        return output_file
    except Exception as e:
        # Keep the original exception reachable for callers and tracebacks
        raise RuntimeError(f"Error inserting audio section: {str(e)}") from e
def insert_multiple_sections(
    audio_path: str,
    sections: list[tuple[str, float, float]],
    crossfade_duration: float = 0.1,
    output_path: Optional[str] = None,
    output_format: str = "wav",
) -> str:
    """
    Insert several audio files into a main track, one after another.

    Entries are processed in ascending ``insert_time`` order. Each insertion
    operates on the audio produced by the previous one, so an ``insert_time`` is
    measured against the track as it stands at that point (already lengthened by
    earlier insertions). Cutting uses ``detect_crossfade_point`` and joining is
    delegated to ``apply_crossfade``.

    Args:
        audio_path: Path to the main audio file
        sections: List of (section_path, insert_time) tuples; extra trailing
            elements (such as the third value in the annotated type) are ignored
            section_path: Path to audio section to insert
            insert_time: Position to insert section (in seconds)
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_path: Optional output directory (default: None, a new temp directory is created)
        output_format: File extension for the output ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to the written file, named
        ``<main name>_with_multiple_insertions.<output_format>``

    Raises:
        RuntimeError: For any failure; the original exception is attached as
            ``__cause__`` (e.g. a ValueError for a malformed entry or an
            out-of-range ``insert_time``)

    Examples:
        >>> insert_multiple_sections("track.wav", [("intro.wav", 0), ("ad1.wav", 30), ("ad2.wav", 180)], 0.2)
        # Returns '<temp dir>/track_with_multiple_insertions.wav'
        >>> insert_multiple_sections("podcast.mp3", [("sponsor.wav", 60)], 0.3, "output", "mp3")
        # Returns 'output/podcast_with_multiple_insertions.mp3'

    Note:
        - Sections are inserted in chronological order
        - An empty ``sections`` list writes the main audio unchanged
    """
    try:
        if crossfade_duration < 0:
            raise ValueError("Crossfade duration must be non-negative")

        # Normalise entries before decoding anything: the documented form is
        # (path, time) while the annotation allows a third value, so take the
        # first two elements and ignore the rest.
        schedule = []
        for entry in sections:
            if len(entry) < 2:
                raise ValueError(
                    "Each section must provide at least (section_path, insert_time)"
                )
            section_path, insert_time = entry[0], entry[1]
            if insert_time < 0:
                raise ValueError("Insert time must be positive")
            schedule.append((section_path, insert_time))
        schedule.sort(key=lambda item: item[1])

        # Load main audio; slicing and concatenation below never mutate it
        current_audio, main_sr = _load_audio(audio_path, mono=False)
        main_duration = len(current_audio) / main_sr

        for section_path, insert_time in schedule:
            if insert_time > main_duration:
                raise ValueError(
                    f"Insert time ({insert_time}s) exceeds main audio duration ({main_duration}s)"
                )

            section_audio, section_sr = _load_audio(section_path, mono=False)

            # Bring the section to the main audio's sample rate
            if section_sr != main_sr:
                if section_audio.ndim > 1:
                    # Audio is (samples, channels): resample one channel at a time
                    section_audio = np.stack(
                        [
                            librosa.resample(
                                section_audio[:, ch],
                                orig_sr=section_sr,
                                target_sr=main_sr,
                            )
                            for ch in range(section_audio.shape[1])
                        ],
                        axis=1,
                    )
                else:
                    section_audio = librosa.resample(
                        section_audio, orig_sr=section_sr, target_sr=main_sr
                    )

            # Cut the current audio around the insertion point
            fade_start, fade_end = detect_crossfade_point(
                insert_time, main_duration, crossfade_duration
            )
            current_before = current_audio[: int(fade_start * main_sr)]
            current_after = current_audio[int(fade_end * main_sr) :]

            # Join the section with what follows the cut, then prepend the rest
            section_with_fade = apply_crossfade(
                section_audio, current_after, crossfade_duration, main_sr
            )
            current_audio = np.concatenate([current_before, section_with_fade])

            # The next insertion is measured against the lengthened audio
            main_duration = len(current_audio) / main_sr

        # Resolve the output directory
        if output_path is None:
            output_path = tempfile.mkdtemp(suffix="_multi_inserted")
        else:
            os.makedirs(output_path, exist_ok=True)

        # Output name is derived from the main file's base name
        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
        output_filename = f"{main_filename}_with_multiple_insertions.{output_format}"
        output_file = os.path.join(output_path, output_filename)

        sf.write(output_file, current_audio, main_sr)
        return output_file
    except Exception as e:
        # Keep the original exception reachable for callers and tracebacks
        raise RuntimeError(f"Error inserting multiple sections: {str(e)}") from e
def _shape_replacement(
    replacement: np.ndarray, reference: np.ndarray, max_samples: int, fade_samples: int
) -> np.ndarray:
    """Trim, channel-match and edge-fade a replacement clip; returns a new array."""
    clip = np.array(replacement[:max_samples], copy=True)
    if not np.issubdtype(clip.dtype, np.floating):
        clip = clip.astype(np.float64)

    # Follow the reference (main audio) channel layout so concatenation is valid.
    if clip.ndim != reference.ndim:
        if clip.ndim == 1:
            # Mono clip into multi-channel audio: duplicate across channels.
            clip = np.repeat(clip[:, np.newaxis], reference.shape[1], axis=1)
        else:
            # Multi-channel clip into mono audio: down-mix by averaging.
            clip = clip.mean(axis=1)
    elif clip.ndim > 1 and clip.shape[1] != reference.shape[1]:
        raise ValueError(
            f"Channel count mismatch: replacement has {clip.shape[1]}, "
            f"main audio has {reference.shape[1]}"
        )

    # Clamp the fade to the clip length. A zero-length fade must skip the
    # slicing entirely because ``clip[-0:]`` selects the whole array.
    ramp_len = min(max(fade_samples, 0), len(clip))
    if ramp_len > 0:
        ramp_up = np.linspace(0.0, 1.0, ramp_len, dtype=clip.dtype)
        if clip.ndim > 1:
            ramp_up = ramp_up[:, np.newaxis]
        clip[:ramp_len] *= ramp_up
        clip[-ramp_len:] *= ramp_up[::-1]
    return clip


def replace_section(
    audio_path: str,
    start_time: float,
    end_time: float,
    replacement_path: str,
    crossfade_duration: float = 0.1,
    output_path: Optional[str] = None,
    output_format: str = "wav",
) -> str:
    """
    Replace a time range of an audio track with another audio segment.

    The samples between ``start_time`` and ``end_time`` are removed from the main
    audio. The replacement is resampled to the main sample rate when needed,
    trimmed to at most ``end_time - start_time`` seconds, faded in and out at its
    edges, and placed between the remaining parts of the main audio.

    Args:
        audio_path: Path to the main audio file
        start_time: Start time of section to replace (in seconds)
        end_time: End time of section to replace (in seconds)
        replacement_path: Path to the replacement audio segment
        crossfade_duration: Length of the edge fades in seconds (default: 0.1);
            0 disables fading
        output_path: Optional output directory (default: None, a new temp directory is created)
        output_format: File extension for the output ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to the written file, named ``<main name>_replaced.<output_format>``

    Raises:
        RuntimeError: For any failure; the original exception is attached as
            ``__cause__`` (e.g. a ValueError for an invalid time range)

    Examples:
        >>> replace_section("song.wav", 60.0, 90.0, "new_verse.wav", 0.2, "output", "wav")
        # Returns 'output/song_replaced.wav' with 60-90s section replaced
        >>> replace_section("podcast.mp3", 120.0, 150.0, "correction.wav", 0.3, "output", "mp3")
        # Returns 'output/podcast_replaced.mp3' with 120-150s section replaced

    Note:
        - Start time must be less than end time
        - Replacement is trimmed if longer than the replaced range; a shorter
          replacement is used as-is, which shortens the output
        - The output keeps the main audio's sample rate
    """
    try:
        # Validate arguments before paying for any decoding.
        if start_time >= end_time:
            raise ValueError("Start time must be less than end time")
        if start_time < 0:
            # A negative start would silently slice from the END of the audio
            raise ValueError("Start time must be non-negative")
        if crossfade_duration < 0:
            raise ValueError("Crossfade duration must be non-negative")

        # Load both audio files
        main_audio, main_sr = _load_audio(audio_path, mono=False)
        replacement_audio, replacement_sr = _load_audio(replacement_path, mono=False)

        main_duration = len(main_audio) / main_sr
        if start_time > main_duration:
            raise ValueError(
                f"Start time ({start_time}s) exceeds main audio duration ({main_duration}s)"
            )

        # Split the main audio around the range being replaced
        start_sample = int(start_time * main_sr)
        end_sample = int(end_time * main_sr)
        main_before = main_audio[:start_sample]
        main_after = main_audio[end_sample:]

        # Bring the replacement to the main audio's sample rate
        if replacement_sr != main_sr:
            if replacement_audio.ndim > 1:
                # Audio is (samples, channels): resample one channel at a time
                replacement_audio = np.stack(
                    [
                        librosa.resample(
                            replacement_audio[:, ch],
                            orig_sr=replacement_sr,
                            target_sr=main_sr,
                        )
                        for ch in range(replacement_audio.shape[1])
                    ],
                    axis=1,
                )
            else:
                replacement_audio = librosa.resample(
                    replacement_audio, orig_sr=replacement_sr, target_sr=main_sr
                )

        # Trim, channel-match and fade the replacement
        replacement_samples = int((end_time - start_time) * main_sr)
        fade_samples = int(crossfade_duration * main_sr)
        trimmed_replacement = _shape_replacement(
            replacement_audio, main_audio, replacement_samples, fade_samples
        )

        # Combine all parts
        final_audio = np.concatenate([main_before, trimmed_replacement, main_after])

        # Resolve the output directory
        if output_path is None:
            output_path = tempfile.mkdtemp(suffix="_replaced")
        else:
            os.makedirs(output_path, exist_ok=True)

        # Output name is derived from the main file's base name
        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
        output_filename = f"{main_filename}_replaced.{output_format}"
        output_file = os.path.join(output_path, output_filename)

        sf.write(output_file, final_audio, main_sr)
        return output_file
    except Exception as e:
        # Keep the original exception reachable for callers and tracebacks
        raise RuntimeError(f"Error replacing audio section: {str(e)}") from e
def insert_section_wrapper(
    audio_path: str,
    insert_path: str,
    insert_time: float,
    crossfade_duration: float = 0.1,
    output_format: str = "wav",
) -> str:
    """
    Call ``insert_section`` and report failures as text instead of raising.

    Intended for MCP integration: the output always goes to a temp directory
    (``output_path=None``) and any exception becomes an ``"Error: ..."`` string.

    Args:
        audio_path: Path to the main audio file
        insert_path: Path to the audio section to insert
        insert_time: Time to insert the section (in seconds)
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_format: Output format ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to output file, or a message starting with "Error: " on failure
    """
    try:
        produced_file = insert_section(
            audio_path=audio_path,
            section_path=insert_path,
            insert_time=insert_time,
            crossfade_duration=crossfade_duration,
            output_path=None,
            output_format=output_format,
        )
    except Exception as failure:
        return f"Error: {str(failure)}"
    else:
        return produced_file
def replace_section_wrapper(
    audio_path: str,
    start_time: float,
    end_time: float,
    replacement_path: str,
    crossfade_duration: float = 0.1,
    output_format: str = "wav",
) -> str:
    """
    Call ``replace_section`` and report failures as text instead of raising.

    Intended for MCP integration: the output always goes to a temp directory
    (``output_path=None``) and any exception becomes an ``"Error: ..."`` string.

    Args:
        audio_path: Path to the main audio file
        start_time: Start time of section to replace (in seconds)
        end_time: End time of section to replace (in seconds)
        replacement_path: Path to the replacement audio segment
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_format: Output format ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to output file, or a message starting with "Error: " on failure
    """
    try:
        produced_file = replace_section(
            audio_path=audio_path,
            start_time=start_time,
            end_time=end_time,
            replacement_path=replacement_path,
            crossfade_duration=crossfade_duration,
            output_path=None,
            output_format=output_format,
        )
    except Exception as failure:
        return f"Error: {str(failure)}"
    else:
        return produced_file
if __name__ == "__main__":
    # Command-line entry point for running an insertion or replacement locally.
    #
    # Usage:
    #   python tools/audio_insertion.py insert main.wav insert.wav 30.0
    #   python tools/audio_insertion.py replace main.wav 10.0 20.0 replacement.wav
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description="Insert or replace audio sections",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Insert section at 30 seconds
python tools/audio_insertion.py insert main.wav insert.wav 30.0
# Replace section from 10s to 20s
python tools/audio_insertion.py replace main.wav 10.0 20.0 replacement.wav
# With custom crossfade
python tools/audio_insertion.py insert main.wav insert.wav 30.0 --crossfade 0.2
""",
    )
    subparsers = parser.add_subparsers(dest="command", help="Command to run")

    # Insert command
    insert_parser = subparsers.add_parser("insert", help="Insert audio section")
    insert_parser.add_argument("main", help="Main audio file")
    insert_parser.add_argument("insert", help="Audio section to insert")
    insert_parser.add_argument("time", type=float, help="Insert time in seconds")
    insert_parser.add_argument(
        "--crossfade",
        type=float,
        default=0.1,
        help="Crossfade duration in seconds (default: 0.1)",
    )
    insert_parser.add_argument(
        "--format",
        choices=["wav", "mp3"],
        default="wav",
        help="Output format (default: wav)",
    )

    # Replace command
    replace_parser = subparsers.add_parser("replace", help="Replace audio section")
    replace_parser.add_argument("main", help="Main audio file")
    replace_parser.add_argument("start", type=float, help="Start time in seconds")
    replace_parser.add_argument("end", type=float, help="End time in seconds")
    replace_parser.add_argument("replacement", help="Replacement audio section")
    replace_parser.add_argument(
        "--crossfade",
        type=float,
        default=0.1,
        help="Crossfade duration in seconds (default: 0.1)",
    )
    replace_parser.add_argument(
        "--format",
        choices=["wav", "mp3"],
        default="wav",
        help="Output format (default: wav)",
    )

    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        sys.exit(1)

    print("Audio Insertion Tool")
    print("=" * 25)
    try:
        result = None
        if args.command == "insert":
            print(f"Main audio: {args.main}")
            print(f"Insert section: {args.insert}")
            print(f"Insert time: {args.time}s")
            print(f"Crossfade: {args.crossfade}s")
            print()
            result = insert_section_wrapper(
                audio_path=args.main,
                insert_path=args.insert,
                insert_time=args.time,
                crossfade_duration=args.crossfade,
                output_format=args.format,
            )
        elif args.command == "replace":
            print(f"Main audio: {args.main}")
            print(f"Replace section: {args.start}s - {args.end}s")
            print(f"Replacement: {args.replacement}")
            print(f"Crossfade: {args.crossfade}s")
            print()
            result = replace_section_wrapper(
                audio_path=args.main,
                start_time=args.start,
                end_time=args.end,
                replacement_path=args.replacement,
                crossfade_duration=args.crossfade,
                output_format=args.format,
            )

        # The wrappers signal failure by returning a string starting with "Error:"
        if result is None:
            print("❌ No command executed")
            sys.exit(1)
        elif result.startswith("Error:"):
            print(f"❌ {result}")
            sys.exit(1)
        else:
            # Appending "ion" to the command name only works for "insert"
            # ("replaceion"), so map each command to its noun explicitly.
            action_noun = {"insert": "insertion", "replace": "replacement"}[args.command]
            print(f"✅ Audio {action_noun} completed!")
            print(f"Output saved to: {result}")
    except Exception as e:
        print(f"❌ Error: {e}")
        sys.exit(1)