Spaces:

frascuchon
/

music-mcp

Running on CPU Upgrade

App Files Files Community

music-mcp / tools /audio_insertion.py

frascuchon HF Staff

fixing tools

f62bfdb 12 days ago

raw

history blame contribute delete

22.5 kB

	import os
	import tempfile
	from typing import Optional

	import librosa
	import numpy as np
	import soundfile as sf


	def _load_audio(audio_path: str, mono: bool = False) -> tuple[np.ndarray, int]:
	    """Load an audio file at its native sample rate.

	    Args:
	        audio_path: Path of the audio file to read.
	        mono: When True, ask librosa to downmix to a single channel.

	    Returns:
	        Tuple ``(samples, sample_rate)``. Multi-channel audio is returned
	        with shape ``(samples, channels)``; single-channel audio stays 1-D.
	    """
	    # sr=None keeps the file's own sample rate instead of resampling on load.
	    y, sr = librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")
	    # librosa returns multi-channel audio as (channels, samples). Flip it for
	    # ANY channel count (not only stereo) so that len() and slicing along
	    # axis 0 always operate on time.
	    if y.ndim > 1:
	        y = y.T
	    return y, int(sr)


	def detect_crossfade_point(
	    insert_position: float, audio_duration: float, crossfade_duration: float = 0.1
	) -> tuple[float, float]:
	    """
	    Compute the time window of the crossfade centred on an insert position.

	    The window spans half of ``crossfade_duration`` on each side of
	    ``insert_position`` and is clamped to ``[0, audio_duration]``.

	    Args:
	        insert_position: Where to insert the section (in seconds)
	        audio_duration: Total duration of the target audio (in seconds)
	        crossfade_duration: Length of crossfade (in seconds)

	    Returns:
	        Tuple of (start_time, end_time) for crossfade region
	    """
	    half_window = crossfade_duration / 2
	    window_start = insert_position - half_window
	    window_end = insert_position + half_window

	    # Clamp the lower edge at the start of the audio.
	    if not window_start > 0:
	        window_start = 0
	    # Clamp the upper edge at the end of the audio.
	    if not window_end < audio_duration:
	        window_end = audio_duration

	    return window_start, window_end


	def apply_crossfade(
	    section: np.ndarray, target: np.ndarray, crossfade_duration: float, sample_rate: int
	) -> np.ndarray:
	    """
	    Join ``section`` onto the front of ``target`` with a linear crossfade.

	    The whole section is kept. Its last ``crossfade_duration`` seconds are
	    faded out while the first ``crossfade_duration`` seconds of ``target``
	    are faded in, and the two are summed over that overlap. The result is
	    therefore ``len(section) + len(target) - overlap`` samples long.
	    Neither input array is modified.

	    Args:
	        section: Audio section to insert, shape (samples,) or (samples, channels)
	        target: Audio that follows the section, shape (samples,) or (samples, channels)
	        crossfade_duration: Length of crossfade in seconds
	        sample_rate: Sample rate of audio

	    Returns:
	        The section followed by the target, blended over the crossfade region.
	        The result uses the channel layout of ``target``.

	    Raises:
	        ValueError: If both inputs are multi-channel with different channel counts.
	    """
	    section = np.asarray(section)
	    target = np.asarray(target)

	    # Bring the section to the target's channel layout so the pieces can be
	    # mixed and concatenated (e.g. a mono jingle going into a stereo track).
	    if section.ndim == 1 and target.ndim == 2:
	        section = np.repeat(section[:, np.newaxis], target.shape[1], axis=1)
	    elif section.ndim == 2 and target.ndim == 1:
	        section = section.mean(axis=1)
	    elif section.ndim == 2 and target.ndim == 2 and section.shape[1] != target.shape[1]:
	        raise ValueError(
	            f"Channel count mismatch: section has {section.shape[1]}, "
	            f"target has {target.shape[1]}"
	        )

	    # Keep float32 audio as float32; promote integer input to floating point.
	    out_dtype = np.result_type(section.dtype, target.dtype, np.float32)

	    # The overlap can never be longer than either piece; a negative
	    # duration is treated as "no crossfade".
	    overlap = int(max(crossfade_duration, 0) * sample_rate)
	    overlap = min(overlap, len(section), len(target))

	    if overlap == 0:
	        return np.concatenate([section, target]).astype(out_dtype, copy=False)

	    # Complementary linear ramps: the gains always sum to 1.
	    fade_out = np.linspace(1.0, 0.0, overlap)
	    fade_in = 1.0 - fade_out
	    if target.ndim > 1:
	        # Broadcast the ramps across channels.
	        fade_out = fade_out[:, np.newaxis]
	        fade_in = fade_in[:, np.newaxis]

	    # New arrays are created here, so the caller's buffers stay untouched.
	    blended = section[-overlap:] * fade_out + target[:overlap] * fade_in

	    joined = np.concatenate([section[:-overlap], blended, target[overlap:]])
	    return joined.astype(out_dtype, copy=False)


	def insert_section(
	    audio_path: str,
	    section_path: str,
	    insert_time: float,
	    crossfade_duration: float = 0.1,
	    output_path: Optional[str] = None,
	    output_format: str = "wav",
	) -> str:
	    """
	    Insert one audio file into another around a given time position.

	    The main track is cut around ``insert_time`` (the cut region is the one
	    returned by ``detect_crossfade_point``), the section is joined with the
	    part after the cut by ``apply_crossfade``, and the result is written to
	    ``<main file stem>_with_insertion.<output_format>``.

	    Args:
	        audio_path: Path to the main audio file
	        section_path: Path to the audio section to insert
	        insert_time: Position to insert the section (in seconds from start of main audio);
	            must lie between 0 and the main audio duration
	        crossfade_duration: Length of crossfade in seconds (default: 0.1); must not be negative
	        output_path: Optional output directory (default: None, a fresh temp directory is created)
	        output_format: File extension of the output file (default: 'wav')

	    Returns:
	        Path to the written audio file

	    Raises:
	        RuntimeError: On any failure (invalid argument, unreadable input,
	            write error). The original exception is chained as ``__cause__``.

	    Examples:
	        >>> insert_section("main_track.wav", "intro.wav", 5.0, 0.2, "output", "wav")
	        # Returns 'output/main_track_with_insertion.wav'

	        >>> insert_section("podcast.mp3", "advertisement.mp3", 180.0, 0.5, "output", "mp3")
	        # Returns 'output/podcast_with_insertion.mp3'

	    Note:
	        - Insert position is measured from the start of the main audio
	        - The section is resampled to the main audio's sample rate when they differ
	        - The output is written at the main audio's sample rate
	    """
	    try:
	        # Cheap argument checks first, so bad input fails before any decoding.
	        if insert_time < 0:
	            raise ValueError("Insert time must be non-negative")
	        if crossfade_duration < 0:
	            raise ValueError("Crossfade duration must be non-negative")

	        # Load both audio files
	        main_audio, main_sr = _load_audio(audio_path, mono=False)
	        section_audio, section_sr = _load_audio(section_path, mono=False)

	        main_duration = len(main_audio) / main_sr
	        if insert_time > main_duration:
	            raise ValueError(
	                f"Insert time ({insert_time}s) exceeds main audio duration ({main_duration}s)"
	            )

	        # Bring the section to the main track's sample rate
	        if main_sr != section_sr:
	            if section_audio.ndim > 1:
	                # Audio is (samples, channels): resample each channel column
	                # on its own, then stack the columns back together.
	                section_audio = np.column_stack(
	                    [
	                        librosa.resample(
	                            section_audio[:, ch], orig_sr=section_sr, target_sr=main_sr
	                        )
	                        for ch in range(section_audio.shape[1])
	                    ]
	                )
	            else:
	                section_audio = librosa.resample(
	                    section_audio, orig_sr=section_sr, target_sr=main_sr
	                )

	        # Region of the main track around the insert point that is cut out
	        fade_start, fade_end = detect_crossfade_point(
	            insert_time, main_duration, crossfade_duration
	        )
	        main_before = main_audio[: int(fade_start * main_sr)]
	        main_after = main_audio[int(fade_end * main_sr) :]

	        # Join the section with the tail of the main track, then prepend the head
	        result = apply_crossfade(section_audio, main_after, crossfade_duration, main_sr)
	        final_audio = np.concatenate([main_before, result])

	        # Resolve the output directory
	        if output_path is None:
	            output_path = tempfile.mkdtemp(suffix="_inserted")
	        else:
	            os.makedirs(output_path, exist_ok=True)

	        # The output file is named after the main input file
	        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
	        output_filename = f"{main_filename}_with_insertion.{output_format}"
	        output_file = os.path.join(output_path, output_filename)

	        sf.write(output_file, final_audio, main_sr)

	        return output_file

	    except Exception as e:
	        # Keep the original traceback reachable through __cause__.
	        raise RuntimeError(f"Error inserting audio section: {str(e)}") from e


	def insert_multiple_sections(
	    audio_path: str,
	    sections: list[tuple[str, float, float]],
	    crossfade_duration: float = 0.1,
	    output_path: Optional[str] = None,
	    output_format: str = "wav",
	) -> str:
	    """
	    Insert several audio files into a main track, one after another.

	    Sections are processed in ascending ``insert_time`` order. Each one is
	    handled like a single insertion: the current audio is cut around the
	    insert point (``detect_crossfade_point``) and the section is joined
	    with the part after the cut by ``apply_crossfade``.

	    Args:
	        audio_path: Path to the main audio file
	        sections: Sequence of ``(section_path, insert_time)`` entries. A third
	            element is accepted for backward compatibility and ignored.
	            section_path: Path to audio section to insert
	            insert_time: Position to insert section (in seconds, non-negative)
	        crossfade_duration: Length of crossfade in seconds (default: 0.1); must not be negative
	        output_path: Optional output directory (default: None, a fresh temp directory is created)
	        output_format: File extension of the output file (default: 'wav')

	    Returns:
	        Path to the written audio file

	    Raises:
	        RuntimeError: On any failure (invalid argument, malformed section
	            entry, unreadable input, write error). The original exception is
	            chained as ``__cause__``.

	    Examples:
	        >>> insert_multiple_sections("track.wav", [("intro.wav", 0), ("ad1.wav", 30), ("ad2.wav", 180)], 0.2)
	        # Returns '<temp dir>/track_with_multiple_insertions.wav'

	        >>> insert_multiple_sections("podcast.mp3", [("sponsor.wav", 60)], 0.3, "output", "mp3")
	        # Returns 'output/podcast_with_multiple_insertions.mp3'

	    Note:
	        - Sections are inserted in ascending insert_time order
	        - Each insert_time is measured against the audio as it stands when that
	          section is processed, i.e. after all earlier sections were inserted
	        - Sections are resampled to the main audio's sample rate when they differ
	    """
	    try:
	        if crossfade_duration < 0:
	            raise ValueError("Crossfade duration must be non-negative")

	        # Validate and normalise the entries before decoding any audio.
	        # Indexing (rather than 3-way unpacking) accepts both the documented
	        # 2-element form and the legacy 3-element form.
	        plan = []
	        for entry in sections:
	            if len(entry) < 2:
	                raise ValueError(
	                    "Each section must provide at least (section_path, insert_time)"
	                )
	            section_path, insert_time = entry[0], entry[1]
	            if insert_time < 0:
	                # A negative time would turn into a negative slice index below
	                # and silently cut from the END of the track.
	                raise ValueError(
	                    f"Insert time must be non-negative (got {insert_time}s for {section_path})"
	                )
	            plan.append((section_path, insert_time))
	        plan.sort(key=lambda item: item[1])

	        # Load main audio
	        main_audio, main_sr = _load_audio(audio_path, mono=False)
	        current_audio = main_audio

	        for section_path, insert_time in plan:
	            section_audio, section_sr = _load_audio(section_path, mono=False)

	            # Bring the section to the main track's sample rate
	            if section_sr != main_sr:
	                if section_audio.ndim > 1:
	                    # Audio is (samples, channels): resample each channel column
	                    # on its own, then stack the columns back together.
	                    section_audio = np.column_stack(
	                        [
	                            librosa.resample(
	                                section_audio[:, ch],
	                                orig_sr=section_sr,
	                                target_sr=main_sr,
	                            )
	                            for ch in range(section_audio.shape[1])
	                        ]
	                    )
	                else:
	                    section_audio = librosa.resample(
	                        section_audio, orig_sr=section_sr, target_sr=main_sr
	                    )

	            # The duration grows with every insertion, so recompute it each time
	            current_duration = len(current_audio) / main_sr
	            fade_start, fade_end = detect_crossfade_point(
	                insert_time, current_duration, crossfade_duration
	            )

	            # Cut the current audio around the insert point
	            current_before = current_audio[: int(fade_start * main_sr)]
	            current_after = current_audio[int(fade_end * main_sr) :]

	            # Join the section with the tail, then prepend the head
	            section_with_fade = apply_crossfade(
	                section_audio, current_after, crossfade_duration, main_sr
	            )
	            current_audio = np.concatenate([current_before, section_with_fade])

	        # Resolve the output directory
	        if output_path is None:
	            output_path = tempfile.mkdtemp(suffix="_multi_inserted")
	        else:
	            os.makedirs(output_path, exist_ok=True)

	        # The output file is named after the main input file
	        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
	        output_filename = f"{main_filename}_with_multiple_insertions.{output_format}"
	        output_file = os.path.join(output_path, output_filename)

	        sf.write(output_file, current_audio, main_sr)

	        return output_file

	    except Exception as e:
	        # Keep the original traceback reachable through __cause__.
	        raise RuntimeError(f"Error inserting multiple sections: {str(e)}") from e


	def replace_section(
	    audio_path: str,
	    start_time: float,
	    end_time: float,
	    replacement_path: str,
	    crossfade_duration: float = 0.1,
	    output_path: Optional[str] = None,
	    output_format: str = "wav",
	) -> str:
	    """
	    Replace a time range of an audio track with another audio segment.

	    The samples between ``start_time`` and ``end_time`` are removed from the
	    main audio. The replacement is cut to at most ``end_time - start_time``
	    seconds, faded in at its start and faded out at its end, and placed in
	    the gap. A replacement shorter than the removed range is used as is, so
	    the output gets shorter in that case.

	    Args:
	        audio_path: Path to the main audio file
	        start_time: Start time of section to replace (in seconds, non-negative)
	        end_time: End time of section to replace (in seconds, greater than start_time)
	        replacement_path: Path to the replacement audio segment
	        crossfade_duration: Length of the fade-in/fade-out in seconds (default: 0.1);
	            0 disables the fades, values longer than the replacement are shortened to it
	        output_path: Optional output directory (default: None, a fresh temp directory is created)
	        output_format: File extension of the output file (default: 'wav')

	    Returns:
	        Path to the written audio file

	    Raises:
	        RuntimeError: On any failure (invalid argument, unreadable input,
	            channel mismatch, write error). The original exception is chained
	            as ``__cause__``.

	    Examples:
	        >>> replace_section("song.wav", 60.0, 90.0, "new_verse.wav", 0.2, "output", "wav")
	        # Returns 'output/song_replaced.wav' with the 60-90s range replaced

	        >>> replace_section("podcast.mp3", 120.0, 150.0, "correction.wav", 0.3, "output", "mp3")
	        # Returns 'output/podcast_replaced.mp3' with the 120-150s range replaced

	    Note:
	        - Start time must be less than end time
	        - The replacement is resampled to the main audio's sample rate when they differ
	        - A mono replacement is copied to every channel of a multi-channel main track;
	          a multi-channel replacement is averaged down for a mono main track
	    """
	    try:
	        # Cheap argument checks first, so bad input fails before any decoding.
	        if start_time < 0:
	            # A negative time would become a negative slice index and
	            # silently cut from the END of the track.
	            raise ValueError("Start time must be non-negative")
	        if start_time >= end_time:
	            raise ValueError("Start time must be less than end time")
	        if crossfade_duration < 0:
	            raise ValueError("Crossfade duration must be non-negative")

	        # Load both audio files
	        main_audio, main_sr = _load_audio(audio_path, mono=False)
	        replacement_audio, replacement_sr = _load_audio(replacement_path, mono=False)

	        # Split the main track around the range being replaced
	        start_sample = int(start_time * main_sr)
	        end_sample = int(end_time * main_sr)
	        main_before = main_audio[:start_sample]
	        main_after = main_audio[end_sample:]

	        # Bring the replacement to the main track's sample rate
	        if replacement_sr != main_sr:
	            if replacement_audio.ndim > 1:
	                # Audio is (samples, channels): resample each channel column
	                # on its own, then stack the columns back together.
	                replacement_audio = np.column_stack(
	                    [
	                        librosa.resample(
	                            replacement_audio[:, ch],
	                            orig_sr=replacement_sr,
	                            target_sr=main_sr,
	                        )
	                        for ch in range(replacement_audio.shape[1])
	                    ]
	                )
	            else:
	                replacement_audio = librosa.resample(
	                    replacement_audio, orig_sr=replacement_sr, target_sr=main_sr
	                )

	        # Cut the replacement to the length of the removed range
	        # (slicing past the end simply returns the whole array).
	        replacement_samples = int((end_time - start_time) * main_sr)
	        trimmed_replacement = replacement_audio[:replacement_samples]

	        # Match the main track's channel layout so the parts can be concatenated
	        if trimmed_replacement.ndim != main_audio.ndim:
	            if main_audio.ndim == 1:
	                trimmed_replacement = trimmed_replacement.mean(axis=1)
	            else:
	                trimmed_replacement = np.repeat(
	                    trimmed_replacement[:, np.newaxis], main_audio.shape[1], axis=1
	                )
	        elif (
	            trimmed_replacement.ndim == 2
	            and trimmed_replacement.shape[1] != main_audio.shape[1]
	        ):
	            raise ValueError(
	                f"Channel count mismatch: replacement has {trimmed_replacement.shape[1]}, "
	                f"main audio has {main_audio.shape[1]}"
	            )

	        # The fade can never be longer than the replacement itself
	        fade_samples = min(int(crossfade_duration * main_sr), len(trimmed_replacement))

	        # Skip the fades entirely for a zero-length fade: `[-0:]` would select
	        # the WHOLE array and the multiplication with an empty ramp would fail.
	        if fade_samples > 0:
	            fade_in = np.linspace(0, 1, fade_samples)
	            fade_out = np.linspace(1, 0, fade_samples)
	            if trimmed_replacement.ndim > 1:
	                # Broadcast the ramps across channels
	                fade_in = fade_in[:, np.newaxis]
	                fade_out = fade_out[:, np.newaxis]
	            # Work on a private copy so the fades never touch another buffer
	            trimmed_replacement = trimmed_replacement.copy()
	            trimmed_replacement[:fade_samples] *= fade_in
	            trimmed_replacement[-fade_samples:] *= fade_out

	        # Combine all parts
	        final_audio = np.concatenate([main_before, trimmed_replacement, main_after])

	        # Resolve the output directory
	        if output_path is None:
	            output_path = tempfile.mkdtemp(suffix="_replaced")
	        else:
	            os.makedirs(output_path, exist_ok=True)

	        # The output file is named after the main input file
	        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
	        output_filename = f"{main_filename}_replaced.{output_format}"
	        output_file = os.path.join(output_path, output_filename)

	        sf.write(output_file, final_audio, main_sr)

	        return output_file

	    except Exception as e:
	        # Keep the original traceback reachable through __cause__.
	        raise RuntimeError(f"Error replacing audio section: {str(e)}") from e


	def insert_section_wrapper(
	    audio_path: str,
	    insert_path: str,
	    insert_time: float,
	    crossfade_duration: float = 0.1,
	    output_format: str = "wav",
	) -> str:
	    """
	    Call ``insert_section`` and report failures as text instead of raising.

	    Intended for MCP integration: the output always goes to a temp
	    directory (``output_path=None``) and any exception is turned into an
	    ``"Error: ..."`` string.

	    Args:
	        audio_path: Path to the main audio file
	        insert_path: Path to the audio section to insert
	        insert_time: Time to insert the section (in seconds)
	        crossfade_duration: Length of crossfade in seconds (default: 0.1)
	        output_format: Output format ('wav' or 'mp3', default: 'wav')

	    Returns:
	        Path to output file or error message
	    """
	    call_args = {
	        "audio_path": audio_path,
	        "section_path": insert_path,
	        "insert_time": insert_time,
	        "crossfade_duration": crossfade_duration,
	        "output_path": None,
	        "output_format": output_format,
	    }
	    try:
	        produced_file = insert_section(**call_args)
	    except Exception as e:
	        # Never let an exception escape to the caller; hand back a message.
	        return f"Error: {str(e)}"
	    return produced_file


	def replace_section_wrapper(
	    audio_path: str,
	    start_time: float,
	    end_time: float,
	    replacement_path: str,
	    crossfade_duration: float = 0.1,
	    output_format: str = "wav",
	) -> str:
	    """
	    Call ``replace_section`` and report failures as text instead of raising.

	    Intended for MCP integration: the output always goes to a temp
	    directory (``output_path=None``) and any exception is turned into an
	    ``"Error: ..."`` string.

	    Args:
	        audio_path: Path to the main audio file
	        start_time: Start time of section to replace (in seconds)
	        end_time: End time of section to replace (in seconds)
	        replacement_path: Path to the replacement audio segment
	        crossfade_duration: Length of crossfade in seconds (default: 0.1)
	        output_format: Output format ('wav' or 'mp3', default: 'wav')

	    Returns:
	        Path to output file or error message
	    """
	    call_args = {
	        "audio_path": audio_path,
	        "start_time": start_time,
	        "end_time": end_time,
	        "replacement_path": replacement_path,
	        "crossfade_duration": crossfade_duration,
	        "output_path": None,
	        "output_format": output_format,
	    }
	    try:
	        produced_file = replace_section(**call_args)
	    except Exception as e:
	        # Never let an exception escape to the caller; hand back a message.
	        return f"Error: {str(e)}"
	    return produced_file


	if __name__ == "__main__":
	    # Command-line entry point for running audio insertion/replacement locally.
	    #
	    # Usage:
	    #   python tools/audio_insertion.py insert main.wav insert.wav 30.0
	    #   python tools/audio_insertion.py replace main.wav 10.0 20.0 replacement.wav
	    import argparse
	    import sys

	    parser = argparse.ArgumentParser(
	        description="Insert or replace audio sections",
	        # Raw formatter keeps the line breaks of the epilog below.
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	        epilog="""
	Examples:
	# Insert section at 30 seconds
	python tools/audio_insertion.py insert main.wav insert.wav 30.0

	# Replace section from 10s to 20s
	python tools/audio_insertion.py replace main.wav 10.0 20.0 replacement.wav

	# With custom crossfade
	python tools/audio_insertion.py insert main.wav insert.wav 30.0 --crossfade 0.2
	""",
	    )

	    subparsers = parser.add_subparsers(dest="command", help="Command to run")

	    # Insert command
	    insert_parser = subparsers.add_parser("insert", help="Insert audio section")
	    insert_parser.add_argument("main", help="Main audio file")
	    insert_parser.add_argument("insert", help="Audio section to insert")
	    insert_parser.add_argument("time", type=float, help="Insert time in seconds")
	    insert_parser.add_argument(
	        "--crossfade",
	        type=float,
	        default=0.1,
	        help="Crossfade duration in seconds (default: 0.1)",
	    )
	    insert_parser.add_argument(
	        "--format",
	        choices=["wav", "mp3"],
	        default="wav",
	        help="Output format (default: wav)",
	    )

	    # Replace command
	    replace_parser = subparsers.add_parser("replace", help="Replace audio section")
	    replace_parser.add_argument("main", help="Main audio file")
	    replace_parser.add_argument("start", type=float, help="Start time in seconds")
	    replace_parser.add_argument("end", type=float, help="End time in seconds")
	    replace_parser.add_argument("replacement", help="Replacement audio section")
	    replace_parser.add_argument(
	        "--crossfade",
	        type=float,
	        default=0.1,
	        help="Crossfade duration in seconds (default: 0.1)",
	    )
	    replace_parser.add_argument(
	        "--format",
	        choices=["wav", "mp3"],
	        default="wav",
	        help="Output format (default: wav)",
	    )

	    args = parser.parse_args()

	    # No sub-command given: show the help text and signal failure.
	    if not args.command:
	        parser.print_help()
	        sys.exit(1)

	    print("Audio Insertion Tool")
	    print("=" * 25)

	    # Noun used in the success message. Appending "ion" to the command name
	    # only works for "insert" ("replace" would become "replaceion").
	    operation_names = {"insert": "insertion", "replace": "replacement"}

	    try:
	        result = None

	        if args.command == "insert":
	            print(f"Main audio: {args.main}")
	            print(f"Insert section: {args.insert}")
	            print(f"Insert time: {args.time}s")
	            print(f"Crossfade: {args.crossfade}s")
	            print()

	            result = insert_section_wrapper(
	                audio_path=args.main,
	                insert_path=args.insert,
	                insert_time=args.time,
	                crossfade_duration=args.crossfade,
	                output_format=args.format,
	            )

	        elif args.command == "replace":
	            print(f"Main audio: {args.main}")
	            print(f"Replace section: {args.start}s - {args.end}s")
	            print(f"Replacement: {args.replacement}")
	            print(f"Crossfade: {args.crossfade}s")
	            print()

	            result = replace_section_wrapper(
	                audio_path=args.main,
	                start_time=args.start,
	                end_time=args.end,
	                replacement_path=args.replacement,
	                crossfade_duration=args.crossfade,
	                output_format=args.format,
	            )

	        # The wrappers return either an output path or an "Error: ..." string.
	        if result is None:
	            print("❌ No command executed")
	            sys.exit(1)
	        elif result.startswith("Error:"):
	            print(f"❌ {result}")
	            sys.exit(1)
	        else:
	            print(f"✅ Audio {operation_names[args.command]} completed!")
	            print(f"Output saved to: {result}")

	    except Exception as e:
	        print(f"❌ Error: {e}")
	        sys.exit(1)