Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 22,476 Bytes
a5d8e64 f62bfdb a5d8e64 f62bfdb a5d8e64 f62bfdb a5d8e64 f62bfdb a5d8e64 f62bfdb a5d8e64 f62bfdb a5d8e64 f62bfdb a5d8e64 f62bfdb a5d8e64 f62bfdb a5d8e64 f62bfdb a5d8e64 f62bfdb a5d8e64 5dc3b1e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 |
import os
import tempfile
from typing import Optional
import librosa
import numpy as np
import soundfile as sf
def _load_audio(audio_path: str, mono: bool = False) -> tuple[np.ndarray, int]:
    """Load an audio file at its native sample rate.

    Args:
        audio_path: Location of the audio file; passed unchanged to
            ``librosa.load``.
        mono: When True, ask librosa to down-mix to a single channel.

    Returns:
        ``(samples, sample_rate)``. Single-channel audio is returned 1-D.
        Multi-channel audio is returned as ``(samples, channels)`` so that
        ``len(samples)`` is the number of frames regardless of channel count.
    """
    # sr=None keeps the file's own sample rate instead of resampling.
    y, sr = librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")
    # librosa hands back multi-channel audio channel-first; flip it for every
    # channel count (not only stereo) so downstream slicing along axis 0
    # always slices time.
    if y.ndim > 1:
        y = y.T
    return y, int(sr)
def detect_crossfade_point(
    insert_position: float, audio_duration: float, crossfade_duration: float = 0.1
) -> tuple[float, float]:
    """Return the time window of a crossfade centred on an insertion point.

    The window spans half of ``crossfade_duration`` on each side of
    ``insert_position`` and is clipped so that it never starts before 0 or
    ends after ``audio_duration``.

    Args:
        insert_position: Insertion point, in seconds.
        audio_duration: Total length of the target audio, in seconds.
        crossfade_duration: Full width of the crossfade window, in seconds.

    Returns:
        ``(start_time, end_time)`` of the clipped window, in seconds.
    """
    half_window = crossfade_duration / 2
    window_start = max(0, insert_position - half_window)
    window_end = min(audio_duration, insert_position + half_window)
    return (window_start, window_end)
def apply_crossfade(
    section: np.ndarray, target: np.ndarray, crossfade_duration: float, sample_rate: int
) -> np.ndarray:
    """Join ``section`` onto the front of ``target`` with a linear crossfade.

    The last N samples of ``section`` fade out while the first N samples of
    ``target`` fade in, and the two are summed (overlap-add). N is
    ``int(crossfade_duration * sample_rate)``, reduced if either input is
    shorter than that. The start of ``section`` is left untouched, and neither
    input array is modified.

    Args:
        section: Audio to place first; 1-D, or 2-D with time along axis 0.
        target: Audio that follows the section; same layout as ``section``.
        crossfade_duration: Length of the overlap, in seconds. Zero or
            negative values give a plain concatenation.
        sample_rate: Samples per second, used to convert the duration to a
            sample count.

    Returns:
        A new array of length ``len(section) + len(target) - N``.

    Raises:
        ValueError: If the two inputs do not share the same channel layout.
    """
    section = np.asarray(section)
    target = np.asarray(target)
    if section.ndim != target.ndim or section.shape[1:] != target.shape[1:]:
        raise ValueError(
            f"Channel layout mismatch: section has shape {section.shape}, "
            f"target has shape {target.shape}"
        )

    # The overlap can never be longer than either piece of audio.
    overlap_len = int(crossfade_duration * sample_rate)
    overlap_len = max(0, min(overlap_len, len(section), len(target)))
    if overlap_len == 0:
        return np.concatenate([section, target])

    # Blend in the inputs' own float precision; fall back to float64 for
    # non-float input so the fractional gains are not truncated.
    out_dtype = np.result_type(section.dtype, target.dtype)
    if not np.issubdtype(out_dtype, np.floating):
        out_dtype = np.dtype(np.float64)

    fade_in = np.linspace(0.0, 1.0, overlap_len, dtype=out_dtype)
    if section.ndim > 1:
        # Column vector so the same gain applies to every channel.
        fade_in = fade_in.reshape((overlap_len,) + (1,) * (section.ndim - 1))
    fade_out = 1.0 - fade_in

    blended = section[-overlap_len:] * fade_out + target[:overlap_len] * fade_in
    joined = np.concatenate([section[:-overlap_len], blended, target[overlap_len:]])
    return joined.astype(out_dtype, copy=False)
def insert_section(
    audio_path: str,
    section_path: str,
    insert_time: float,
    crossfade_duration: float = 0.1,
    output_path: Optional[str] = None,
    output_format: str = "wav",
) -> str:
    """
    Insert one audio file into another at a given time position.

    Both files are loaded with ``_load_audio``. The section is resampled to
    the main file's sample rate when the rates differ. The main audio is split
    around a window computed by ``detect_crossfade_point``; everything before
    the window is kept as-is, and the section plus everything after the window
    are handed to ``apply_crossfade``, whose result is appended. Main-audio
    samples that fall inside the window are in neither slice.

    Args:
        audio_path: Path of the main audio file.
        section_path: Path of the audio file to insert.
        insert_time: Insertion position, in seconds from the start of the
            main audio. Must be between 0 and the main audio's duration.
        crossfade_duration: Width of the crossfade window in seconds
            (default: 0.1).
        output_path: Directory for the result. It is created if missing; when
            None, a new temporary directory is used.
        output_format: File extension for the result (default: 'wav'). The
            file is written with ``soundfile.write``.

    Returns:
        Path of the written file, named
        ``<main basename>_with_insertion.<output_format>``.

    Raises:
        RuntimeError: On any failure (invalid ``insert_time``, unreadable
            input, write error). The original exception is attached as
            ``__cause__``.

    Examples:
        >>> insert_section("main_track.wav", "intro.wav", 5.0, 0.2, "output", "wav")
        # Returns 'output/main_track_with_insertion.wav'
        >>> insert_section("podcast.mp3", "advertisement.mp3", 180.0, 0.5, "output", "mp3")
        # Returns 'output/podcast_with_insertion.mp3'
    """
    try:
        # Reject an obviously bad position before doing any file I/O.
        if insert_time < 0:
            raise ValueError("Insert time must be non-negative")

        # Load both audio files
        main_audio, main_sr = _load_audio(audio_path, mono=False)
        section_audio, section_sr = _load_audio(section_path, mono=False)

        # Bring the section to the main file's sample rate if needed
        if main_sr != section_sr:
            if section_audio.ndim > 1:
                # Resample each channel separately, then restore
                # (samples, channels) layout.
                section_audio = np.array(
                    [
                        librosa.resample(
                            section_audio[:, ch], orig_sr=section_sr, target_sr=main_sr
                        )
                        for ch in range(section_audio.shape[1])
                    ]
                ).T
            else:
                section_audio = librosa.resample(
                    section_audio, orig_sr=section_sr, target_sr=main_sr
                )

        main_duration = len(main_audio) / main_sr
        if insert_time > main_duration:
            raise ValueError(
                f"Insert time ({insert_time}s) exceeds main audio duration ({main_duration}s)"
            )

        # Window around the insertion point, clipped to the audio bounds
        fade_start, fade_end = detect_crossfade_point(
            insert_time, main_duration, crossfade_duration
        )

        # Split the main audio around that window
        main_before = main_audio[: int(fade_start * main_sr)]
        main_after = main_audio[int(fade_end * main_sr) :]

        # Splice the section with the remainder, then prepend the head
        result = apply_crossfade(section_audio, main_after, crossfade_duration, main_sr)
        final_audio = np.concatenate([main_before, result])

        # Resolve the output directory
        if output_path is None:
            output_path = tempfile.mkdtemp(suffix="_inserted")
        else:
            os.makedirs(output_path, exist_ok=True)

        # Name the output after the main file
        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
        output_filename = f"{main_filename}_with_insertion.{output_format}"
        output_file = os.path.join(output_path, output_filename)

        sf.write(output_file, final_audio, main_sr)
        return output_file
    except Exception as e:
        # Single error type for callers; keep the root cause attached.
        raise RuntimeError(f"Error inserting audio section: {str(e)}") from e
def insert_multiple_sections(
    audio_path: str,
    sections: list[tuple[str, float, float]],
    crossfade_duration: float = 0.1,
    output_path: Optional[str] = None,
    output_format: str = "wav",
) -> str:
    """
    Insert several audio files into a main track, one after another.

    Sections are processed in ascending order of insert time. For each one
    the current audio is split around a window from
    ``detect_crossfade_point``, the section and the remainder are passed to
    ``apply_crossfade``, and the result replaces the current audio.

    Args:
        audio_path: Path of the main audio file.
        sections: Sequence of ``(section_path, insert_time)`` entries. Only
            the first two elements of each entry are used; any further
            elements (such as the third one in the type hint) are ignored.
            section_path: Path of the audio file to insert.
            insert_time: Insertion position in seconds; must not be negative.
        crossfade_duration: Width of the crossfade window in seconds
            (default: 0.1).
        output_path: Directory for the result. It is created if missing; when
            None, a new temporary directory is used.
        output_format: File extension for the result (default: 'wav').

    Returns:
        Path of the written file, named
        ``<main basename>_with_multiple_insertions.<output_format>``.

    Raises:
        RuntimeError: On any failure; the original exception is attached as
            ``__cause__``.

    Examples:
        >>> insert_multiple_sections("track.wav", [("intro.wav", 0), ("ad1.wav", 30), ("ad2.wav", 180)], 0.2)
        # Returns '<temp dir>/track_with_multiple_insertions.wav'
        >>> insert_multiple_sections("podcast.mp3", [("sponsor.wav", 60)], 0.3, "output", "mp3")
        # Returns 'output/podcast_with_multiple_insertions.mp3'

    Note:
        Each insert time is applied to the audio as it stands after the
        previous insertions, not to the original timeline.
    """
    try:
        # Accept both 2-element and longer entries; validate before any I/O.
        plan = [(entry[0], entry[1]) for entry in sections]
        for _, insert_time in plan:
            if insert_time < 0:
                raise ValueError("Insert time must be non-negative")
        plan.sort(key=lambda item: item[1])

        # Load main audio
        main_audio, main_sr = _load_audio(audio_path, mono=False)
        main_duration = len(main_audio) / main_sr
        # No copy needed: every step below builds a new array.
        current_audio = main_audio

        for section_path, insert_time in plan:
            section_audio, section_sr = _load_audio(section_path, mono=False)

            # Bring the section to the main file's sample rate if needed
            if section_sr != main_sr:
                if section_audio.ndim > 1:
                    # Resample each channel separately, then restore
                    # (samples, channels) layout.
                    section_audio = np.array(
                        [
                            librosa.resample(
                                section_audio[:, ch],
                                orig_sr=section_sr,
                                target_sr=main_sr,
                            )
                            for ch in range(section_audio.shape[1])
                        ]
                    ).T
                else:
                    section_audio = librosa.resample(
                        section_audio, orig_sr=section_sr, target_sr=main_sr
                    )

            # Window around the insertion point, clipped to the current length
            fade_start, fade_end = detect_crossfade_point(
                insert_time, main_duration, crossfade_duration
            )

            # Split the current audio around that window
            current_before = current_audio[: int(fade_start * main_sr)]
            current_after = current_audio[int(fade_end * main_sr) :]

            section_with_fade = apply_crossfade(
                section_audio, current_after, crossfade_duration, main_sr
            )
            current_audio = np.concatenate([current_before, section_with_fade])

            # The audio grew; the next window is clipped against the new length.
            main_duration = len(current_audio) / main_sr

        # Resolve the output directory
        if output_path is None:
            output_path = tempfile.mkdtemp(suffix="_multi_inserted")
        else:
            os.makedirs(output_path, exist_ok=True)

        # Name the output after the main file
        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
        output_filename = f"{main_filename}_with_multiple_insertions.{output_format}"
        output_file = os.path.join(output_path, output_filename)

        sf.write(output_file, current_audio, main_sr)
        return output_file
    except Exception as e:
        # Single error type for callers; keep the root cause attached.
        raise RuntimeError(f"Error inserting multiple sections: {str(e)}") from e
def replace_section(
    audio_path: str,
    start_time: float,
    end_time: float,
    replacement_path: str,
    crossfade_duration: float = 0.1,
    output_path: Optional[str] = None,
    output_format: str = "wav",
) -> str:
    """
    Replace a time range of an audio track with another audio segment.

    The main audio between ``start_time`` and ``end_time`` is removed. The
    replacement is resampled to the main sample rate when needed, cut to at
    most ``end_time - start_time`` seconds, given a linear fade-in and
    fade-out, and placed between the two remaining parts of the main audio.
    A replacement shorter than the removed range is used at its own length,
    so the output can be shorter than the input.

    Args:
        audio_path: Path of the main audio file.
        start_time: Start of the range to replace, in seconds (>= 0).
        end_time: End of the range to replace, in seconds (> start_time).
        replacement_path: Path of the replacement audio file.
        crossfade_duration: Length of each fade in seconds (default: 0.1).
            Zero disables the fades. The fade is shortened when the
            replacement is shorter than this.
        output_path: Directory for the result. It is created if missing; when
            None, a new temporary directory is used.
        output_format: File extension for the result (default: 'wav').

    Returns:
        Path of the written file, named
        ``<main basename>_replaced.<output_format>``.

    Raises:
        RuntimeError: On any failure (invalid times, unreadable input, write
            error). The original exception is attached as ``__cause__``.

    Examples:
        >>> replace_section("song.wav", 60.0, 90.0, "new_verse.wav", 0.2, "output", "wav")
        # Returns 'output/song_replaced.wav' with 60-90s replaced
        >>> replace_section("podcast.mp3", 120.0, 150.0, "correction.wav", 0.3, "output", "mp3")
        # Returns 'output/podcast_replaced.mp3' with 120-150s replaced
    """
    try:
        # Validate timing before any file I/O.
        if start_time >= end_time:
            raise ValueError("Start time must be less than end time")
        if start_time < 0:
            # A negative start would become a negative slice index and count
            # from the end of the audio.
            raise ValueError("Start time must be non-negative")

        # Load both audio files
        main_audio, main_sr = _load_audio(audio_path, mono=False)
        replacement_audio, replacement_sr = _load_audio(replacement_path, mono=False)

        # Convert times to samples and keep what lies outside the range
        start_sample = int(start_time * main_sr)
        end_sample = int(end_time * main_sr)
        main_before = main_audio[:start_sample]
        main_after = main_audio[end_sample:]

        # Bring the replacement to the main file's sample rate if needed
        if replacement_sr != main_sr:
            if replacement_audio.ndim > 1:
                # Resample each channel separately, then restore
                # (samples, channels) layout.
                replacement_audio = np.array(
                    [
                        librosa.resample(
                            replacement_audio[:, ch],
                            orig_sr=replacement_sr,
                            target_sr=main_sr,
                        )
                        for ch in range(replacement_audio.shape[1])
                    ]
                ).T
            else:
                replacement_audio = librosa.resample(
                    replacement_audio, orig_sr=replacement_sr, target_sr=main_sr
                )

        # Cut the replacement to the length of the removed range; slicing is
        # a no-op when it is already shorter.
        replacement_samples = int((end_time - start_time) * main_sr)
        trimmed_replacement = replacement_audio[:replacement_samples]

        # Fade length can never exceed the replacement itself.
        fade_samples = int(crossfade_duration * main_sr)
        fade_samples = max(0, min(fade_samples, len(trimmed_replacement)))

        # Skip fading entirely for a zero-length fade: ``x[-0:]`` would select
        # the whole array rather than an empty tail.
        if fade_samples > 0:
            fade_in = np.linspace(0, 1, fade_samples)
            fade_out = np.linspace(1, 0, fade_samples)
            if trimmed_replacement.ndim > 1:
                # Same gain on every channel.
                channel_ones = np.ones(trimmed_replacement.shape[1])
                fade_in = np.outer(fade_in, channel_ones)
                fade_out = np.outer(fade_out, channel_ones)
            trimmed_replacement[:fade_samples] *= fade_in
            trimmed_replacement[-fade_samples:] *= fade_out

        # Combine all parts
        final_audio = np.concatenate([main_before, trimmed_replacement, main_after])

        # Resolve the output directory
        if output_path is None:
            output_path = tempfile.mkdtemp(suffix="_replaced")
        else:
            os.makedirs(output_path, exist_ok=True)

        # Name the output after the main file
        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
        output_filename = f"{main_filename}_replaced.{output_format}"
        output_file = os.path.join(output_path, output_filename)

        sf.write(output_file, final_audio, main_sr)
        return output_file
    except Exception as e:
        # Single error type for callers; keep the root cause attached.
        raise RuntimeError(f"Error replacing audio section: {str(e)}") from e
def insert_section_wrapper(
    audio_path: str,
    insert_path: str,
    insert_time: float,
    crossfade_duration: float = 0.1,
    output_format: str = "wav",
) -> str:
    """
    Call ``insert_section`` and report failures as text instead of raising.

    The result is always written to a temporary directory, because
    ``output_path=None`` is passed through.

    Args:
        audio_path: Path to the main audio file
        insert_path: Path to the audio section to insert
        insert_time: Time to insert the section (in seconds)
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_format: Output format ('wav' or 'mp3', default: 'wav')

    Returns:
        The output file path on success, otherwise a string starting with
        ``"Error: "`` followed by the exception text.
    """
    call_args = {
        "audio_path": audio_path,
        "section_path": insert_path,
        "insert_time": insert_time,
        "crossfade_duration": crossfade_duration,
        "output_path": None,
        "output_format": output_format,
    }
    try:
        written_path = insert_section(**call_args)
    except Exception as exc:
        return f"Error: {str(exc)}"
    return written_path
def replace_section_wrapper(
    audio_path: str,
    start_time: float,
    end_time: float,
    replacement_path: str,
    crossfade_duration: float = 0.1,
    output_format: str = "wav",
) -> str:
    """
    Call ``replace_section`` and report failures as text instead of raising.

    The result is always written to a temporary directory, because
    ``output_path=None`` is passed through.

    Args:
        audio_path: Path to the main audio file
        start_time: Start time of section to replace (in seconds)
        end_time: End time of section to replace (in seconds)
        replacement_path: Path to the replacement audio segment
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_format: Output format ('wav' or 'mp3', default: 'wav')

    Returns:
        The output file path on success, otherwise a string starting with
        ``"Error: "`` followed by the exception text.
    """
    call_args = {
        "audio_path": audio_path,
        "start_time": start_time,
        "end_time": end_time,
        "replacement_path": replacement_path,
        "crossfade_duration": crossfade_duration,
        "output_path": None,
        "output_format": output_format,
    }
    try:
        written_path = replace_section(**call_args)
    except Exception as exc:
        return f"Error: {str(exc)}"
    return written_path
if __name__ == "__main__":
    # Command-line entry point for running insertion/replacement locally.
    #
    # Usage:
    #   python tools/audio_insertion.py insert main.wav insert.wav 30.0
    #   python tools/audio_insertion.py replace main.wav 10.0 20.0 replacement.wav
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description="Insert or replace audio sections",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Insert section at 30 seconds
  python tools/audio_insertion.py insert main.wav insert.wav 30.0

  # Replace section from 10s to 20s
  python tools/audio_insertion.py replace main.wav 10.0 20.0 replacement.wav

  # With custom crossfade
  python tools/audio_insertion.py insert main.wav insert.wav 30.0 --crossfade 0.2
""",
    )
    subparsers = parser.add_subparsers(dest="command", help="Command to run")

    # Options shared by both sub-commands, declared once.
    shared_options = argparse.ArgumentParser(add_help=False)
    shared_options.add_argument(
        "--crossfade",
        type=float,
        default=0.1,
        help="Crossfade duration in seconds (default: 0.1)",
    )
    shared_options.add_argument(
        "--format",
        choices=["wav", "mp3"],
        default="wav",
        help="Output format (default: wav)",
    )

    # Insert command
    insert_parser = subparsers.add_parser(
        "insert", help="Insert audio section", parents=[shared_options]
    )
    insert_parser.add_argument("main", help="Main audio file")
    insert_parser.add_argument("insert", help="Audio section to insert")
    insert_parser.add_argument("time", type=float, help="Insert time in seconds")

    # Replace command
    replace_parser = subparsers.add_parser(
        "replace", help="Replace audio section", parents=[shared_options]
    )
    replace_parser.add_argument("main", help="Main audio file")
    replace_parser.add_argument("start", type=float, help="Start time in seconds")
    replace_parser.add_argument("end", type=float, help="End time in seconds")
    replace_parser.add_argument("replacement", help="Replacement audio section")

    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        sys.exit(1)

    print("Audio Insertion Tool")
    print("=" * 25)

    # Noun used in the success message; "<command>ion" is wrong for "replace".
    action_names = {"insert": "insertion", "replace": "replacement"}

    try:
        result = None
        if args.command == "insert":
            print(f"Main audio: {args.main}")
            print(f"Insert section: {args.insert}")
            print(f"Insert time: {args.time}s")
            print(f"Crossfade: {args.crossfade}s")
            print()
            result = insert_section_wrapper(
                audio_path=args.main,
                insert_path=args.insert,
                insert_time=args.time,
                crossfade_duration=args.crossfade,
                output_format=args.format,
            )
        elif args.command == "replace":
            print(f"Main audio: {args.main}")
            print(f"Replace section: {args.start}s - {args.end}s")
            print(f"Replacement: {args.replacement}")
            print(f"Crossfade: {args.crossfade}s")
            print()
            result = replace_section_wrapper(
                audio_path=args.main,
                start_time=args.start,
                end_time=args.end,
                replacement_path=args.replacement,
                crossfade_duration=args.crossfade,
                output_format=args.format,
            )

        # The wrappers signal failure by returning a string starting with
        # "Error:" instead of raising.
        if result is None:
            print("❌ No command executed")
            sys.exit(1)
        elif result.startswith("Error:"):
            print(f"❌ {result}")
            sys.exit(1)
        else:
            print(f"✅ Audio {action_names[args.command]} completed!")
            print(f"Output saved to: {result}")
    except Exception as e:
        print(f"❌ Error: {e}")
        sys.exit(1)
|