embedding-voices / enhance.py
knowrohit07's picture
first commit
dfb7d74 verified
"""Enhance audio files using AI-Coustics API."""
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "librosa",
# "requests",
# "soundfile",
# "tqdm",
# ]
# ///
import json
import os
import time
from pathlib import Path
from typing import List, Optional
import librosa
import requests
import soundfile as sf
import tqdm
class AiCousticsEnhancer:
    """Client for AI-Coustics audio enhancement API

    Covers the upload -> poll -> download workflow against the ``/medias``
    endpoints below ``base_url``. Every request sends the API key in the
    ``X-API-Key`` header and is bounded by ``request_timeout``.
    """

    def __init__(self, api_key: str, request_timeout: float = 60.0):
        """
        Initialize the AI-Coustics API client

        Args:
            api_key: Your AI-Coustics API key
            request_timeout: Seconds handed to ``requests`` as ``timeout`` for
                every HTTP call, so an unresponsive server cannot block the
                caller indefinitely
        """
        self.api_key = api_key
        self.base_url = "https://api.ai-coustics.io/v2"
        self.headers = {"X-API-Key": self.api_key}
        self.request_timeout = request_timeout

    def upload_audio(
        self,
        file_path: str,
        enhancement_level: int = 100,
        enhancement_model: str = "LARK_V2",
        loudness_target: int = -19,
        true_peak: int = -1,
        transcode: str = "WAV",
    ) -> dict:
        """
        Upload an audio file for enhancement

        Args:
            file_path: Path to the audio file
            enhancement_level: Enhancement strength (0-100)
            enhancement_model: Model to use (LARK_V2 or FINCH_V2)
            loudness_target: Target loudness in LUFS
            true_peak: True peak level in dBFS
            transcode: Output format (WAV, MP3, etc.)

        Returns:
            Response dictionary with uid and metadata

        Raises:
            OSError: If ``file_path`` cannot be opened
            requests.HTTPError: If the API answers with an error status
        """
        url = f"{self.base_url}/medias"
        # Prepare the enhancement parameters
        media_enhancement = {
            "loudness_target": loudness_target,
            "true_peak": true_peak,
            "enhancement_level": enhancement_level,
            "enhancement_model": enhancement_model,
            "transcode": transcode,
        }
        # The settings travel as a JSON string in a form field next to the
        # multipart file part; the file must stay open while the request runs.
        with open(file_path, "rb") as f:
            response = requests.post(
                url,
                headers=self.headers,
                files={"file": f},
                data={"media_enhancement": json.dumps(media_enhancement)},
                timeout=self.request_timeout,
            )
        response.raise_for_status()
        return response.json()

    def check_status(self, uid: str) -> dict:
        """
        Check the processing status of an uploaded media file

        Args:
            uid: Unique identifier returned from upload

        Returns:
            Metadata dictionary with current status

        Raises:
            requests.HTTPError: If the API answers with an error status
        """
        url = f"{self.base_url}/medias/{uid}/metadata"
        response = requests.get(
            url, headers=self.headers, timeout=self.request_timeout
        )
        response.raise_for_status()
        return response.json()

    def wait_for_completion(
        self, uid: str, poll_interval: int = 2, timeout: int = 300
    ) -> dict:
        """
        Poll the API until processing is complete

        Args:
            uid: Unique identifier returned from upload
            poll_interval: Seconds between status checks
            timeout: Maximum seconds to wait

        Returns:
            Final metadata dictionary

        Raises:
            RuntimeError: If the reported status is ``FAILED``
            TimeoutError: If ``COMPLETED`` is not reported within ``timeout``
        """
        # A monotonic clock keeps the deadline immune to wall-clock changes.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            metadata = self.check_status(uid)
            status = metadata.get("enhancement_status")
            print(f"Status: {status}")
            if status == "COMPLETED":
                return metadata
            if status == "FAILED":
                raise RuntimeError(f"Enhancement failed: {metadata}")
            time.sleep(poll_interval)
        raise TimeoutError(f"Processing did not complete within {timeout} seconds")

    def download_enhanced(self, uid: str, output_path: str):
        """
        Download the enhanced audio file

        The payload is streamed into ``<output_path>.part`` and moved onto
        ``output_path`` only after the download finished, so a failed or
        interrupted transfer never leaves a truncated file at ``output_path``.

        Args:
            uid: Unique identifier returned from upload
            output_path: Path where to save the enhanced file

        Raises:
            requests.HTTPError: If the API answers with an error status
            OSError: If the file cannot be written
        """
        url = f"{self.base_url}/medias/{uid}/file"
        partial_path = f"{output_path}.part"
        try:
            # The context manager releases the streamed connection even when
            # writing fails midway.
            with requests.get(
                url,
                headers=self.headers,
                stream=True,
                timeout=self.request_timeout,
            ) as response:
                response.raise_for_status()
                # Save the file
                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            os.replace(partial_path, output_path)
        finally:
            # Best-effort cleanup of a leftover partial file; after a
            # successful replace there is nothing left to remove.
            try:
                os.remove(partial_path)
            except OSError:
                pass
        print(f"Enhanced audio saved to: {output_path}")
def get_enhanced_output_path(input_path: str) -> str:
    """Return ``input_path`` with ``_enhanced`` appended to the file stem.

    The directory part and the final extension are kept unchanged.
    """
    source = Path(input_path)
    enhanced_name = f"{source.stem}_enhanced{source.suffix}"
    return str(source.with_name(enhanced_name))
def enhance_audio_files(
    input_paths: List[str],
    api_key: str,
    enhancement_level: int = 90,
    enhancement_model: str = "LARK_V2",
) -> List[str]:
    """
    Process a list of audio files through AI-Coustics API

    Inputs that do not exist, or whose enhanced output already exists, are
    skipped. A failure while processing one file is printed and does not stop
    the remaining files.

    Args:
        input_paths: List of paths to audio files
        api_key: Your AI-Coustics API key
        enhancement_level: Enhancement strength (0-100)
        enhancement_model: Model to use (LARK_V2 or FINCH_V2)

    Returns:
        List of paths to enhanced audio files
    """
    # Initialize client
    client = AiCousticsEnhancer(api_key)
    enhanced_files = []
    # Pre-filter so the progress bar only counts files that still need work.
    valid_input_paths = [
        p
        for p in input_paths
        if os.path.exists(p) and not os.path.exists(get_enhanced_output_path(p))
    ]
    for input_path in tqdm.tqdm(valid_input_paths):
        # Double-check file existence: the file may have vanished since the
        # pre-filter ran.
        if not os.path.exists(input_path):
            print(f"Error: File not found: {input_path}")
            continue
        # Generate output path with '_enhanced' appended to the file name
        output_path = get_enhanced_output_path(input_path)
        if os.path.exists(output_path):
            print(f"Found {output_path}, skipping")
            continue
        # A bare file name has an empty dirname, and os.makedirs("") raises
        # FileNotFoundError, so only create a directory when there is one.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        try:
            # Upload for enhancement
            print("Uploading to AI-Coustics...")
            result = client.upload_audio(
                input_path,
                enhancement_level=enhancement_level,
                enhancement_model=enhancement_model,
            )
            uid = result["uid"]
            print(f"Uploaded successfully. UID: {uid}")
            # Wait for processing
            print("Waiting for enhancement to complete...")
            client.wait_for_completion(uid)
            print("Enhancement completed!")
            # Download enhanced audio
            client.download_enhanced(uid, output_path)
            enhanced_files.append(output_path)
        except Exception as e:
            # Batch boundary: report the failure and move on to the next file.
            print(f"Error processing file {input_path}: {e}")
    print(f"\n{'=' * 50}")
    print(
        f"Processing complete! Enhanced {len(enhanced_files)}/{len(input_paths)} files"
    )
    print(f"{'=' * 50}")
    return enhanced_files
# Example usage
if __name__ == "__main__":
    # Set your API key (get it from https://developer.ai-coustics.com/)
    API_KEY = os.environ.get("AICOUSTICS_API_KEY")
    if not API_KEY:
        # Exit with a readable message instead of a bare KeyError traceback.
        raise SystemExit(
            "Set the AICOUSTICS_API_KEY environment variable to your AI-Coustics API key"
        )
    input_files = librosa.util.find_files(Path(__file__).parent / "voice-donations")
    # Skip outputs of earlier runs. Only the file stem is checked, so a parent
    # directory whose name contains "_enhanced" does not exclude every input.
    input_files = [f for f in input_files if not Path(f).stem.endswith("_enhanced")]
    # Process audio files
    enhanced_files = enhance_audio_files(
        input_paths=input_files,
        api_key=API_KEY,
        enhancement_level=100,
        enhancement_model="LARK_V2",  # or "FINCH_V2" for voice isolation
    )
    print("\nEnhanced files:")
    for file in enhanced_files:
        print(f" - {file}")
    # Read and verify the enhanced audio with soundfile
    if enhanced_files:
        print("\nReading first enhanced file with soundfile:")
        audio_data, sample_rate = sf.read(enhanced_files[0])
        print(f" Shape: {audio_data.shape}")
        print(f" Sample rate: {sample_rate} Hz")
        print(f" Duration: {len(audio_data) / sample_rate:.2f} seconds")