# File size: 3,787 Bytes
#
# 97e3499

"""
Phase 8: Studio-Quality Audio Upsampling

Uses AudioSR to upsample XTTS-v2 output from 24kHz to 48kHz,
reconstructing high-frequency detail that the base model cannot produce.

The XTTS-v2 model outputs 24kHz audio. Changing the internal sample rate
would require retraining the entire model (~370M params) from scratch.
Instead, we use a neural upsampler as a post-processing step — the same
approach used in professional TTS production pipelines.

Usage:
    conda activate new-arabic-tts
    python scripts/upsample.py --input outputs/finetuned_model_test.wav

Output:
    Upsampled WAV (48kHz, 24-bit PCM) saved alongside the input with a _48kHz
    suffix, plus a JSON report written to docs/benchmarks/upsampling.json.

Usage for all outputs:
    python scripts/upsample.py --all
"""

import os
import sys
import argparse
import time
import json
import numpy as np
import soundfile as sf
from pathlib import Path

# Project root, taken as two directory levels above this file's resolved path
# (the usage examples run the script as scripts/upsample.py). Used to locate
# the outputs/ and docs/benchmarks/ directories.
PROJECT_ROOT = Path(__file__).resolve().parent.parent


# AudioSR models already built in this process, keyed by device string, so a
# batch run does not reload the checkpoint for every file.
_AUDIOSR_MODELS = {}


def _get_audiosr_model(device):
    """Return the AudioSR "speech" model for *device*, building it at most once."""
    import audiosr

    if device not in _AUDIOSR_MODELS:
        print("  Loading AudioSR model...")
        _AUDIOSR_MODELS[device] = audiosr.build_model(model_name="speech", device=device)
    return _AUDIOSR_MODELS[device]


def upsample_audiosr(input_path, output_path, *, device="cuda", seed=42,
                     guidance_scale=3.5, ddim_steps=50):
    """Upsample one audio file with AudioSR and save it as 48kHz / 24-bit PCM.

    Args:
        input_path: Path (str or Path) of the audio file to upsample.
        output_path: Path (str or Path) the upsampled WAV is written to.
        device: Device string handed to ``audiosr.build_model``.
        seed: Seed handed to ``audiosr.super_resolution``.
        guidance_scale: Guidance scale handed to ``audiosr.super_resolution``.
        ddim_steps: Number of DDIM steps handed to ``audiosr.super_resolution``.

    Returns:
        A dict describing the run, with keys ``input``, ``output``,
        ``input_sr``, ``output_sr``, ``bit_depth``, ``duration_s`` and
        ``processing_time_s``. ``input_sr`` is the sample rate read from the
        input file's header, or ``None`` if soundfile could not read it.
    """
    import audiosr

    # Report the input's real sample rate rather than assuming 24kHz: --input
    # accepts arbitrary files. Best-effort only, so a header that soundfile
    # cannot parse does not abort the upsampling itself.
    try:
        input_sr = sf.info(str(input_path)).samplerate
    except RuntimeError:
        input_sr = None

    model = _get_audiosr_model(device)

    print(f"  Upsampling: {input_path}")
    t0 = time.time()
    waveform = audiosr.super_resolution(
        model,
        str(input_path),
        seed=seed,
        guidance_scale=guidance_scale,
        ddim_steps=ddim_steps,
    )
    elapsed = time.time() - t0

    # AudioSR returns [batch, channels, samples] at 48kHz; keep the first
    # channel of the first item. The result may be a tensor or already an array.
    wav = waveform[0, 0]
    if hasattr(wav, "cpu"):
        wav = wav.cpu().numpy()
    target_sr = 48000

    sf.write(str(output_path), wav, target_sr, subtype="PCM_24")
    duration = len(wav) / target_sr
    print(f"  Saved: {output_path}")
    print(f"  Duration: {duration:.2f}s, Sample rate: {target_sr}Hz, Bit depth: 24-bit")
    print(f"  Processing time: {elapsed:.1f}s")

    return {
        "input": str(input_path),
        "output": str(output_path),
        "input_sr": input_sr,
        "output_sr": target_sr,
        "bit_depth": 24,
        "duration_s": round(duration, 2),
        "processing_time_s": round(elapsed, 1),
    }


def main():
    """Command-line entry point: upsample the requested file(s) and write a report.

    Parses ``--input <file>`` / ``--all`` (``--all`` takes precedence when both
    are given), upsamples every input that exists next to its source with a
    ``_48kHz`` stem suffix, and saves the per-file results to
    ``docs/benchmarks/upsampling.json`` under the project root.

    Raises:
        SystemExit: If neither option is given (via ``parser.error``), or with
            status 1 if none of the requested input files exist. In the latter
            case any existing report is left untouched.
    """
    parser = argparse.ArgumentParser(description="Audio Upsampling (24kHz → 48kHz)")
    parser.add_argument("--input", type=str, help="Input WAV file to upsample")
    parser.add_argument("--all", action="store_true", help="Upsample all official output files")
    args = parser.parse_args()

    if not args.input and not args.all:
        parser.error("Provide --input <file> or --all")

    print("=" * 70)
    print("  Phase 8: Studio-Quality Audio Upsampling")
    print("  24kHz / 16-bit → 48kHz / 24-bit")
    print("=" * 70)

    if args.all:
        files = [
            PROJECT_ROOT / "outputs" / "base_model_test.wav",
            PROJECT_ROOT / "outputs" / "finetuned_model_test.wav",
        ]
    else:
        files = [Path(args.input)]

    results = []
    total = len(files)
    # enumerate gives each file's position directly; looking it up with
    # list.index is a linear scan and reports the wrong slot for duplicates.
    for position, input_path in enumerate(files, start=1):
        if not input_path.exists():
            print(f"\n  WARNING: {input_path} not found, skipping")
            continue

        output_path = input_path.with_stem(input_path.stem + "_48kHz")
        print(f"\n  [{position}/{total}] {input_path.name}")
        results.append(upsample_audiosr(input_path, output_path))

    # Nothing was processed: do not overwrite an earlier report with an empty
    # one or claim success; signal the failure through the exit status.
    if not results:
        print("\n  ERROR: no input files found; nothing upsampled, report not written")
        sys.exit(1)

    # Save report
    benchmarks_dir = PROJECT_ROOT / "docs" / "benchmarks"
    benchmarks_dir.mkdir(parents=True, exist_ok=True)
    report_path = benchmarks_dir / "upsampling.json"
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump({"date": time.strftime("%Y-%m-%d"), "results": results}, f, indent=2)

    print(f"\n{'='*70}")
    print("  Upsampling Complete!")
    print(f"  Report: {report_path}")
    print(f"{'='*70}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()