| """ |
| Phase 8: Studio-Quality Audio Upsampling |
| |
| Uses AudioSR to upsample XTTS-v2 output from 24kHz to 48kHz, |
| reconstructing high-frequency detail that the base model cannot produce. |
| |
| The XTTS-v2 model outputs 24kHz audio. Changing the internal sample rate |
| would require retraining the entire model (~370M params) from scratch. |
| Instead, we use a neural upsampler as a post-processing step — the same |
| approach used in professional TTS production pipelines. |
| |
| Usage: |
| conda activate new-arabic-tts |
| python scripts/upsample.py --input outputs/finetuned_model_test.wav |
| |
| Output: |
| Upsampled WAV saved alongside the input with _48kHz suffix. |
| |
| Usage for all outputs: |
| python scripts/upsample.py --all |
| """ |
|
|
| import os |
| import sys |
| import argparse |
| import time |
| import json |
| import numpy as np |
| import soundfile as sf |
| from pathlib import Path |
|
|
| PROJECT_ROOT = Path(__file__).resolve().parent.parent |
|
|
|
|
| def upsample_audiosr(input_path, output_path): |
| """Upsample using AudioSR neural upsampler.""" |
| import audiosr |
|
|
| print(f" Loading AudioSR model...") |
| model = audiosr.build_model(model_name="speech", device="cuda") |
|
|
| print(f" Upsampling: {input_path}") |
| t0 = time.time() |
| waveform = audiosr.super_resolution( |
| model, |
| str(input_path), |
| seed=42, |
| guidance_scale=3.5, |
| ddim_steps=50, |
| ) |
| elapsed = time.time() - t0 |
|
|
| |
| if hasattr(waveform, 'cpu'): |
| wav = waveform[0, 0].cpu().numpy() |
| else: |
| wav = waveform[0, 0] |
| target_sr = 48000 |
|
|
| sf.write(str(output_path), wav, target_sr, subtype="PCM_24") |
| duration = len(wav) / target_sr |
| print(f" Saved: {output_path}") |
| print(f" Duration: {duration:.2f}s, Sample rate: {target_sr}Hz, Bit depth: 24-bit") |
| print(f" Processing time: {elapsed:.1f}s") |
|
|
| return { |
| "input": str(input_path), |
| "output": str(output_path), |
| "input_sr": 24000, |
| "output_sr": target_sr, |
| "bit_depth": 24, |
| "duration_s": round(duration, 2), |
| "processing_time_s": round(elapsed, 1), |
| } |
|
|
|
|
| def main(): |
| parser = argparse.ArgumentParser(description="Audio Upsampling (24kHz → 48kHz)") |
| parser.add_argument("--input", type=str, help="Input WAV file to upsample") |
| parser.add_argument("--all", action="store_true", help="Upsample all official output files") |
| args = parser.parse_args() |
|
|
| if not args.input and not args.all: |
| parser.error("Provide --input <file> or --all") |
|
|
| print("=" * 70) |
| print(" Phase 8: Studio-Quality Audio Upsampling") |
| print(" 24kHz / 16-bit → 48kHz / 24-bit") |
| print("=" * 70) |
|
|
| results = [] |
|
|
| if args.all: |
| files = [ |
| PROJECT_ROOT / "outputs" / "base_model_test.wav", |
| PROJECT_ROOT / "outputs" / "finetuned_model_test.wav", |
| ] |
| else: |
| files = [Path(args.input)] |
|
|
| for input_path in files: |
| if not input_path.exists(): |
| print(f"\n WARNING: {input_path} not found, skipping") |
| continue |
|
|
| output_path = input_path.with_stem(input_path.stem + "_48kHz") |
| print(f"\n [{files.index(input_path)+1}/{len(files)}] {input_path.name}") |
| result = upsample_audiosr(input_path, output_path) |
| results.append(result) |
|
|
| |
| benchmarks_dir = PROJECT_ROOT / "docs" / "benchmarks" |
| benchmarks_dir.mkdir(parents=True, exist_ok=True) |
| report_path = benchmarks_dir / "upsampling.json" |
| with open(report_path, "w", encoding="utf-8") as f: |
| json.dump({"date": time.strftime("%Y-%m-%d"), "results": results}, f, indent=2) |
|
|
| print(f"\n{'='*70}") |
| print(f" Upsampling Complete!") |
| print(f" Report: {report_path}") |
| print(f"{'='*70}") |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|