Moeeldouma
/

arabic-tts-xtts-v2

speech-synthesis

egyptian-arabic

Model card Files Files and versions

arabic-tts-xtts-v2 / scripts /upsample.py

Moeeldouma's picture

Add all scripts with prosody improvements

97e3499 verified 19 days ago

history blame contribute delete

3.79 kB

	"""
	Phase 8: Studio-Quality Audio Upsampling

	Uses AudioSR to upsample XTTS-v2 output from 24kHz to 48kHz,
	reconstructing high-frequency detail that the base model cannot produce.

	The XTTS-v2 model outputs 24kHz audio. Changing the internal sample rate
	would require retraining the entire model (~370M params) from scratch.
	Instead, we use a neural upsampler as a post-processing step — the same
	approach used in professional TTS production pipelines.

	Usage:
	    conda activate new-arabic-tts
	    python scripts/upsample.py --input outputs/finetuned_model_test.wav

	Output:
	    Upsampled WAV saved alongside the input with a _48kHz suffix.

	Usage for all outputs:
	    python scripts/upsample.py --all
	"""

	import os
	import sys
	import argparse
	import time
	import json
	import numpy as np
	import soundfile as sf
	from pathlib import Path

	# Repository root: this script lives in <root>/scripts/, so the root is the
	# parent of the directory that contains this file.
	_SCRIPTS_DIR = Path(__file__).resolve().parent
	PROJECT_ROOT = _SCRIPTS_DIR.parent


	# Loaded AudioSR models keyed by device, so that processing several files in
	# one run (e.g. --all) builds the model only once.
	_AUDIOSR_MODELS = {}


	def _get_audiosr_model(device):
	    """Return the AudioSR "speech" model for *device*, loading it on first use."""
	    import audiosr

	    if device not in _AUDIOSR_MODELS:
	        print(" Loading AudioSR model...")
	        _AUDIOSR_MODELS[device] = audiosr.build_model(model_name="speech", device=device)
	    return _AUDIOSR_MODELS[device]


	def upsample_audiosr(input_path, output_path, device=None):
	    """Upsample one audio file to 48 kHz / 24-bit with the AudioSR neural upsampler.

	    Args:
	        input_path: Path (str or Path) of the audio file to upsample.
	        output_path: Path (str or Path) where the 48 kHz WAV is written.
	        device: Device passed to ``audiosr.build_model``. When ``None``
	            (default), "cuda" is used if available, otherwise "cpu".

	    Returns:
	        A dict describing the run with keys ``input``, ``output``,
	        ``input_sr`` (sample rate read from the input file), ``output_sr``,
	        ``bit_depth``, ``duration_s`` and ``processing_time_s``.
	    """
	    import audiosr

	    if device is None:
	        import torch

	        # Fall back to CPU instead of failing on machines without CUDA.
	        if torch.cuda.is_available():
	            device = "cuda"
	        else:
	            device = "cpu"
	            print(" WARNING: CUDA not available, running AudioSR on CPU (slow)")

	    # Read the real input format up front: it fails fast on an unreadable
	    # file and lets the report state the actual sample rate instead of
	    # assuming 24 kHz.
	    info = sf.info(str(input_path))

	    model = _get_audiosr_model(device)

	    print(f" Upsampling: {input_path}")
	    t0 = time.time()
	    waveform = audiosr.super_resolution(
	        model,
	        str(input_path),
	        seed=42,
	        guidance_scale=3.5,
	        ddim_steps=50,
	    )
	    elapsed = time.time() - t0

	    # AudioSR returns [batch, channels, samples] at 48kHz
	    if hasattr(waveform, "cpu"):
	        wav = waveform[0, 0].detach().cpu().numpy()
	    else:
	        wav = np.asarray(waveform[0, 0])
	    target_sr = 48000

	    # NOTE(review): AudioSR appears to pad its input up to a fixed window
	    # length, which would leave trailing padding in the output - confirm
	    # against the installed version. Trimming to the input's duration is a
	    # no-op when the output is not longer than expected.
	    expected_samples = int(round(info.frames * target_sr / info.samplerate))
	    if 0 < expected_samples < len(wav):
	        wav = wav[:expected_samples]

	    sf.write(str(output_path), wav, target_sr, subtype="PCM_24")
	    duration = len(wav) / target_sr
	    print(f" Saved: {output_path}")
	    print(f" Duration: {duration:.2f}s, Sample rate: {target_sr}Hz, Bit depth: 24-bit")
	    print(f" Processing time: {elapsed:.1f}s")

	    return {
	        "input": str(input_path),
	        "output": str(output_path),
	        "input_sr": info.samplerate,
	        "output_sr": target_sr,
	        "bit_depth": 24,
	        "duration_s": round(duration, 2),
	        "processing_time_s": round(elapsed, 1),
	    }


	def main():
	    """Command-line entry point: upsample the requested files and write a report.

	    With ``--all`` the two official outputs under ``<project>/outputs`` are
	    processed; otherwise the single file given by ``--input``. Each result
	    is written next to its input with a ``_48kHz`` suffix, and a JSON
	    summary is saved to ``<project>/docs/benchmarks/upsampling.json``.

	    Missing inputs are skipped with a warning. If no file could be
	    processed at all, the existing report is left untouched and the process
	    exits with status 1.
	    """
	    parser = argparse.ArgumentParser(description="Audio Upsampling (24kHz → 48kHz)")
	    parser.add_argument("--input", type=str, help="Input WAV file to upsample")
	    parser.add_argument("--all", action="store_true", help="Upsample all official output files")
	    args = parser.parse_args()

	    if not args.input and not args.all:
	        parser.error("Provide --input <file> or --all")

	    print("=" * 70)
	    print(" Phase 8: Studio-Quality Audio Upsampling")
	    print(" 24kHz / 16-bit → 48kHz / 24-bit")
	    print("=" * 70)

	    if args.all:
	        files = [
	            PROJECT_ROOT / "outputs" / "base_model_test.wav",
	            PROJECT_ROOT / "outputs" / "finetuned_model_test.wav",
	        ]
	    else:
	        files = [Path(args.input)]

	    results = []
	    # enumerate gives the position directly (no list.index lookup, and it
	    # stays correct even if the same path appears twice).
	    for position, input_path in enumerate(files, start=1):
	        if not input_path.exists():
	            print(f"\n WARNING: {input_path} not found, skipping")
	            continue

	        # Same result as Path.with_stem, but also works before Python 3.9.
	        output_path = input_path.with_name(f"{input_path.stem}_48kHz{input_path.suffix}")
	        print(f"\n [{position}/{len(files)}] {input_path.name}")
	        results.append(upsample_audiosr(input_path, output_path))

	    if not results:
	        # Nothing was upsampled: do not overwrite an existing report with an
	        # empty one, and do not report success.
	        print("\n ERROR: no input files were found; nothing was upsampled", file=sys.stderr)
	        sys.exit(1)

	    # Save report
	    benchmarks_dir = PROJECT_ROOT / "docs" / "benchmarks"
	    benchmarks_dir.mkdir(parents=True, exist_ok=True)
	    report_path = benchmarks_dir / "upsampling.json"
	    with open(report_path, "w", encoding="utf-8") as f:
	        json.dump({"date": time.strftime("%Y-%m-%d"), "results": results}, f, indent=2)

	    print(f"\n{'='*70}")
	    print(" Upsampling Complete!")
	    print(f" Report: {report_path}")
	    print(f"{'='*70}")


	# Run the CLI only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()