| """ |
| Baseline Arabic TTS test using unmodified XTTS-v2. |
| |
| Generates speech from Arabic text using a built-in speaker embedding. |
| This script produces the baseline outputs for comparison against |
| improved versions. |
| |
| Usage: |
| conda activate new-arabic-tts |
| python scripts/baseline_test.py |
| """ |
|
|
| import os |
| import time |
| import json |
| import numpy as np |
| import torch |
| import soundfile as sf |
| from TTS.tts.configs.xtts_config import XttsConfig |
| from TTS.tts.models.xtts import Xtts |
|
|
| |
# --- Filesystem layout -------------------------------------------------------
# This script lives one directory below the project root, so the root is the
# parent of the directory that contains this file.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.dirname(_SCRIPT_DIR)
MODEL_DIR = os.path.join(PROJECT_ROOT, "models", "base")
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "outputs", "original model")
BENCHMARK_DIR = os.path.join(PROJECT_ROOT, "docs", "benchmarks")

# --- Synthesis settings ------------------------------------------------------
SPEAKER_NAME = "Gilberto Mathias"  # key looked up in the speaker file
SAMPLE_RATE = 24000  # Hz; used for stats, pause length and the written file
SENTENCE_PAUSE = 0.35  # seconds of silence inserted between sentences

# Keyword arguments forwarded verbatim to the model's inference call.
GENERATION_PARAMS = dict(
    temperature=0.3,
    top_p=0.7,
    repetition_penalty=10.0,
)

# --- Benchmark text ----------------------------------------------------------
# Ten Arabic sentences synthesized one by one, in this order.
TEST_SENTENCES = [
    "الذكاء الاصطناعي يتطور بسرعة كبيرة، ويدخل في كل مجالات الحياة.",
    "الآلات أصبحت قادرة على التعلم، واتخاذ قرارات معقدة بمفردها.",
    "أكثر من 70% من الشركات الكبرى تستخدم الذكاء الاصطناعي اليوم.",
    "هذه التقنية تساعد الأطباء على تشخيص الأمراض بدقة أعلى.",
    "كما تساعد المعلمين على تقديم تعليم مخصص لكل طالب.",
    "لكن كثيراً من الناس يخشون أن تحل الآلات محل الإنسان في العمل.",
    "الخبراء يرون أن الذكاء الاصطناعي سيخلق وظائف جديدة، لم نعرفها بعد.",
    "التحدي الأكبر هو ضمان استخدام هذه التقنية بشكل أخلاقي وعادل.",
    "الدول الكبرى تتسابق على قيادة هذا المجال وتطويره.",
    "مستقبل البشرية سيتشكل بناءً على كيفية تعاملنا مع هذه التقنية.",
]
|
|
|
|
def measure_audio_stats(wav, sr):
    """Compute basic level statistics for an audio signal.

    Args:
        wav: Array-like of audio samples. It is converted to a flat float64
            array, so lists and integer PCM arrays are accepted as well.
        sr: Sample rate in Hz; determines the duration and the 25 ms frame
            size used for the noise-floor estimate.

    Returns:
        A dict with:
            duration_s: Signal length in seconds, rounded to 2 decimals.
            rms: Root-mean-square level over the whole signal (6 decimals).
            peak: Largest absolute sample value (6 decimals).
            snr_db: Rough SNR estimate in dB (1 decimal): overall RMS
                relative to the mean RMS of the quietest 10% of 25 ms
                frames. 0.0 for empty or completely silent input.
    """
    # float64 keeps the squaring below from overflowing on integer input.
    samples = np.asarray(wav, dtype=np.float64).ravel()
    if samples.size == 0:
        # Nothing to measure; np.max would raise on an empty array.
        return {"duration_s": 0.0, "rms": 0.0, "peak": 0.0, "snr_db": 0.0}

    duration = samples.size / sr
    rms = np.sqrt(np.mean(samples**2))
    peak = np.max(np.abs(samples))

    # Split into non-overlapping 25 ms frames, keeping EVERY full frame
    # (the trailing partial frame, if any, is ignored).
    frame_size = max(1, int(sr * 0.025))
    n_frames = samples.size // frame_size
    if n_frames:
        frames = samples[: n_frames * frame_size].reshape(n_frames, frame_size)
        frame_rms = np.sort(np.sqrt(np.mean(frames**2, axis=1)))
    else:
        # Signal shorter than one frame: treat the whole signal as one frame
        # so the estimate stays finite instead of becoming NaN.
        frame_rms = np.array([rms])

    # Noise floor = average level of the quietest 10% of frames (at least one).
    quietest = frame_rms[: max(1, frame_rms.size // 10)]
    noise_floor = np.mean(quietest)

    # Clamp both terms so silence yields 0 dB rather than -inf/NaN, which
    # would not survive JSON serialization.
    floor = 1e-10
    snr = 20 * np.log10(max(rms, floor) / max(noise_floor, floor))

    return {
        "duration_s": round(duration, 2),
        "rms": round(float(rms), 6),
        "peak": round(float(peak), 6),
        "snr_db": round(float(snr), 1),
    }
|
|
|
|
def main():
    """Run the XTTS-v2 Arabic baseline and write audio plus benchmark JSON.

    Steps:
        1. Load the base model from MODEL_DIR (on CUDA when available,
           otherwise on CPU).
        2. Look up the conditioning latents for SPEAKER_NAME in
           ``speakers_xtts.pth``.
        3. Synthesize every entry of TEST_SENTENCES, timing each call and
           collecting per-sentence statistics.
        4. Concatenate the sentences with SENTENCE_PAUSE seconds of silence
           in between, write ``AIOriginal.wav`` to OUTPUT_DIR and
           ``baseline.json`` to BENCHMARK_DIR, and print a summary.

    Raises:
        KeyError: If SPEAKER_NAME is not present in the speaker file.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    os.makedirs(BENCHMARK_DIR, exist_ok=True)

    def rtf(gen_seconds, audio_seconds):
        # Real-time factor; 0.0 when no audio was produced, so an empty
        # generation does not abort the whole run with ZeroDivisionError.
        if audio_seconds > 0:
            return round(gen_seconds / audio_seconds, 3)
        return 0.0

    banner = "=" * 60
    print(banner)
    print("XTTS-v2 Arabic Baseline Test")
    print(banner)

    # Fall back to CPU instead of crashing on machines without a GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device.type != "cuda":
        print("\nWARNING: CUDA is not available; running on CPU "
              "(timings will not be comparable to GPU runs).")

    # ---- 1. Model -----------------------------------------------------------
    print("\n[1/4] Loading XTTS-v2 base model...")
    t0 = time.perf_counter()  # monotonic clock; unaffected by wall-clock jumps
    config = XttsConfig()
    config.load_json(os.path.join(MODEL_DIR, "config.json"))
    model = Xtts.init_from_config(config)
    model.load_checkpoint(config, checkpoint_dir=MODEL_DIR)
    model.to(device)
    model.eval()
    load_time = time.perf_counter() - t0
    print(f" Model loaded in {load_time:.1f}s")

    # ---- 2. Speaker ---------------------------------------------------------
    print(f"\n[2/4] Loading speaker: {SPEAKER_NAME}")
    speakers_path = os.path.join(MODEL_DIR, "speakers_xtts.pth")
    # NOTE(security): weights_only=False unpickles arbitrary Python objects.
    # Only load speaker files that come from a trusted source.
    speakers = torch.load(speakers_path, map_location=device, weights_only=False)
    try:
        speaker_data = speakers[SPEAKER_NAME]
    except KeyError:
        known = ", ".join(sorted(map(str, speakers))[:10])
        raise KeyError(
            f"Speaker {SPEAKER_NAME!r} not found in {speakers_path}; "
            f"available speakers include: {known}"
        ) from None
    gpt_cond_latent = speaker_data["gpt_cond_latent"].to(device)
    speaker_embedding = speaker_data["speaker_embedding"].to(device)

    # ---- 3. Synthesis -------------------------------------------------------
    print(f"\n[3/4] Generating {len(TEST_SENTENCES)} sentences...")
    pause = np.zeros(int(SAMPLE_RATE * SENTENCE_PAUSE))
    all_wav = []
    per_sentence_stats = []
    last_index = len(TEST_SENTENCES) - 1

    for i, text in enumerate(TEST_SENTENCES):
        t0 = time.perf_counter()
        out = model.inference(
            text=text,
            language="ar",
            gpt_cond_latent=gpt_cond_latent,
            speaker_embedding=speaker_embedding,
            **GENERATION_PARAMS,
        )
        gen_time = time.perf_counter() - t0
        wav = out["wav"]

        stats = measure_audio_stats(wav, SAMPLE_RATE)
        stats["generation_time_s"] = round(gen_time, 2)
        stats["rtf"] = rtf(gen_time, stats["duration_s"])
        stats["text"] = text
        stats["text_length"] = len(text)
        per_sentence_stats.append(stats)

        all_wav.append(wav)
        if i < last_index:
            # Silence goes between sentences only, not after the last one.
            all_wav.append(pause)

        print(f" [{i+1:2d}/{len(TEST_SENTENCES)}] {stats['duration_s']:5.2f}s "
              f"(RTF={stats['rtf']:.3f}, SNR={stats['snr_db']:.1f}dB) "
              f"| {text[:40]}...")

    # ---- Write the concatenated audio --------------------------------------
    final_wav = np.concatenate(all_wav)
    output_path = os.path.join(OUTPUT_DIR, "AIOriginal.wav")
    sf.write(output_path, final_wav, SAMPLE_RATE)

    # ---- Overall statistics (pauses included in the duration) --------------
    total_stats = measure_audio_stats(final_wav, SAMPLE_RATE)
    total_gen_time = sum(s["generation_time_s"] for s in per_sentence_stats)
    total_stats["total_generation_time_s"] = round(total_gen_time, 2)
    total_stats["rtf"] = rtf(total_gen_time, total_stats["duration_s"])
    total_stats["num_sentences"] = len(TEST_SENTENCES)
    total_stats["speaker"] = SPEAKER_NAME
    total_stats["generation_params"] = GENERATION_PARAMS
    total_stats["sentence_pause_s"] = SENTENCE_PAUSE
    total_stats["model"] = "XTTS-v2 base (unmodified)"

    # ---- Benchmark JSON -----------------------------------------------------
    benchmark = {
        "metadata": {
            "phase": "Phase 1: Baseline",
            "date": time.strftime("%Y-%m-%d"),
            "model": "XTTS-v2 base",
            "speaker": SPEAKER_NAME,
            "params": GENERATION_PARAMS,
        },
        "overall": total_stats,
        "per_sentence": per_sentence_stats,
    }
    benchmark_path = os.path.join(BENCHMARK_DIR, "baseline.json")
    with open(benchmark_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps the Arabic sentences readable in the file.
        json.dump(benchmark, f, ensure_ascii=False, indent=2)

    # ---- 4. Summary ---------------------------------------------------------
    print("\n[4/4] Results")
    print(f" Output: {output_path}")
    print(f" Benchmark: {benchmark_path}")
    print(f" Duration: {total_stats['duration_s']}s")
    print(f" Gen Time: {total_gen_time:.1f}s")
    print(f" RTF: {total_stats['rtf']}")
    print(f" SNR: {total_stats['snr_db']} dB")
    print(banner)
|
|
|
|
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|