sinhala-tts / scripts /evaluate_channels.py
outlawmold's picture
v3: Fix macOS SSL certificate error - add certifi + ssl bypass
695aea0 verified
#!/usr/bin/env python3
"""
=============================================================
Sinhala TTS - YouTube Channel Audio Quality Evaluator v3
=============================================================
Run this on your LOCAL MACHINE.
Requirements:
pip install -U yt-dlp librosa soundfile numpy scipy certifi
ffmpeg must also be installed and on PATH (yt-dlp uses it to convert the audio to WAV)
Usage:
python evaluate_channels.py
=============================================================
"""
import os
import sys
import json
import ssl
import numpy as np
import warnings
warnings.filterwarnings("ignore")
# Fix macOS SSL certificate issue.
# Preferred path: point OpenSSL-based clients at certifi's CA bundle so that
# HTTPS certificates are still verified.
try:
    import certifi
except ImportError:
    # True fallback only: without certifi there is no CA bundle to point at,
    # so certificate verification is switched off for the default HTTPS
    # context of this process.  This is insecure (it allows
    # man-in-the-middle attacks); `pip install certifi` avoids it.
    try:
        ssl._create_default_https_context = ssl._create_unverified_context
    except AttributeError:
        # Private ssl hooks are missing on this interpreter; nothing to patch.
        pass
else:
    os.environ['SSL_CERT_FILE'] = certifi.where()
    os.environ['REQUESTS_CA_BUNDLE'] = certifi.where()
# ============================================================
# CONFIGURATION
# ============================================================
# Directory (relative to the current working directory) that receives the
# downloaded audio and the JSON report.
OUTPUT_DIR = "tts_channel_eval"

# Number of videos sampled from each channel.
N_VIDEOS_PER_CHANNEL = 4

# Channels to evaluate.  Each key is a short identifier for the channel;
# "url" is the channel's video listing and "label" is the display name used
# in the printed reports.
CHANNELS = {
    "sunchare": {
        "url": "https://www.youtube.com/@sunchare/videos",
        "label": "NU1's VLOG (Unlimited History)",
    },
    "Raamuwa": {
        "url": "https://www.youtube.com/@Raamuwa/videos",
        "label": "Raamuwa",
    },
}
# ============================================================
# STEP 1: Download samples using yt-dlp Python API
# ============================================================
def _select_videos(entries, n_videos):
    """Choose up to ``n_videos`` videos from flat yt-dlp playlist entries.

    Videos between 3 and 40 minutes long, or with no reported duration, are
    preferred.  If none qualify but some were rejected for their length, the
    first ``n_videos`` entries that have an id are taken regardless.

    Args:
        entries: list of entry dicts; the keys read are 'id', 'title' and
            'duration' (seconds).
        n_videos: maximum number of videos to return.

    Returns:
        A list of ``(video_id, title, duration_seconds)`` tuples.
    """
    if n_videos <= 0:
        return []

    # Normalise once: drop entries without an id and replace missing/None
    # titles and durations so later slicing and arithmetic cannot fail.
    candidates = [
        (e.get('id'), e.get('title') or '?', e.get('duration') or 0)
        for e in entries
        if e.get('id')
    ]

    selected = []
    any_skipped = False
    for vid_id, title, dur in candidates:
        if dur == 0 or 180 <= dur <= 2400:
            selected.append((vid_id, title, dur))
            print(f" + {title[:55]:55s} ({dur / 60:.0f}min)")
            if len(selected) >= n_videos:
                break
        else:
            any_skipped = True

    if not selected and any_skipped:
        print(f" No videos in 3-40min range. Taking first {n_videos} anyway...")
        selected = candidates[:n_videos]
    return selected


def download_samples(channel_key, channel_info, n_videos=N_VIDEOS_PER_CHANNEL):
    """Download up to ``n_videos`` videos from a channel as WAV audio.

    Uses the yt-dlp Python API: the channel listing is fetched first (three
    times ``n_videos`` entries, to leave room for duration filtering), then
    each selected video is downloaded and converted by ffmpeg with
    ``-ac 1 -ar 22050``.

    Args:
        channel_key: identifier used as the sub-directory name below
            ``OUTPUT_DIR``.
        channel_info: dict with the keys 'url' and 'label'.
        n_videos: number of videos to sample.

    Returns:
        Sorted list of paths to the WAV files of the selected videos that
        exist on disk after the run.  WAV files of other videos left in the
        directory by earlier runs are not returned.  Empty when the listing
        could not be fetched or nothing was selected.
    """
    if n_videos <= 0:
        return []

    import yt_dlp

    out_dir = os.path.join(OUTPUT_DIR, channel_key)
    os.makedirs(out_dir, exist_ok=True)
    print(f"\n{'='*60}")
    print(f"Downloading from: {channel_info['label']}")
    print(f"URL: {channel_info['url']}")
    print(f"{'='*60}")

    # Step 1: Extract video list from channel
    print("\n [1/2] Fetching video list...")
    list_opts = {
        'quiet': True,
        'no_warnings': True,
        'extract_flat': 'in_playlist',
        'playlist_items': f'1-{n_videos * 3}',
        'nocheckcertificate': True,
    }
    entries = []
    try:
        with yt_dlp.YoutubeDL(list_opts) as ydl:
            info = ydl.extract_info(channel_info["url"], download=False)
        if info:
            channel_title = info.get('channel', info.get('uploader', channel_key))
            entries = [e for e in (info.get('entries') or []) if e is not None]
            print(f" Channel: {channel_title}")
            print(f" Found {len(entries)} videos")
    except Exception as e:
        # Best-effort boundary: report the problem and let the caller move on
        # to the next channel instead of aborting the whole evaluation.
        print(f" Error fetching video list: {e}")

    if not entries:
        print(" No entries found.")
        print(" Try: pip install -U yt-dlp certifi")
        return []

    # Select videos (prefer 3-40 min)
    selected = _select_videos(entries, n_videos)
    if not selected:
        print(" No downloadable videos found!")
        return []

    # Step 2: Download each video as WAV
    print(f"\n [2/2] Downloading {len(selected)} videos as WAV...")
    base_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
        }],
        'postprocessor_args': {
            'ffmpeg': ['-ac', '1', '-ar', '22050'],
        },
        'quiet': True,
        'no_warnings': True,
        'nocheckcertificate': True,
    }
    wav_paths = []
    for index, (vid_id, title, _dur) in enumerate(selected, start=1):
        url = f"https://www.youtube.com/watch?v={vid_id}"
        dl_opts = dict(base_opts, outtmpl=os.path.join(out_dir, f"{vid_id}.%(ext)s"))
        print(f"\n [{index}/{len(selected)}] {title[:50]}...")
        try:
            with yt_dlp.YoutubeDL(dl_opts) as ydl:
                ydl.download([url])
        except Exception as e:
            print(f" Failed: {str(e)[:100]}")
        else:
            print(" Done")
        # Only report files that belong to this run's selection; a WAV kept
        # from an earlier run of the same video still counts.
        wav_path = os.path.join(out_dir, f"{vid_id}.wav")
        if os.path.isfile(wav_path):
            wav_paths.append(wav_path)

    print(f"\n Downloaded {len(wav_paths)} WAV files to {out_dir}/")
    return sorted(wav_paths)
# ============================================================
# STEP 2: Audio Quality Analysis
# ============================================================
def _grade_tts_quality(snr, pitch_std, speech_ratio, flatness):
    """Fold the individual metrics into a 0-8 score and a textual grade.

    ``pitch_std`` may be None (no voiced frames found); it then earns no
    points.  Returns ``(score, grade)``.
    """
    score = 0
    if snr >= 25:
        score += 3
    elif snr >= 15:
        score += 2
    elif snr >= 10:
        score += 1
    if pitch_std is not None:
        if pitch_std < 50:
            score += 2
        elif pitch_std < 80:
            score += 1
    if speech_ratio > 0.6:
        score += 2
    elif speech_ratio > 0.4:
        score += 1
    if flatness > 0.01:
        score += 1
    if score >= 7:
        grade = "Excellent"
    elif score >= 5:
        grade = "Good"
    elif score >= 3:
        grade = "Fair"
    else:
        grade = "Poor"
    return score, grade


def analyze_audio(wav_path, silence_top_db=40.0):
    """Analyze a single WAV file for TTS training suitability.

    The file is loaded as mono 22.05 kHz audio and several heuristic metrics
    are printed and collected: an SNR estimate, mean spectral flatness, pitch
    statistics from the first five minutes, the share of non-silent frames,
    and a combined 0-8 score.

    Args:
        wav_path: path of the audio file to analyze.
        silence_top_db: a frame counts as speech when its RMS energy is less
            than this many dB below the loudest frame (the same criterion
            ``librosa.effects.split`` applies with ``ref=np.max``).

    Returns:
        A dict of JSON-serialisable metrics, or None when the file is missing,
        cannot be decoded, or holds no samples.
    """
    fname = os.path.basename(wav_path)
    print(f"\nAnalyzing: {fname}")
    if not os.path.isfile(wav_path):
        print(f" Failed to load: file not found: {wav_path}")
        return None

    # Imported only once there is a file to work on.
    import librosa

    try:
        y, sr = librosa.load(wav_path, sr=22050, mono=True)
    except Exception as e:
        print(f" Failed to load: {e}")
        return None
    if len(y) == 0:
        print(" Failed to load: file contains no audio samples")
        return None

    duration_min = len(y) / sr / 60
    print(f" Duration: {duration_min:.1f} minutes")
    results = {
        "file": fname,
        "duration_min": round(duration_min, 1),
    }

    # --- RMS Energy & SNR ---
    # The quietest 20% of frames stand in for the noise floor and the
    # remaining frames for the signal.
    rms = librosa.feature.rms(y=y, frame_length=2048, hop_length=512)[0]
    rms_threshold = np.percentile(rms, 20)
    noise_frames = rms[rms <= rms_threshold]
    speech_frames = rms[rms > rms_threshold]
    noise_level = float(np.mean(noise_frames)) if noise_frames.size else 0.0
    speech_level = float(np.mean(speech_frames)) if speech_frames.size else 0.0
    if speech_level <= 0:
        # Silent or perfectly uniform signal: nothing rises above the floor.
        # (Previously this produced NaN, or 40 dB for an all-zero file.)
        snr = 0.0
    elif noise_level <= 0:
        # Digitally silent background; cap instead of dividing by zero.
        snr = 40.0
    else:
        snr = 20 * np.log10(speech_level / (noise_level + 1e-10))
    results["snr_db"] = round(float(snr), 1)
    snr_label = "excellent" if snr >= 25 else "acceptable" if snr >= 15 else "poor"
    print(f" SNR: {snr:.1f} dB ({snr_label})")

    # --- Spectral Flatness (music vs speech) ---
    flatness = librosa.feature.spectral_flatness(y=y)[0]
    mean_flat = float(np.mean(flatness))
    results["spectral_flatness"] = round(mean_flat, 4)
    music_risk = "low" if mean_flat > 0.02 else "medium" if mean_flat > 0.005 else "high"
    results["music_risk"] = music_risk
    print(f" Music risk: {music_risk} (flatness={mean_flat:.4f})")

    # --- Pitch Analysis (first 5 min for speed) ---
    y_short = y[:sr * 300]
    print(f" Running pitch analysis (first {min(duration_min, 5):.0f} min)...")
    f0, _, _ = librosa.pyin(y_short, fmin=50, fmax=500, sr=sr)
    f0_voiced = f0[~np.isnan(f0)]
    if len(f0_voiced) > 0:
        pitch_mean = float(np.mean(f0_voiced))
        pitch_std = float(np.std(f0_voiced))
        voiced_ratio = len(f0_voiced) / len(f0)
        results["pitch_mean_hz"] = round(pitch_mean, 1)
        results["pitch_std_hz"] = round(pitch_std, 1)
        results["voiced_ratio"] = round(voiced_ratio, 3)
        if pitch_std > 80:
            results["speaker_assessment"] = "likely_multi_speaker"
            print(f" Speaker: LIKELY MULTI-SPEAKER (pitch std={pitch_std:.1f}Hz)")
        elif pitch_std > 60:
            results["speaker_assessment"] = "possibly_multi_speaker"
            print(f" Speaker: possibly multi-speaker (pitch std={pitch_std:.1f}Hz)")
        else:
            results["speaker_assessment"] = "single_speaker"
            print(f" Speaker: consistent single speaker (pitch std={pitch_std:.1f}Hz)")
        gender = "female" if pitch_mean > 180 else "male"
        results["gender_estimate"] = gender
        print(f" Voice: {gender} (mean pitch={pitch_mean:.0f}Hz)")
    else:
        print(" Pitch: could not extract (no voiced frames detected)")

    # --- Speech vs Silence Ratio ---
    # Measured against the loudest frame, not against the 20th percentile:
    # a percentile threshold makes the ratio ~80% for every file by
    # construction, which says nothing about the recording.
    peak = float(np.max(rms))
    if peak > 0:
        speech_mask = rms > peak * 10 ** (-silence_top_db / 20)
        speech_ratio = float(np.mean(speech_mask))
    else:
        speech_ratio = 0.0
    results["speech_pct"] = round(speech_ratio * 100, 1)
    results["speech_min"] = round(duration_min * speech_ratio, 1)
    print(f" Speech content: {speech_ratio:.0%} ({results['speech_min']:.1f} min of speech)")

    # --- Overall TTS Quality Score ---
    score, grade = _grade_tts_quality(
        snr, results.get("pitch_std_hz"), speech_ratio, mean_flat
    )
    results["tts_score"] = score
    results["grade"] = grade
    print(f" TTS Quality Score: {score}/8 ({grade})")
    return results
# ============================================================
# MAIN
# ============================================================
def _missing_packages():
    """Return the pip names of required packages that cannot be imported."""
    missing = []
    for pkg in ['yt_dlp', 'librosa', 'soundfile', 'numpy', 'scipy']:
        try:
            __import__(pkg)
        except ImportError:
            missing.append(pkg.replace('_', '-'))
    return missing


def _summarize_channel(results):
    """Aggregate the per-file result dicts of one channel.

    ``results`` must be a non-empty list of dicts as returned by
    ``analyze_audio``.  Returns a dict of totals, averages and counts used by
    both the per-channel summary and the final comparison table.
    """
    return {
        "count": len(results),
        "total_dur": sum(r["duration_min"] for r in results),
        "total_speech": sum(r.get("speech_min", 0) for r in results),
        "avg_snr": np.mean([r["snr_db"] for r in results]),
        "avg_score": np.mean([r["tts_score"] for r in results]),
        "multi_spk": sum(1 for r in results
                         if "multi" in r.get("speaker_assessment", "")),
        "single_spk": sum(1 for r in results
                          if r.get("speaker_assessment") == "single_speaker"),
        "music_high": sum(1 for r in results if r.get("music_risk") == "high"),
    }


def main():
    """Download, analyze and compare every channel listed in CHANNELS.

    Exits with status 1 when a required package is missing.  Writes the
    per-file metrics to ``OUTPUT_DIR/evaluation_results.json`` and prints a
    summary per channel followed by a comparison table.
    """
    # Check dependencies
    missing = _missing_packages()
    if missing:
        print(f"Missing packages: {', '.join(missing)}")
        print(f"Install with: pip install -U {' '.join(missing)}")
        sys.exit(1)

    import yt_dlp
    print(f"yt-dlp version: {yt_dlp.version.__version__}")
    print("Sinhala TTS - YouTube Channel Quality Evaluator v3")
    print("=" * 60)

    all_results = {}
    for channel_key, channel_info in CHANNELS.items():
        wav_files = download_samples(channel_key, channel_info)
        if not wav_files:
            print(f"\nNo files downloaded for {channel_info['label']}")
            all_results[channel_key] = []
            continue

        # analyze_audio returns None for unreadable files; keep the rest.
        channel_results = [
            res for res in map(analyze_audio, wav_files) if res
        ]
        all_results[channel_key] = channel_results
        if channel_results:
            stats = _summarize_channel(channel_results)
            print(f"\n{'='*60}")
            print(f"CHANNEL SUMMARY: {channel_info['label']}")
            print(f"{'='*60}")
            print(f" Videos analyzed: {stats['count']}")
            print(f" Total duration: {stats['total_dur']:.1f} min")
            print(f" Usable speech: {stats['total_speech']:.1f} min")
            print(f" Avg SNR: {stats['avg_snr']:.1f} dB")
            print(f" Avg TTS Score: {stats['avg_score']:.1f}/8")
            print(f" Multi-speaker risk: {stats['multi_spk']}/{stats['count']} videos")
            print(f" High music risk: {stats['music_high']}/{stats['count']} videos")

    # Save detailed results
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    results_path = os.path.join(OUTPUT_DIR, "evaluation_results.json")
    # ensure_ascii=False can emit non-ASCII text, so the encoding is pinned
    # instead of depending on the platform's locale default.
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)

    # ============================================================
    # FINAL COMPARISON
    # ============================================================
    print(f"\n\n{'='*60}")
    print("FINAL COMPARISON")
    print(f"{'='*60}")
    print(f"{'Channel':<35} {'Score':>8} {'SNR':>8} {'Speech':>10} {'Speaker':>15} {'Music':>10}")
    print(f"{'-'*35} {'-'*8} {'-'*8} {'-'*10} {'-'*15} {'-'*10}")
    for channel_key, results in all_results.items():
        label = CHANNELS[channel_key]['label']
        if not results:
            print(f"{label:<35} {'No data':>8}")
            continue
        stats = _summarize_channel(results)
        half = stats["count"] / 2
        spk_label = "single" if stats["single_spk"] >= half else "mixed"
        if stats["music_high"] == 0:
            music_label = "low"
        elif stats["music_high"] < half:
            music_label = "some"
        else:
            music_label = "heavy"
        print(f"{label:<35} {stats['avg_score']:>5.1f}/8 {stats['avg_snr']:>6.1f}dB {stats['total_speech']:>7.1f}min {spk_label:>15} {music_label:>10}")

    print(f"\nResults saved to: {results_path}")
    print(f"\nDone! Paste the output above (or {results_path}) back to the assistant.")


if __name__ == "__main__":
    main()