sinhala-tts / scripts /evaluate_channels.py

v3: Fix macOS SSL certificate error - add certifi + ssl bypass

695aea0 verified about 1 month ago

13.6 kB

	#!/usr/bin/env python3
	"""
	=============================================================
	Sinhala TTS - YouTube Channel Audio Quality Evaluator v3
	=============================================================
	Run this on your LOCAL MACHINE.

	Requirements:
	pip install -U yt-dlp librosa soundfile numpy scipy certifi
	ffmpeg must also be installed and on PATH (yt-dlp uses it to convert
	the downloaded audio to WAV).

	Usage:
	python evaluate_channels.py
	=============================================================
	"""

	import os
	import sys
	import json
	import ssl
	import numpy as np
	import warnings
	warnings.filterwarnings("ignore")

	# Fix macOS SSL certificate issue: point OpenSSL-based clients at the CA
	# bundle shipped with certifi when it is installed.
	try:
	    import certifi
	except ImportError:
	    # Real fallback only: without certifi there is no CA bundle to point at,
	    # so certificate verification is switched off for urllib-style HTTPS
	    # clients. This weakens security, so say so instead of doing it silently.
	    print(
	        "Warning: certifi is not installed; HTTPS certificate verification "
	        "is disabled. Install it with: pip install -U certifi",
	        file=sys.stderr,
	    )
	    try:
	        ssl._create_default_https_context = ssl._create_unverified_context
	    except AttributeError:
	        # These are private ssl attributes; tolerate builds that lack them.
	        pass
	else:
	    ca_bundle = certifi.where()
	    os.environ['SSL_CERT_FILE'] = ca_bundle
	    os.environ['REQUESTS_CA_BUNDLE'] = ca_bundle

	# ============================================================
	# CONFIGURATION
	# ============================================================
	# Channels to evaluate. The key doubles as the per-channel output
	# sub-directory name; "label" is the human-readable name used in reports.
	CHANNELS = {
	    "sunchare": {
	        "url": "https://www.youtube.com/@sunchare/videos",
	        "label": "NU1's VLOG (Unlimited History)",
	    },
	    "Raamuwa": {
	        "url": "https://www.youtube.com/@Raamuwa/videos",
	        "label": "Raamuwa",
	    },
	}

	# How many videos to sample from each channel.
	N_VIDEOS_PER_CHANNEL = 4
	# Root directory for downloaded audio and the JSON report.
	OUTPUT_DIR = "tts_channel_eval"


	# ============================================================
	# STEP 1: Download samples using yt-dlp Python API
	# ============================================================
	# Preferred video length window (seconds): 3 to 40 minutes.
	_MIN_DURATION_SEC = 180
	_MAX_DURATION_SEC = 2400


	def _fetch_channel_entries(channel_key, channel_url, max_items):
	    """Return up to ``max_items`` flat playlist entries for ``channel_url``.

	    Any failure while listing is printed and results in an empty list, so
	    the caller can treat "error" and "no videos" the same way.
	    """
	    import yt_dlp

	    list_opts = {
	        'quiet': True,
	        'no_warnings': True,
	        'extract_flat': 'in_playlist',
	        'playlist_items': f'1-{max_items}',
	        'nocheckcertificate': True,
	    }

	    try:
	        with yt_dlp.YoutubeDL(list_opts) as ydl:
	            info = ydl.extract_info(channel_url, download=False)
	            if not info:
	                return []
	            channel_title = info.get('channel', info.get('uploader', channel_key))
	            # Kept inside the try so an error raised while reading the
	            # entries is reported too. "or []" tolerates entries=None.
	            entries = [e for e in (info.get('entries') or []) if e is not None]
	            print(f" Channel: {channel_title}")
	            print(f" Found {len(entries)} videos")
	            return entries
	    except Exception as e:
	        print(f" Error fetching video list: {e}")
	        return []


	def _select_videos(entries, n_videos):
	    """Pick up to ``n_videos`` entries as ``(id, title, duration)`` tuples.

	    Entries with unknown duration or a duration inside the preferred
	    window are taken in listing order. If none qualify but some were
	    rejected for their length, the first ``n_videos`` entries are used.
	    """
	    selected = []
	    rejected_for_length = False

	    for entry in entries:
	        vid_id = entry.get('id', '')
	        if not vid_id:
	            continue
	        # Title may be present but None; slicing None would crash.
	        title = entry.get('title') or '?'
	        dur = entry.get('duration') or 0

	        if dur == 0 or _MIN_DURATION_SEC <= dur <= _MAX_DURATION_SEC:
	            selected.append((vid_id, title, dur))
	            print(f" + {title[:55]:55s} ({dur / 60:.0f}min)")
	            if len(selected) >= n_videos:
	                break
	        else:
	            rejected_for_length = True

	    if not selected and rejected_for_length:
	        print(f" No videos in 3-40min range. Taking first {n_videos} anyway...")
	        for entry in entries[:n_videos]:
	            vid_id = entry.get('id', '')
	            if vid_id:
	                selected.append(
	                    (vid_id, entry.get('title') or '?', entry.get('duration') or 0)
	                )

	    return selected


	def download_samples(channel_key, channel_info, n_videos=N_VIDEOS_PER_CHANNEL):
	    """Download up to ``n_videos`` videos from a channel as WAV audio.

	    Audio is converted by ffmpeg to mono, 22050 Hz WAV and written to
	    ``OUTPUT_DIR/<channel_key>/<video_id>.wav``.

	    Args:
	        channel_key: Name of the channel's output sub-directory.
	        channel_info: Mapping with the channel's ``"url"`` and ``"label"``.
	        n_videos: Number of videos to sample; three times as many listing
	            entries are fetched so the duration filter has candidates.

	    Returns:
	        Sorted paths of every ``.wav`` file in the channel's output
	        directory (files left by earlier runs are included), or an empty
	        list when nothing could be listed or selected.
	    """
	    import yt_dlp

	    out_dir = os.path.join(OUTPUT_DIR, channel_key)
	    os.makedirs(out_dir, exist_ok=True)

	    print(f"\n{'='*60}")
	    print(f"Downloading from: {channel_info['label']}")
	    print(f"URL: {channel_info['url']}")
	    print(f"{'='*60}")

	    # Step 1: Extract video list from channel
	    print("\n [1/2] Fetching video list...")
	    entries = _fetch_channel_entries(channel_key, channel_info["url"], n_videos * 3)
	    if not entries:
	        print(" No entries found.")
	        print(" Try: pip install -U yt-dlp certifi")
	        return []

	    # Select videos (prefer 3-40 min)
	    selected = _select_videos(entries, n_videos)
	    if not selected:
	        print(" No downloadable videos found!")
	        return []

	    # Step 2: Download each video as WAV
	    print(f"\n [2/2] Downloading {len(selected)} videos as WAV...")

	    for position, (vid_id, title, _dur) in enumerate(selected, start=1):
	        url = f"https://www.youtube.com/watch?v={vid_id}"
	        dl_opts = {
	            'format': 'bestaudio/best',
	            'outtmpl': os.path.join(out_dir, f"{vid_id}.%(ext)s"),
	            'postprocessors': [{
	                'key': 'FFmpegExtractAudio',
	                'preferredcodec': 'wav',
	            }],
	            'postprocessor_args': {
	                'ffmpeg': ['-ac', '1', '-ar', '22050'],
	            },
	            'quiet': True,
	            'no_warnings': True,
	            'nocheckcertificate': True,
	        }

	        print(f"\n [{position}/{len(selected)}] {title[:50]}...")
	        # One failed video must not abort the remaining downloads.
	        try:
	            with yt_dlp.YoutubeDL(dl_opts) as ydl:
	                ydl.download([url])
	            print(" Done")
	        except Exception as e:
	            print(f" Failed: {str(e)[:100]}")

	    wav_files = sorted(f for f in os.listdir(out_dir) if f.endswith('.wav'))
	    print(f"\n Downloaded {len(wav_files)} WAV files to {out_dir}/")
	    return [os.path.join(out_dir, f) for f in wav_files]


	# ============================================================
	# STEP 2: Audio Quality Analysis
	# ============================================================
	def analyze_audio(wav_path):
	    """Analyze a single WAV file for TTS training suitability.

	    The file is loaded as mono at 22050 Hz and scored with simple
	    heuristics: an energy-based SNR estimate, spectral flatness, pitch
	    statistics over the first five minutes, and a speech/silence ratio.
	    Progress and findings are printed as they are computed.

	    Args:
	        wav_path: Path of the audio file to analyze.

	    Returns:
	        A JSON-serialisable dict with ``file``, ``duration_min``,
	        ``snr_db``, ``spectral_flatness``, ``music_risk``, ``speech_pct``,
	        ``speech_min``, ``tts_score`` (0-8) and ``grade``. When voiced
	        frames were found it also has ``pitch_mean_hz``, ``pitch_std_hz``,
	        ``voiced_ratio``, ``speaker_assessment`` and ``gender_estimate``.
	        Returns ``None`` if the file cannot be loaded or holds no samples.
	    """
	    import librosa

	    fname = os.path.basename(wav_path)
	    print(f"\nAnalyzing: {fname}")

	    try:
	        y, sr = librosa.load(wav_path, sr=22050, mono=True)
	    except Exception as e:
	        print(f" Failed to load: {e}")
	        return None

	    if len(y) == 0:
	        # Nothing to measure; the percentile below would fail on no frames.
	        print(" Failed to load: file contains no audio samples")
	        return None

	    duration_min = len(y) / sr / 60
	    print(f" Duration: {duration_min:.1f} minutes")

	    results = {
	        "file": fname,
	        "duration_min": round(duration_min, 1),
	    }

	    # --- RMS Energy & SNR ---
	    # The quietest 20% of frames stand in for the noise floor.
	    rms = librosa.feature.rms(y=y, frame_length=2048, hop_length=512)[0]
	    rms_threshold = np.percentile(rms, 20)
	    is_speech = rms > rms_threshold
	    noise_frames = rms[rms <= rms_threshold]
	    speech_frames = rms[is_speech]

	    if speech_frames.size == 0:
	        # Every frame sits at the floor (e.g. silence or constant energy):
	        # nothing stands out from the noise, and averaging an empty array
	        # would give NaN, which is not valid JSON.
	        snr = 0.0
	    elif noise_frames.size > 0 and np.mean(noise_frames) > 0:
	        snr = 20 * np.log10(np.mean(speech_frames) / (np.mean(noise_frames) + 1e-10))
	    else:
	        # Digitally silent noise floor: cap the value to avoid dividing by zero.
	        snr = 40.0
	    results["snr_db"] = round(float(snr), 1)

	    snr_label = "excellent" if snr >= 25 else "acceptable" if snr >= 15 else "poor"
	    print(f" SNR: {snr:.1f} dB ({snr_label})")

	    # --- Spectral Flatness (music vs speech) ---
	    mean_flat = float(np.mean(librosa.feature.spectral_flatness(y=y)[0]))
	    results["spectral_flatness"] = round(mean_flat, 4)

	    if mean_flat > 0.02:
	        music_risk = "low"
	    elif mean_flat > 0.005:
	        music_risk = "medium"
	    else:
	        music_risk = "high"
	    results["music_risk"] = music_risk
	    print(f" Music risk: {music_risk} (flatness={mean_flat:.4f})")

	    # --- Pitch Analysis (first 5 min for speed) ---
	    y_short = y[:sr * 300]
	    print(f" Running pitch analysis (first {min(duration_min, 5):.0f} min)...")
	    f0, _, _ = librosa.pyin(y_short, fmin=50, fmax=500, sr=sr)
	    f0_voiced = f0[~np.isnan(f0)]  # unvoiced frames are NaN

	    if len(f0_voiced) > 0:
	        pitch_mean = float(np.mean(f0_voiced))
	        pitch_std = float(np.std(f0_voiced))
	        voiced_ratio = float(len(f0_voiced) / len(f0))

	        results["pitch_mean_hz"] = round(pitch_mean, 1)
	        results["pitch_std_hz"] = round(pitch_std, 1)
	        results["voiced_ratio"] = round(voiced_ratio, 3)

	        # A wide pitch spread is used as a proxy for several speakers.
	        if pitch_std > 80:
	            results["speaker_assessment"] = "likely_multi_speaker"
	            print(f" Speaker: LIKELY MULTI-SPEAKER (pitch std={pitch_std:.1f}Hz)")
	        elif pitch_std > 60:
	            results["speaker_assessment"] = "possibly_multi_speaker"
	            print(f" Speaker: possibly multi-speaker (pitch std={pitch_std:.1f}Hz)")
	        else:
	            results["speaker_assessment"] = "single_speaker"
	            print(f" Speaker: consistent single speaker (pitch std={pitch_std:.1f}Hz)")

	        gender = "female" if pitch_mean > 180 else "male"
	        results["gender_estimate"] = gender
	        print(f" Voice: {gender} (mean pitch={pitch_mean:.0f}Hz)")
	    else:
	        print(" Pitch: could not extract (no voiced frames detected)")

	    # --- Speech vs Silence Ratio ---
	    # NOTE(review): the threshold is the 20th percentile of these same
	    # frames, so this ratio can never exceed about 0.8 and says little
	    # about real pauses. Consider an absolute or peak-relative threshold.
	    speech_ratio = float(np.mean(is_speech))
	    results["speech_pct"] = round(speech_ratio * 100, 1)
	    results["speech_min"] = round(duration_min * speech_ratio, 1)
	    print(f" Speech content: {speech_ratio:.0%} ({results['speech_min']:.1f} min of speech)")

	    # --- Overall TTS Quality Score (max 3 + 2 + 2 + 1 = 8) ---
	    score = 0
	    if snr >= 25:
	        score += 3
	    elif snr >= 15:
	        score += 2
	    elif snr >= 10:
	        score += 1

	    # No pitch data counts as the worst case (no points).
	    pitch_spread = results.get("pitch_std_hz", 999)
	    if pitch_spread < 50:
	        score += 2
	    elif pitch_spread < 80:
	        score += 1

	    if speech_ratio > 0.6:
	        score += 2
	    elif speech_ratio > 0.4:
	        score += 1

	    if mean_flat > 0.01:
	        score += 1

	    results["tts_score"] = score
	    if score >= 7:
	        grade = "Excellent"
	    elif score >= 5:
	        grade = "Good"
	    elif score >= 3:
	        grade = "Fair"
	    else:
	        grade = "Poor"
	    results["grade"] = grade
	    print(f" TTS Quality Score: {score}/8 ({grade})")

	    return results


	# ============================================================
	# MAIN
	# ============================================================
	def _missing_packages():
	    """Return the pip names of required packages that cannot be imported."""
	    missing = []
	    for pkg in ['yt_dlp', 'librosa', 'soundfile', 'numpy', 'scipy']:
	        try:
	            __import__(pkg)
	        except ImportError:
	            # Import names use "_", pip names use "-" (yt_dlp -> yt-dlp).
	            missing.append(pkg.replace('_', '-'))
	    return missing


	def _print_channel_summary(label, channel_results):
	    """Print aggregate statistics for one channel's analyzed videos."""
	    total_dur = sum(r["duration_min"] for r in channel_results)
	    total_speech = sum(r.get("speech_min", 0) for r in channel_results)
	    avg_snr = np.mean([r["snr_db"] for r in channel_results])
	    avg_score = np.mean([r["tts_score"] for r in channel_results])
	    multi_spk = sum(1 for r in channel_results
	                    if "multi" in r.get("speaker_assessment", ""))
	    music_high = sum(1 for r in channel_results if r.get("music_risk") == "high")

	    print(f"\n{'='*60}")
	    print(f"CHANNEL SUMMARY: {label}")
	    print(f"{'='*60}")
	    print(f" Videos analyzed: {len(channel_results)}")
	    print(f" Total duration: {total_dur:.1f} min")
	    print(f" Usable speech: {total_speech:.1f} min")
	    print(f" Avg SNR: {avg_snr:.1f} dB")
	    print(f" Avg TTS Score: {avg_score:.1f}/8")
	    print(f" Multi-speaker risk: {multi_spk}/{len(channel_results)} videos")
	    print(f" High music risk: {music_high}/{len(channel_results)} videos")


	def _print_final_comparison(all_results):
	    """Print one table row per channel comparing the aggregated metrics."""
	    print(f"\n\n{'='*60}")
	    print("FINAL COMPARISON")
	    print(f"{'='*60}")
	    print(f"{'Channel':<35} {'Score':>8} {'SNR':>8} {'Speech':>10} {'Speaker':>15} {'Music':>10}")
	    # The repetition operators were missing here, which was a syntax error.
	    print(f"{'-'*35} {'-'*8} {'-'*8} {'-'*10} {'-'*15} {'-'*10}")

	    for channel_key, results in all_results.items():
	        label = CHANNELS[channel_key]['label']
	        if not (isinstance(results, list) and results):
	            print(f"{label:<35} {'No data':>8}")
	            continue

	        avg_score = np.mean([r["tts_score"] for r in results])
	        avg_snr = np.mean([r["snr_db"] for r in results])
	        total_speech = sum(r.get("speech_min", 0) for r in results)

	        single = sum(1 for r in results if r.get("speaker_assessment") == "single_speaker")
	        spk_label = "single" if single >= len(results) / 2 else "mixed"

	        high_music = sum(1 for r in results if r.get("music_risk") == "high")
	        if high_music == 0:
	            music_label = "low"
	        elif high_music < len(results) / 2:
	            music_label = "some"
	        else:
	            music_label = "heavy"

	        print(f"{label:<35} {avg_score:>5.1f}/8 {avg_snr:>6.1f}dB {total_speech:>7.1f}min {spk_label:>15} {music_label:>10}")


	def main():
	    """Download samples for every configured channel, analyze and report.

	    Exits with status 1 when a required package is missing. Otherwise it
	    writes ``OUTPUT_DIR/evaluation_results.json`` and prints per-channel
	    summaries followed by a comparison table.
	    """
	    # Check dependencies
	    missing = _missing_packages()
	    if missing:
	        print(f"Missing packages: {', '.join(missing)}")
	        print(f"Install with: pip install -U {' '.join(missing)}")
	        sys.exit(1)

	    import yt_dlp
	    print(f"yt-dlp version: {yt_dlp.version.__version__}")
	    print("Sinhala TTS - YouTube Channel Quality Evaluator v3")
	    print("=" * 60)

	    all_results = {}

	    for channel_key, channel_info in CHANNELS.items():
	        wav_files = download_samples(channel_key, channel_info)

	        if not wav_files:
	            print(f"\nNo files downloaded for {channel_info['label']}")
	            all_results[channel_key] = []
	            continue

	        # analyze_audio returns None for files it could not load.
	        channel_results = [res for res in map(analyze_audio, wav_files) if res]
	        all_results[channel_key] = channel_results

	        if channel_results:
	            _print_channel_summary(channel_info['label'], channel_results)

	    # Save detailed results
	    os.makedirs(OUTPUT_DIR, exist_ok=True)
	    results_path = os.path.join(OUTPUT_DIR, "evaluation_results.json")
	    # ensure_ascii=False can emit non-ASCII text, so fix the file encoding
	    # instead of relying on the platform default.
	    with open(results_path, "w", encoding="utf-8") as f:
	        json.dump(all_results, f, indent=2, ensure_ascii=False)

	    _print_final_comparison(all_results)

	    print(f"\nResults saved to: {results_path}")
	    print(f"\nDone! Paste the output above (or {results_path}) back to the assistant.")


	if __name__ == "__main__":
	    main()