#!/usr/bin/env python3
"""
=============================================================
Sinhala TTS - YouTube Channel Audio Quality Evaluator v3
=============================================================
Run this on your LOCAL MACHINE.

Requirements:
    pip install -U yt-dlp librosa soundfile numpy scipy certifi

Usage:
    python evaluate_channels.py
=============================================================
"""

import json
import os
import ssl
import sys
import warnings

import numpy as np

warnings.filterwarnings("ignore")

# Fix macOS SSL certificate issue: point the standard environment variables
# at certifi's CA bundle when certifi is installed.
try:
    import certifi
except ImportError:
    certifi = None

if certifi is not None:
    os.environ['SSL_CERT_FILE'] = certifi.where()
    os.environ['REQUESTS_CA_BUNDLE'] = certifi.where()
else:
    # NOTE(security): last-resort fallback used only when certifi is missing.
    # It disables HTTPS certificate verification for the whole process, so
    # installing certifi is strongly preferred.
    try:
        ssl._create_default_https_context = ssl._create_unverified_context
    except AttributeError:
        pass

# ============================================================
# CONFIGURATION
# ============================================================

CHANNELS = {
    "sunchare": {
        "url": "https://www.youtube.com/@sunchare/videos",
        "label": "NU1's VLOG (Unlimited History)",
    },
    "Raamuwa": {
        "url": "https://www.youtube.com/@Raamuwa/videos",
        "label": "Raamuwa",
    },
}

N_VIDEOS_PER_CHANNEL = 4
OUTPUT_DIR = "tts_channel_eval"

# Preferred video length window (seconds): 3 to 40 minutes.
MIN_DURATION_SEC = 180
MAX_DURATION_SEC = 2400

# Sample rate used both for the ffmpeg conversion and for loading the audio.
SAMPLE_RATE = 22050

# Only the first N seconds are pitch-tracked, because pYIN is slow.
PITCH_ANALYSIS_SEC = 300

# Frames at or below this RMS percentile are treated as the noise floor.
NOISE_PERCENTILE = 20

# A frame counts as active when its RMS is within this many dB of the
# 95th-percentile RMS level (heuristic; tune for your material).
SILENCE_TOP_DB = 40.0

# SNR reported when the noise floor is exactly zero but signal is present.
MAX_SNR_DB = 40.0


# ============================================================
# STEP 1: Download samples using yt-dlp Python API
# ============================================================

def fetch_channel_entries(channel_key, channel_info, n_videos):
    """Return the flat list of video entries for a channel.

    Up to ``n_videos * 3`` playlist items are requested so that the later
    duration filter has candidates to choose from. Errors are printed and
    result in an empty list (best-effort behaviour).
    """
    import yt_dlp

    list_opts = {
        'quiet': True,
        'no_warnings': True,
        'extract_flat': 'in_playlist',
        'playlist_items': f'1-{n_videos * 3}',
        # NOTE(security): skips TLS verification inside yt-dlp.
        'nocheckcertificate': True,
    }

    entries = []
    try:
        with yt_dlp.YoutubeDL(list_opts) as ydl:
            info = ydl.extract_info(channel_info["url"], download=False)
        if info:
            channel_title = info.get('channel', info.get('uploader', channel_key))
            # 'entries' may be present but None, so do not rely on the default.
            raw_entries = info.get('entries') or []
            entries = [e for e in raw_entries if e is not None]
            print(f" Channel: {channel_title}")
            print(f" Found {len(entries)} videos")
    except Exception as e:
        print(f" Error fetching video list: {e}")
    return entries


def select_videos(entries, n_videos):
    """Pick up to ``n_videos`` entries, preferring 3-40 minute videos.

    Entries with an unknown duration (missing, ``None`` or 0) are accepted.
    If nothing falls inside the preferred window but some entries were
    rejected for their length, the first ``n_videos`` entries that have an
    id are taken instead.

    Returns a list of ``(video_id, title, duration_seconds)`` tuples.
    """
    selected = []
    skipped = []
    for e in entries:
        vid_id = e.get('id') or ''
        # The key can exist with a None value; fall back to a placeholder.
        title = e.get('title') or '?'
        dur = e.get('duration') or 0
        dur_min = dur / 60 if dur else 0

        if not vid_id:
            continue

        if dur == 0 or (MIN_DURATION_SEC <= dur <= MAX_DURATION_SEC):
            selected.append((vid_id, title, dur))
            print(f" + {title[:55]:55s} ({dur_min:.0f}min)")
            if len(selected) >= n_videos:
                break
        else:
            skipped.append((title, dur_min))

    if not selected and skipped:
        print(f" No videos in 3-40min range. Taking first {n_videos} anyway...")
        for e in entries[:n_videos]:
            vid_id = e.get('id') or ''
            title = e.get('title') or '?'
            dur = e.get('duration') or 0
            if vid_id:
                selected.append((vid_id, title, dur))

    return selected


def download_samples(channel_key, channel_info, n_videos=N_VIDEOS_PER_CHANNEL):
    """Download n_videos from a channel as WAV audio using Python API.

    Args:
        channel_key: Key used as the per-channel sub-directory name.
        channel_info: Dict with the channel's ``"url"`` and ``"label"``.
        n_videos: Maximum number of videos to download.

    Returns:
        Sorted list of paths to every ``.wav`` file present in the channel's
        output directory after the download attempt (this includes files
        left over from earlier runs). Empty when nothing could be listed or
        selected.
    """
    import yt_dlp

    out_dir = os.path.join(OUTPUT_DIR, channel_key)
    os.makedirs(out_dir, exist_ok=True)

    print(f"\n{'='*60}")
    print(f"Downloading from: {channel_info['label']}")
    print(f"URL: {channel_info['url']}")
    print(f"{'='*60}")

    # Step 1: Extract video list from channel
    print(f"\n [1/2] Fetching video list...")
    entries = fetch_channel_entries(channel_key, channel_info, n_videos)

    if not entries:
        print(f" No entries found.")
        print(f" Try: pip install -U yt-dlp certifi")
        return []

    # Select videos (prefer 3-40 min)
    selected = select_videos(entries, n_videos)
    if not selected:
        print(f" No downloadable videos found!")
        return []

    # Step 2: Download each video as WAV
    print(f"\n [2/2] Downloading {len(selected)} videos as WAV...")
    for i, (vid_id, title, _dur) in enumerate(selected):
        url = f"https://www.youtube.com/watch?v={vid_id}"
        out_template = os.path.join(out_dir, f"{vid_id}.%(ext)s")

        dl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': out_template,
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'wav',
            }],
            # Convert to mono at the analysis sample rate during extraction.
            'postprocessor_args': {
                'ffmpeg': ['-ac', '1', '-ar', str(SAMPLE_RATE)],
            },
            'quiet': True,
            'no_warnings': True,
            # NOTE(security): skips TLS verification inside yt-dlp.
            'nocheckcertificate': True,
        }

        print(f"\n [{i+1}/{len(selected)}] {title[:50]}...")
        try:
            with yt_dlp.YoutubeDL(dl_opts) as ydl:
                ydl.download([url])
            print(f" Done")
        except Exception as e:
            # Best effort: one failed video must not abort the channel.
            print(f" Failed: {str(e)[:100]}")

    wav_files = sorted(f for f in os.listdir(out_dir) if f.endswith('.wav'))
    print(f"\n Downloaded {len(wav_files)} WAV files to {out_dir}/")
    return [os.path.join(out_dir, f) for f in wav_files]


# ============================================================
# STEP 2: Audio Quality Analysis
# ============================================================

def estimate_snr_db(rms):
    """Estimate a rough SNR in dB from per-frame RMS energies.

    Frames at or below the ``NOISE_PERCENTILE`` percentile are treated as
    noise, the rest as signal.

    Returns:
        0.0 when no frame rises above the noise floor (empty, silent or
        constant-energy input), ``MAX_SNR_DB`` when the noise floor is
        exactly zero, otherwise ``20 * log10(mean signal / mean noise)``.
    """
    rms = np.asarray(rms, dtype=float)
    if rms.size == 0:
        return 0.0

    threshold = np.percentile(rms, NOISE_PERCENTILE)
    noise_frames = rms[rms <= threshold]
    speech_frames = rms[rms > threshold]

    # No frame is louder than the floor: there is no measurable signal.
    # Without this guard np.mean([]) yields NaN, which ends up in the JSON.
    if speech_frames.size == 0:
        return 0.0

    noise_mean = float(np.mean(noise_frames)) if noise_frames.size else 0.0
    if noise_mean <= 0:
        return MAX_SNR_DB
    return float(20 * np.log10(np.mean(speech_frames) / (noise_mean + 1e-10)))


def estimate_speech_ratio(rms, top_db=SILENCE_TOP_DB):
    """Return the fraction of frames whose RMS counts as active audio.

    A frame is active when its RMS exceeds a level ``top_db`` decibels
    below the 95th-percentile RMS. The threshold is absolute rather than a
    percentile of the same data, because a percentile threshold always
    marks a fixed share of frames as active. The 95th percentile is used
    instead of the maximum so that one loud click does not set the level.

    Returns 0.0 for empty or completely silent input.
    """
    rms = np.asarray(rms, dtype=float)
    if rms.size == 0:
        return 0.0

    reference = float(np.percentile(rms, 95))
    if reference <= 0:
        return 0.0

    threshold = reference * 10 ** (-top_db / 20)
    return float(np.mean(rms > threshold))


def compute_tts_score(snr, pitch_std, speech_ratio, mean_flat):
    """Combine the individual metrics into a 0-8 score and a grade.

    Points: SNR up to 3, pitch stability up to 2 (none when ``pitch_std``
    is ``None``), speech ratio up to 2, spectral flatness 1.

    Returns:
        ``(score, grade)`` where grade is one of "Excellent", "Good",
        "Fair" or "Poor".
    """
    score = 0

    if snr >= 25:
        score += 3
    elif snr >= 15:
        score += 2
    elif snr >= 10:
        score += 1

    if pitch_std is not None:
        if pitch_std < 50:
            score += 2
        elif pitch_std < 80:
            score += 1

    if speech_ratio > 0.6:
        score += 2
    elif speech_ratio > 0.4:
        score += 1

    if mean_flat > 0.01:
        score += 1

    if score >= 7:
        grade = "Excellent"
    elif score >= 5:
        grade = "Good"
    elif score >= 3:
        grade = "Fair"
    else:
        grade = "Poor"
    return score, grade


def analyze_audio(wav_path):
    """Analyze a single WAV file for TTS training suitability.

    Args:
        wav_path: Path to the WAV file to load.

    Returns:
        Dict of JSON-serialisable metrics (file name, duration, SNR,
        spectral flatness, music risk, pitch statistics when voiced frames
        were found, speech ratio, score and grade), or ``None`` when the
        file cannot be loaded or contains no samples.
    """
    import librosa

    fname = os.path.basename(wav_path)
    print(f"\nAnalyzing: {fname}")

    try:
        y, sr = librosa.load(wav_path, sr=SAMPLE_RATE, mono=True)
    except Exception as e:
        print(f" Failed to load: {e}")
        return None

    # An empty file would crash the percentile / feature code below.
    if len(y) == 0:
        print(f" Failed to load: file contains no audio samples")
        return None

    duration_sec = len(y) / sr
    duration_min = duration_sec / 60
    print(f" Duration: {duration_min:.1f} minutes")

    results = {
        "file": fname,
        "duration_min": round(duration_min, 1),
    }

    # --- RMS Energy & SNR ---
    rms = librosa.feature.rms(y=y, frame_length=2048, hop_length=512)[0]
    snr = estimate_snr_db(rms)
    results["snr_db"] = round(float(snr), 1)
    snr_label = "excellent" if snr >= 25 else "acceptable" if snr >= 15 else "poor"
    print(f" SNR: {snr:.1f} dB ({snr_label})")

    # --- Spectral Flatness (music vs speech) ---
    flatness = librosa.feature.spectral_flatness(y=y)[0]
    mean_flat = float(np.mean(flatness))
    results["spectral_flatness"] = round(mean_flat, 4)
    music_risk = "low" if mean_flat > 0.02 else "medium" if mean_flat > 0.005 else "high"
    results["music_risk"] = music_risk
    print(f" Music risk: {music_risk} (flatness={mean_flat:.4f})")

    # --- Pitch Analysis (first 5 min for speed) ---
    y_short = y[:sr * PITCH_ANALYSIS_SEC]
    print(f" Running pitch analysis (first {min(duration_min, 5):.0f} min)...")
    f0, _, _ = librosa.pyin(y_short, fmin=50, fmax=500, sr=sr)
    # pYIN marks unvoiced frames with NaN.
    f0_voiced = f0[~np.isnan(f0)]

    if len(f0_voiced) > 0:
        pitch_mean = float(np.mean(f0_voiced))
        pitch_std = float(np.std(f0_voiced))
        voiced_ratio = float(len(f0_voiced) / len(f0))

        results["pitch_mean_hz"] = round(pitch_mean, 1)
        results["pitch_std_hz"] = round(pitch_std, 1)
        results["voiced_ratio"] = round(voiced_ratio, 3)

        if pitch_std > 80:
            results["speaker_assessment"] = "likely_multi_speaker"
            print(f" Speaker: LIKELY MULTI-SPEAKER (pitch std={pitch_std:.1f}Hz)")
        elif pitch_std > 60:
            results["speaker_assessment"] = "possibly_multi_speaker"
            print(f" Speaker: possibly multi-speaker (pitch std={pitch_std:.1f}Hz)")
        else:
            results["speaker_assessment"] = "single_speaker"
            print(f" Speaker: consistent single speaker (pitch std={pitch_std:.1f}Hz)")

        gender = "female" if pitch_mean > 180 else "male"
        results["gender_estimate"] = gender
        print(f" Voice: {gender} (mean pitch={pitch_mean:.0f}Hz)")
    else:
        print(f" Pitch: could not extract (no voiced frames detected)")

    # --- Speech vs Silence Ratio ---
    speech_ratio = estimate_speech_ratio(rms)
    results["speech_pct"] = round(float(speech_ratio * 100), 1)
    results["speech_min"] = round(duration_min * speech_ratio, 1)
    print(f" Speech content: {speech_ratio:.0%} ({results['speech_min']:.1f} min of speech)")

    # --- Overall TTS Quality Score ---
    # The rounded pitch std is used, matching the value stored in results.
    score, grade = compute_tts_score(
        snr, results.get("pitch_std_hz"), speech_ratio, mean_flat
    )
    results["tts_score"] = score
    results["grade"] = grade
    print(f" TTS Quality Score: {score}/8 ({grade})")

    return results


# ============================================================
# MAIN
# ============================================================

def summarize_channel(channel_results):
    """Aggregate the per-file result dicts of one channel.

    ``channel_results`` must be non-empty. Returns a dict with the total
    duration and speech minutes, average SNR and score, and the counts of
    single-speaker, multi-speaker-risk and high-music-risk files.
    """
    return {
        "n": len(channel_results),
        "total_dur": sum(r["duration_min"] for r in channel_results),
        "total_speech": sum(r.get("speech_min", 0) for r in channel_results),
        "avg_snr": float(np.mean([r["snr_db"] for r in channel_results])),
        "avg_score": float(np.mean([r["tts_score"] for r in channel_results])),
        "multi_spk": sum(
            1 for r in channel_results
            if "multi" in r.get("speaker_assessment", "")
        ),
        "single_spk": sum(
            1 for r in channel_results
            if r.get("speaker_assessment") == "single_speaker"
        ),
        "music_high": sum(
            1 for r in channel_results if r.get("music_risk") == "high"
        ),
    }


def check_dependencies():
    """Exit with an install hint when a required package is missing."""
    missing = []
    for pkg in ['yt_dlp', 'librosa', 'soundfile', 'numpy', 'scipy']:
        try:
            __import__(pkg)
        except ImportError:
            # Import names use underscores; pip package names use hyphens.
            missing.append(pkg.replace('_', '-'))

    if missing:
        print(f"Missing packages: {', '.join(missing)}")
        print(f"Install with: pip install -U {' '.join(missing)}")
        sys.exit(1)


def main():
    """Download, analyze and compare every configured channel."""
    check_dependencies()

    import yt_dlp
    print(f"yt-dlp version: {yt_dlp.version.__version__}")
    print(f"Sinhala TTS - YouTube Channel Quality Evaluator v3")
    print("=" * 60)

    all_results = {}

    for channel_key, channel_info in CHANNELS.items():
        wav_files = download_samples(channel_key, channel_info)

        if not wav_files:
            print(f"\nNo files downloaded for {channel_info['label']}")
            all_results[channel_key] = []
            continue

        channel_results = []
        for wav_path in wav_files:
            res = analyze_audio(wav_path)
            if res:
                channel_results.append(res)

        all_results[channel_key] = channel_results

        if channel_results:
            s = summarize_channel(channel_results)
            print(f"\n{'='*60}")
            print(f"CHANNEL SUMMARY: {channel_info['label']}")
            print(f"{'='*60}")
            print(f" Videos analyzed: {s['n']}")
            print(f" Total duration: {s['total_dur']:.1f} min")
            print(f" Usable speech: {s['total_speech']:.1f} min")
            print(f" Avg SNR: {s['avg_snr']:.1f} dB")
            print(f" Avg TTS Score: {s['avg_score']:.1f}/8")
            print(f" Multi-speaker risk: {s['multi_spk']}/{s['n']} videos")
            print(f" High music risk: {s['music_high']}/{s['n']} videos")

    # Save detailed results
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    results_path = os.path.join(OUTPUT_DIR, "evaluation_results.json")
    # ensure_ascii=False can emit non-ASCII text, so pin the file encoding
    # instead of relying on the platform default.
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)

    # ============================================================
    # FINAL COMPARISON
    # ============================================================
    print(f"\n\n{'='*60}")
    print(f"FINAL COMPARISON")
    print(f"{'='*60}")
    print(f"{'Channel':<35} {'Score':>8} {'SNR':>8} {'Speech':>10} {'Speaker':>15} {'Music':>10}")
    print(f"{'-'*35} {'-'*8} {'-'*8} {'-'*10} {'-'*15} {'-'*10}")

    for channel_key, results in all_results.items():
        label = CHANNELS[channel_key]['label']
        if isinstance(results, list) and results:
            s = summarize_channel(results)
            avg_score = s["avg_score"]
            avg_snr = s["avg_snr"]
            total_speech = s["total_speech"]
            spk_label = "single" if s["single_spk"] >= s["n"] / 2 else "mixed"
            if s["music_high"] == 0:
                music_label = "low"
            elif s["music_high"] < s["n"] / 2:
                music_label = "some"
            else:
                music_label = "heavy"
            print(f"{label:<35} {avg_score:>5.1f}/8 {avg_snr:>6.1f}dB {total_speech:>7.1f}min {spk_label:>15} {music_label:>10}")
        else:
            print(f"{label:<35} {'No data':>8}")

    print(f"\nResults saved to: {results_path}")
    print(f"\nDone! Paste the output above (or {results_path}) back to the assistant.")


if __name__ == "__main__":
    main()