| |
| """ |
| ============================================================= |
| Sinhala TTS - YouTube Channel Audio Quality Evaluator v3 |
| ============================================================= |
| Run this on your LOCAL MACHINE. |
| |
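| Requirements: |
| pip install -U yt-dlp librosa soundfile numpy scipy certifi |
| ffmpeg must also be installed and on PATH (yt-dlp's FFmpegExtractAudio step needs it to produce WAV files) |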
| |
| Usage: |
| python evaluate_channels.py |
| ============================================================= |
| """ |
|
|
| import os |
| import sys |
| import json |
| import ssl |
| import numpy as np |
| import warnings |
| warnings.filterwarnings("ignore") |
|
|
| |
# Export certifi's CA bundle path through the environment when certifi is
# installed; without certifi the environment is left untouched.
try:
    import certifi
except ImportError:
    pass
else:
    _ca_bundle = certifi.where()
    for _env_var in ('SSL_CERT_FILE', 'REQUESTS_CA_BUNDLE'):
        os.environ[_env_var] = _ca_bundle

# NOTE(review): this replaces the default HTTPS context factory with the
# unverified one, i.e. certificate verification is switched off for every
# default HTTPS context created in this process. Confirm this is intended.
_unverified_factory = getattr(ssl, '_create_unverified_context', None)
if _unverified_factory is not None:
    ssl._create_default_https_context = _unverified_factory
|
|
| |
| |
| |
# Channels to evaluate, keyed by the name used for the per-channel download
# folder. Each entry holds the channel's video-listing URL and a display label.
CHANNELS = {
    "sunchare": dict(
        url="https://www.youtube.com/@sunchare/videos",
        label="NU1's VLOG (Unlimited History)",
    ),
    "Raamuwa": dict(
        url="https://www.youtube.com/@Raamuwa/videos",
        label="Raamuwa",
    ),
}

# Default number of videos sampled from each channel.
N_VIDEOS_PER_CHANNEL = 4

# Root folder for downloaded audio and the JSON results file.
OUTPUT_DIR = "tts_channel_eval"
|
|
|
|
| |
| |
| |
def _select_videos(entries, n_videos):
    """Pick up to ``n_videos`` entries as ``(video_id, title, duration_sec)`` tuples.

    Entries whose duration is unknown (missing/0) or between 180 and 2400
    seconds are preferred. If none qualify but some were rejected for their
    duration, the first ``n_videos`` entries with an id are taken instead.
    """
    selected = []
    saw_out_of_range = False

    for entry in entries:
        vid_id = entry.get('id') or ''
        if not vid_id:
            continue
        # ``or`` also covers keys that are present but set to None, which
        # would otherwise break the slicing/formatting below.
        title = entry.get('title') or '?'
        dur = entry.get('duration') or 0

        if dur == 0 or 180 <= dur <= 2400:
            selected.append((vid_id, title, dur))
            print(f" + {title[:55]:55s} ({dur / 60:.0f}min)")
            if len(selected) >= n_videos:
                break
        else:
            saw_out_of_range = True

    if not selected and saw_out_of_range:
        print(f" No videos in 3-40min range. Taking first {n_videos} anyway...")
        selected = [
            (entry['id'], entry.get('title') or '?', entry.get('duration') or 0)
            for entry in entries[:n_videos]
            if entry.get('id')
        ]

    return selected


def download_samples(channel_key, channel_info, n_videos=N_VIDEOS_PER_CHANNEL):
    """Download up to ``n_videos`` videos from a channel as mono 22.05 kHz WAV.

    Args:
        channel_key: Name of the sub-folder (under ``OUTPUT_DIR``) that
            receives the audio files.
        channel_info: Mapping with ``"url"`` (channel video listing) and
            ``"label"`` (display name).
        n_videos: Maximum number of videos to download.

    Returns:
        Sorted list of paths to the WAV files of the videos selected in this
        call. WAV files left in the folder by earlier runs for other videos
        are not returned. The list is empty when nothing could be listed or
        selected.
    """
    import yt_dlp

    out_dir = os.path.join(OUTPUT_DIR, channel_key)
    os.makedirs(out_dir, exist_ok=True)

    print(f"\n{'='*60}")
    print(f"Downloading from: {channel_info['label']}")
    print(f"URL: {channel_info['url']}")
    print(f"{'='*60}")

    # --- Step 1: list the channel's most recent videos (metadata only) ---
    print("\n [1/2] Fetching video list...")
    list_opts = {
        'quiet': True,
        'no_warnings': True,
        'extract_flat': 'in_playlist',
        # Ask for 3x the target so the duration filter has candidates to drop.
        'playlist_items': f'1-{n_videos * 3}',
        'nocheckcertificate': True,
    }

    entries = []
    try:
        with yt_dlp.YoutubeDL(list_opts) as ydl:
            info = ydl.extract_info(channel_info["url"], download=False)
            if info:
                channel_title = info.get('channel', info.get('uploader', channel_key))
                entries = [e for e in (info.get('entries') or []) if e is not None]
                print(f" Channel: {channel_title}")
                print(f" Found {len(entries)} videos")
    except Exception as e:
        # Best-effort: a listing failure is reported and treated as "no entries".
        print(f" Error fetching video list: {e}")

    if not entries:
        print(" No entries found.")
        print(" Try: pip install -U yt-dlp certifi")
        return []

    selected = _select_videos(entries, n_videos)
    if not selected:
        print(" No downloadable videos found!")
        return []

    # --- Step 2: download each selected video and convert it to WAV ---
    print(f"\n [2/2] Downloading {len(selected)} videos as WAV...")

    base_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
        }],
        'postprocessor_args': {
            # Mono, 22.05 kHz output.
            'ffmpeg': ['-ac', '1', '-ar', '22050'],
        },
        'quiet': True,
        'no_warnings': True,
        'nocheckcertificate': True,
    }

    for i, (vid_id, title, _dur) in enumerate(selected):
        url = f"https://www.youtube.com/watch?v={vid_id}"
        dl_opts = {
            **base_opts,
            'outtmpl': os.path.join(out_dir, f"{vid_id}.%(ext)s"),
        }

        print(f"\n [{i+1}/{len(selected)}] {title[:50]}...")
        try:
            with yt_dlp.YoutubeDL(dl_opts) as ydl:
                ydl.download([url])
            print(" Done")
        except Exception as e:
            # One failed video must not abort the remaining downloads.
            print(f" Failed: {str(e)[:100]}")

    # Return only the WAVs belonging to this run's selection, so files left
    # over from previous runs are neither counted nor analysed.
    wav_paths = sorted(
        path
        for path in (os.path.join(out_dir, f"{vid_id}.wav") for vid_id, _, _ in selected)
        if os.path.isfile(path)
    )
    print(f"\n Downloaded {len(wav_paths)} WAV files to {out_dir}/")
    return wav_paths
|
|
|
|
| |
| |
| |
def analyze_audio(wav_path):
    """Analyze a single WAV file for TTS training suitability.

    Loads the file as mono 22.05 kHz audio and derives an SNR estimate,
    spectral flatness (used as a music-risk proxy), pitch statistics on the
    first five minutes, a speech ratio, and a combined 0-8 score.

    Args:
        wav_path: Path to the audio file.

    Returns:
        A dict with the keys ``file``, ``duration_min``, ``snr_db``,
        ``spectral_flatness``, ``music_risk``, ``speech_pct``, ``speech_min``,
        ``tts_score`` and ``grade``; the pitch-related keys
        (``pitch_mean_hz``, ``pitch_std_hz``, ``voiced_ratio``,
        ``speaker_assessment``, ``gender_estimate``) are present only when
        voiced frames were found. Returns ``None`` when the file cannot be
        loaded or contains no samples.
    """
    import librosa

    fname = os.path.basename(wav_path)
    print(f"\nAnalyzing: {fname}")

    try:
        y, sr = librosa.load(wav_path, sr=22050, mono=True)
    except Exception as e:
        print(f" Failed to load: {e}")
        return None

    if len(y) == 0:
        # Nothing to measure; the statistics below are undefined on no data.
        print(" Failed to load: file contains no audio samples")
        return None

    duration_sec = len(y) / sr
    duration_min = duration_sec / 60
    print(f" Duration: {duration_min:.1f} minutes")

    results = {
        "file": fname,
        "duration_min": round(duration_min, 1),
    }

    # --- SNR estimate: quietest 20% of frames = noise, the rest = speech ---
    rms = librosa.feature.rms(y=y, frame_length=2048, hop_length=512)[0]
    rms_threshold = np.percentile(rms, 20)
    noise_frames = rms[rms <= rms_threshold]
    speech_frames = rms[rms > rms_threshold]

    noise_level = float(np.mean(noise_frames)) if noise_frames.size else 0.0
    speech_level = float(np.mean(speech_frames)) if speech_frames.size else 0.0

    if speech_level <= 0:
        # No frame rises above the floor (silent or constant-level audio):
        # there is no signal to rate, so do not report a flattering SNR.
        snr = 0.0
    elif noise_level > 0:
        snr = 20 * np.log10(speech_level / (noise_level + 1e-10))
    else:
        # Digitally silent noise floor under real signal: cap at 40 dB.
        snr = 40.0
    results["snr_db"] = round(float(snr), 1)

    snr_label = "excellent" if snr >= 25 else "acceptable" if snr >= 15 else "poor"
    print(f" SNR: {snr:.1f} dB ({snr_label})")

    # --- Music risk: low spectral flatness is treated as a sign of music ---
    flatness = librosa.feature.spectral_flatness(y=y)[0]
    mean_flat = float(np.mean(flatness))
    results["spectral_flatness"] = round(mean_flat, 4)

    music_risk = "low" if mean_flat > 0.02 else "medium" if mean_flat > 0.005 else "high"
    results["music_risk"] = music_risk
    print(f" Music risk: {music_risk} (flatness={mean_flat:.4f})")

    # --- Pitch statistics, limited to the first 300 seconds ---
    y_short = y[:sr * 300]
    print(f" Running pitch analysis (first {min(duration_min, 5):.0f} min)...")
    f0, _, _ = librosa.pyin(y_short, fmin=50, fmax=500, sr=sr)
    f0_voiced = f0[~np.isnan(f0)]

    if len(f0_voiced) > 0:
        pitch_mean = float(np.mean(f0_voiced))
        pitch_std = float(np.std(f0_voiced))
        voiced_ratio = len(f0_voiced) / len(f0)

        results["pitch_mean_hz"] = round(pitch_mean, 1)
        results["pitch_std_hz"] = round(pitch_std, 1)
        results["voiced_ratio"] = round(voiced_ratio, 3)

        if pitch_std > 80:
            results["speaker_assessment"] = "likely_multi_speaker"
            print(f" Speaker: LIKELY MULTI-SPEAKER (pitch std={pitch_std:.1f}Hz)")
        elif pitch_std > 60:
            results["speaker_assessment"] = "possibly_multi_speaker"
            print(f" Speaker: possibly multi-speaker (pitch std={pitch_std:.1f}Hz)")
        else:
            results["speaker_assessment"] = "single_speaker"
            print(f" Speaker: consistent single speaker (pitch std={pitch_std:.1f}Hz)")

        gender = "female" if pitch_mean > 180 else "male"
        results["gender_estimate"] = gender
        print(f" Voice: {gender} (mean pitch={pitch_mean:.0f}Hz)")
    else:
        print(" Pitch: could not extract (no voiced frames detected)")

    # --- Speech ratio ---
    # NOTE(review): the threshold is the 20th percentile of these same RMS
    # values, so roughly 80% of frames exceed it by construction and this
    # figure barely depends on the recording. Consider an absolute or
    # VAD-based threshold if the metric is meant to discriminate.
    speech_ratio = float(np.sum(rms > rms_threshold) / len(rms))
    results["speech_pct"] = round(speech_ratio * 100, 1)
    results["speech_min"] = round(float(duration_min * speech_ratio), 1)
    print(f" Speech content: {speech_ratio:.0%} ({results['speech_min']:.1f} min of speech)")

    # --- Combined score: SNR (0-3) + pitch stability (0-2) + speech (0-2) + flatness (0-1) ---
    score = 0
    if snr >= 25:
        score += 3
    elif snr >= 15:
        score += 2
    elif snr >= 10:
        score += 1

    # A missing pitch estimate defaults to 999 and therefore earns no points.
    pitch_std_hz = results.get("pitch_std_hz", 999)
    if pitch_std_hz < 50:
        score += 2
    elif pitch_std_hz < 80:
        score += 1

    if speech_ratio > 0.6:
        score += 2
    elif speech_ratio > 0.4:
        score += 1

    if mean_flat > 0.01:
        score += 1

    results["tts_score"] = score
    grade = "Excellent" if score >= 7 else "Good" if score >= 5 else "Fair" if score >= 3 else "Poor"
    results["grade"] = grade
    print(f" TTS Quality Score: {score}/8 ({grade})")

    return results
|
|
|
|
| |
| |
| |
def _missing_packages(modules=('yt_dlp', 'librosa', 'soundfile', 'numpy', 'scipy')):
    """Return the pip names of the required modules that fail to import."""
    missing = []
    for pkg in modules:
        try:
            __import__(pkg)
        except ImportError:
            missing.append(pkg.replace('_', '-'))
    return missing


def main():
    """Download and analyze samples for every channel, then print and save a comparison.

    Exits with status 1 when a required package is missing. Results for all
    channels are written to ``evaluation_results.json`` inside ``OUTPUT_DIR``.
    """
    missing = _missing_packages()
    if missing:
        print(f"Missing packages: {', '.join(missing)}")
        print(f"Install with: pip install -U {' '.join(missing)}")
        sys.exit(1)

    import yt_dlp
    print(f"yt-dlp version: {yt_dlp.version.__version__}")
    print("Sinhala TTS - YouTube Channel Quality Evaluator v3")
    print("=" * 60)

    all_results = {}

    for channel_key, channel_info in CHANNELS.items():
        wav_files = download_samples(channel_key, channel_info)

        if not wav_files:
            print(f"\nNo files downloaded for {channel_info['label']}")
            all_results[channel_key] = []
            continue

        # analyze_audio returns None for files it cannot load; drop those.
        channel_results = [res for res in map(analyze_audio, wav_files) if res]
        all_results[channel_key] = channel_results

        if channel_results:
            total_dur = sum(r["duration_min"] for r in channel_results)
            total_speech = sum(r.get("speech_min", 0) for r in channel_results)
            avg_snr = np.mean([r["snr_db"] for r in channel_results])
            avg_score = np.mean([r["tts_score"] for r in channel_results])
            multi_spk = sum(1 for r in channel_results
                            if "multi" in r.get("speaker_assessment", ""))
            music_high = sum(1 for r in channel_results if r.get("music_risk") == "high")

            print(f"\n{'='*60}")
            print(f"CHANNEL SUMMARY: {channel_info['label']}")
            print(f"{'='*60}")
            print(f" Videos analyzed: {len(channel_results)}")
            print(f" Total duration: {total_dur:.1f} min")
            print(f" Usable speech: {total_speech:.1f} min")
            print(f" Avg SNR: {avg_snr:.1f} dB")
            print(f" Avg TTS Score: {avg_score:.1f}/8")
            print(f" Multi-speaker risk: {multi_spk}/{len(channel_results)} videos")
            print(f" High music risk: {music_high}/{len(channel_results)} videos")

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    results_path = os.path.join(OUTPUT_DIR, "evaluation_results.json")
    # ensure_ascii=False writes non-ASCII text verbatim, so the file encoding
    # must be pinned instead of depending on the platform's locale default.
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)

    print(f"\n\n{'='*60}")
    print("FINAL COMPARISON")
    print(f"{'='*60}")
    print(f"{'Channel':<35} {'Score':>8} {'SNR':>8} {'Speech':>10} {'Speaker':>15} {'Music':>10}")
    print(f"{'-'*35} {'-'*8} {'-'*8} {'-'*10} {'-'*15} {'-'*10}")

    for channel_key, results in all_results.items():
        label = CHANNELS[channel_key]['label']
        if isinstance(results, list) and results:
            avg_score = np.mean([r["tts_score"] for r in results])
            avg_snr = np.mean([r["snr_db"] for r in results])
            total_speech = sum(r.get("speech_min", 0) for r in results)

            # "single" when at least half the videos look single-speaker.
            single = sum(1 for r in results if r.get("speaker_assessment") == "single_speaker")
            spk_label = "single" if single >= len(results)/2 else "mixed"

            high_music = sum(1 for r in results if r.get("music_risk") == "high")
            music_label = "low" if high_music == 0 else "some" if high_music < len(results)/2 else "heavy"

            print(f"{label:<35} {avg_score:>5.1f}/8 {avg_snr:>6.1f}dB {total_speech:>7.1f}min {spk_label:>15} {music_label:>10}")
        else:
            print(f"{label:<35} {'No data':>8}")

    print(f"\nResults saved to: {results_path}")
    print(f"\nDone! Paste the output above (or {results_path}) back to the assistant.")


if __name__ == "__main__":
    main()
|
|