feat(macos): implement Apple Silicon optimizations and switch to wav2vec2 ASR

1a2a2b3 about 1 month ago

44.4 kB

	#!/usr/bin/env python3
	"""
	=============================================================
	Sinhala TTS - Complete Data Extraction Pipeline
	=============================================================
	Processes YouTube audio into TTS-ready training data.

	Pipeline steps:
	1. Download YouTube videos as audio
	2. Source separation (HTDemucs - remove background music)
	3. Audio enhancement (VoiceFixer + DeepFilterNet3)
	4. Speaker diarization (pyannote 3.1)
	5. VAD segmentation (Silero-VAD, 3-20s chunks)
	6. ASR transcription (Whisper large-v3)
	7. Quality filtering (DNSMOS, SNR, pitch, speaking rate)
	8. Export as LJSpeech-format dataset

	Based on:
	- Emilia-Pipe (arxiv:2407.05361) - pipeline design
	- IndicVoices-R (arxiv:2409.05356) - audio enhancement
	- IndicTTS (arxiv:2211.09536) - target training format

	Requirements:
	pip install -U yt-dlp torch torchaudio demucs voicefixer deepfilternet \
	pyannote.audio faster-whisper librosa soundfile numpy scipy \
	tqdm pandas certifi

	GPU recommended. CPU works but is 10-50x slower.

	Usage:
	# Process from a video list JSON
	python scripts/data_pipeline.py --video-list tts_channel_eval/unlimited_history_videos.json

	# Process a single video
	python scripts/data_pipeline.py --video-id AJ0Ul2Wl4Pk

	# Process a folder of already-downloaded audio files
	python scripts/data_pipeline.py --audio-dir /path/to/raw_audio/

	# Resume from a checkpoint (skips completed steps)
	python scripts/data_pipeline.py --video-list videos.json --resume

	# Skip steps (e.g. if source audio is already clean)
	python scripts/data_pipeline.py --audio-dir audio/ --skip-separation --skip-enhancement

	=============================================================
	"""

	import os
	import sys
	import ssl
	import json
	import argparse
	import hashlib
	import logging
	import warnings
	from pathlib import Path
	from typing import Optional, Dict, List, Tuple

	import numpy as np
	import torch
	import torchaudio
	import soundfile as sf
	from tqdm import tqdm

	# Suppress every Python warning for the whole process (third-party audio/ML
	# libraries included), so deprecation notices are not shown.
	warnings.filterwarnings("ignore")

	# macOS SSL fix
	# Step 1: when certifi is installed, point the CA-bundle environment variables
	# at its certificate file; without certifi this step is skipped.
	try:
	import certifi
	os.environ['SSL_CERT_FILE'] = certifi.where()
	os.environ['REQUESTS_CA_BUNDLE'] = certifi.where()
	except ImportError:
	pass
	# Step 2: replace ssl's default HTTPS context factory with the unverified one.
	# NOTE(review): this turns off TLS certificate verification for everything in
	# this process that relies on the default HTTPS context. Confirm it is still
	# needed now that the certifi bundle is configured above; if not, remove it.
	try:
	ssl._create_default_https_context = ssl._create_unverified_context
	except AttributeError:
	pass

	# ============================================================
	# CONFIG
	# ============================================================
	SAMPLE_RATE = 22050 # FastPitch target sample rate
	DIARIZE_SR = 16000 # pyannote expects 16kHz
	MIN_SEGMENT_SEC = 3.0 # minimum utterance length
	MAX_SEGMENT_SEC = 20.0 # maximum utterance length (IndicTTS filters >20s)
	TARGET_SPEAKER = None # set after diarization analysis; None = use dominant speaker

	# Quality thresholds (IndicVoices-R + Emilia-Pipe)
	# Utterances violating any of these are rejected by filter_utterances().
	SNR_THRESHOLD = 25.0 # dB
	PITCH_MEAN_MAX = 350.0 # Hz
	PITCH_STD_MAX = 150.0 # Hz
	SPEAKING_RATE_MAX = 30.0 # chars/second
	MIN_SPEECH_RATIO = 0.5 # at least 50% speech in segment

	# Process-wide logging: INFO level with HH:MM:SS timestamps. `log` is the
	# logger used by the pipeline steps in this file.
	logging.basicConfig(
	level=logging.INFO,
	format='%(asctime)s [%(levelname)s] %(message)s',
	datefmt='%H:%M:%S'
	)
	log = logging.getLogger("sinhala-tts")


	# ============================================================
	# STEP 0: State management (resume support)
	# ============================================================
	class PipelineState:
	    """Persist per-video step completion so an interrupted run can resume.

	    The state is stored in ``<state_dir>/pipeline_state.json`` with the shape
	    ``{"completed_videos": {video_id: {step: true}}, "completed_steps": {}}``.
	    """

	    def __init__(self, state_dir: str):
	        """Create ``state_dir`` if needed and load any previously saved state."""
	        self.state_dir = Path(state_dir)
	        self.state_dir.mkdir(parents=True, exist_ok=True)
	        self.state_file = self.state_dir / "pipeline_state.json"
	        self.state = self._load()

	    def _load(self):
	        """Return the saved state, or a fresh one if the file is missing or unusable.

	        A truncated or otherwise invalid state file must not abort the run, so it
	        is reported and replaced by an empty state (steps are simply redone).
	        """
	        state = {}
	        if self.state_file.exists():
	            try:
	                with open(self.state_file, encoding="utf-8") as f:
	                    loaded = json.load(f)
	            except (OSError, ValueError) as e:
	                # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
	                logging.getLogger("sinhala-tts").warning(
	                    "Ignoring unreadable state file %s: %s", self.state_file, e)
	            else:
	                if isinstance(loaded, dict):
	                    state = loaded

	        # Guarantee the keys the other methods index directly.
	        for key in ("completed_videos", "completed_steps"):
	            if not isinstance(state.get(key), dict):
	                state[key] = {}
	        return state

	    def save(self):
	        """Write the state atomically (temp file + rename).

	        Writing in place could leave a half-written JSON file if the process is
	        killed mid-write, which would break the next ``--resume``.
	        """
	        tmp_file = self.state_file.with_name(self.state_file.name + ".tmp")
	        with open(tmp_file, "w", encoding="utf-8") as f:
	            json.dump(self.state, f, indent=2)
	        os.replace(tmp_file, self.state_file)

	    def is_done(self, video_id: str, step: str) -> bool:
	        """Return True if ``step`` was recorded as completed for ``video_id``."""
	        return self.state.get("completed_videos", {}).get(video_id, {}).get(step, False)

	    def mark_done(self, video_id: str, step: str):
	        """Record ``step`` as completed for ``video_id`` and persist immediately."""
	        videos = self.state.setdefault("completed_videos", {})
	        videos.setdefault(video_id, {})[step] = True
	        self.save()


	# ============================================================
	# STEP 1: Download
	# ============================================================
	def download_video(video_id: str, output_dir: Path) -> Optional[Path]:
	    """Fetch one YouTube video's audio as ``<output_dir>/<video_id>.wav``.

	    yt-dlp downloads the best audio stream and its FFmpeg post-processor
	    converts it to WAV with ``-ac 1 -ar SAMPLE_RATE``. An already existing
	    target file is reused without downloading.

	    Returns the WAV path, or None when the download failed or produced no file.
	    """
	    import yt_dlp

	    target = output_dir / f"{video_id}.wav"
	    if target.exists():
	        log.info(f" [download] {video_id} already exists, skipping")
	        return target

	    url = f"https://www.youtube.com/watch?v={video_id}"
	    extract_wav = {
	        'key': 'FFmpegExtractAudio',
	        'preferredcodec': 'wav',
	    }
	    options = {
	        'format': 'bestaudio/best',
	        'outtmpl': str(output_dir / f"{video_id}.%(ext)s"),
	        'postprocessors': [extract_wav],
	        'postprocessor_args': {
	            'ffmpeg': ['-ac', '1', '-ar', str(SAMPLE_RATE)],
	        },
	        'quiet': True,
	        'no_warnings': True,
	        # NOTE(review): certificate checking is switched off for the download.
	        'nocheckcertificate': True,
	    }

	    try:
	        with yt_dlp.YoutubeDL(options) as downloader:
	            downloader.download([url])

	        if target.exists():
	            return target

	        # yt-dlp sometimes adds a double extension: adopt the first match.
	        stray = next(iter(output_dir.glob(f"{video_id}*.wav")), None)
	        if stray is not None:
	            stray.rename(target)
	            return target
	    except Exception as e:
	        log.error(f" [download] Failed {video_id}: {e}")

	    return None


	def download_all_videos(video_list: List[Dict], output_dir: Path) -> List[Tuple[str, Path]]:
	    """Download every entry of ``video_list`` into ``output_dir``.

	    Each entry must have an ``"id"`` key; ``"title"`` is optional and only used
	    for logging. Returns ``[(video_id, wav_path), ...]`` for the videos whose
	    WAV file exists afterwards.
	    """
	    output_dir.mkdir(parents=True, exist_ok=True)

	    total = len(video_list)
	    downloaded = []
	    for position, entry in enumerate(tqdm(video_list, desc="Downloading"), start=1):
	        vid_id = entry["id"]
	        label = entry.get('title', vid_id)[:60]
	        log.info(f"[{position}/{total}] Downloading: {label}")

	        path = download_video(vid_id, output_dir)
	        if path and path.exists():
	            downloaded.append((vid_id, path))

	    log.info(f"Downloaded {len(downloaded)}/{total} videos")
	    return downloaded


	# ============================================================
	# STEP 2: Source Separation (HTDemucs)
	# ============================================================
	def separate_vocals(wav_path: Path, output_dir: Path) -> Optional[Path]:
	    """
	    Extract vocals using HTDemucs (Meta's hybrid transformer Demucs).
	    Removes background music, ambient noise, and effects.
	    Based on IndicVoices-R pipeline (arxiv:2409.05356).

	    Writes ``<output_dir>/<stem>_vocals.wav`` (mono, SAMPLE_RATE) and returns
	    its path. An existing output is reused. If anything in the separation
	    fails, the error is logged and the ORIGINAL ``wav_path`` is returned so the
	    pipeline can continue with unseparated audio.
	    """
	    output_path = output_dir / f"{wav_path.stem}_vocals.wav"
	    if output_path.exists():
	        log.info(f" [separation] {wav_path.stem} already done, skipping")
	        return output_path

	    try:
	        from demucs.pretrained import get_model
	        from demucs.apply import apply_model

	        # Load model
	        model = get_model("htdemucs")
	        model.eval()
	        mps_backend = getattr(torch.backends, "mps", None)  # absent on old torch
	        if torch.cuda.is_available():
	            device = torch.device("cuda")
	        elif mps_backend is not None and mps_backend.is_available():
	            device = torch.device("mps")
	        else:
	            device = torch.device("cpu")
	        model.to(device)

	        # Load audio (demucs expects stereo at its native sr)
	        waveform, sr = torchaudio.load(str(wav_path))

	        # Convert mono to stereo if needed (demucs expects stereo)
	        if waveform.shape[0] == 1:
	            waveform = waveform.repeat(2, 1)

	        # Resample to model's sample rate if needed
	        if sr != model.samplerate:
	            waveform = torchaudio.transforms.Resample(sr, model.samplerate)(waveform)

	        # Add batch dimension: (channels, samples) -> (1, channels, samples).
	        # The mixture deliberately stays on the CPU: apply_model() runs the
	        # computation on `device` while keeping the full-length tracks on the
	        # mixture's device. This avoids holding all separated stems of a long
	        # video in GPU memory, and keeps the CPU-built resampler below on the
	        # same device as its input.
	        mix = waveform.unsqueeze(0)

	        # Separate
	        with torch.no_grad():
	            sources = apply_model(model, mix, device=device)

	        # sources shape: (batch, n_sources, channels, samples).
	        # Look the stem up by name instead of assuming its position.
	        vocals = sources[0, model.sources.index("vocals")]  # (channels, samples)

	        # Convert back to mono
	        vocals_mono = vocals.mean(dim=0, keepdim=True).cpu()

	        # Resample back to target SR
	        if model.samplerate != SAMPLE_RATE:
	            vocals_mono = torchaudio.transforms.Resample(
	                model.samplerate, SAMPLE_RATE)(vocals_mono)

	        # Save
	        output_dir.mkdir(parents=True, exist_ok=True)
	        torchaudio.save(str(output_path), vocals_mono, SAMPLE_RATE)

	        log.info(f" [separation] Vocals extracted: {output_path.name}")
	        return output_path

	    except Exception as e:
	        log.error(f" [separation] Failed: {e}")
	        # Fall back to original audio
	        return wav_path


	# ============================================================
	# STEP 3: Audio Enhancement (VoiceFixer + DeepFilterNet3)
	# ============================================================
	def enhance_audio(wav_path: Path, output_dir: Path) -> Optional[Path]:
	    """
	    Run the two-stage IndicVoices-R enhancement (arxiv:2409.05356):
	    VoiceFixer first, then DeepFilterNet3 on VoiceFixer's result.

	    Both stages are best-effort: a failing stage is logged and skipped.
	    Returns ``<output_dir>/<stem>_enhanced.wav`` when any stage produced
	    output, otherwise the unmodified ``wav_path``. An existing enhanced file
	    is reused.
	    """
	    import shutil

	    stem = wav_path.stem
	    output_path = output_dir / f"{stem}_enhanced.wav"
	    if output_path.exists():
	        log.info(f" [enhance] {stem} already done, skipping")
	        return output_path

	    output_dir.mkdir(parents=True, exist_ok=True)
	    vf_output = output_dir / f"{stem}_vf.wav"  # intermediate VoiceFixer file
	    stage_input = wav_path

	    # Stage 1: VoiceFixer (dereverberation + restoration)
	    try:
	        from voicefixer import VoiceFixer
	        fixer = VoiceFixer()
	        fixer.restore(
	            input=str(stage_input),
	            output=str(vf_output),
	            cuda=torch.cuda.is_available(),
	            mode=0 # mode 0 = speech restoration (denoise + dereverb + upsample)
	        )
	        if vf_output.exists():
	            stage_input = vf_output
	            log.info(f" [enhance] VoiceFixer done")
	    except Exception as e:
	        log.warning(f" [enhance] VoiceFixer failed (continuing): {e}")

	    # Stage 2: DeepFilterNet3 (fine noise/artifact removal)
	    try:
	        from df.enhance import enhance, init_df, load_audio, save_audio
	        df_model, df_state, _ = init_df()
	        noisy, _ = load_audio(str(stage_input), sr=df_state.sr())
	        cleaned = enhance(df_model, df_state, noisy)
	        save_audio(str(output_path), cleaned, df_state.sr())

	        if not output_path.exists():
	            # DeepFilterNet produced nothing: keep the VoiceFixer result if any
	            if stage_input != wav_path:
	                shutil.copy2(str(stage_input), str(output_path))
	        else:
	            # Bring the file to the target rate when DeepFilterNet's differs
	            waveform, sr = torchaudio.load(str(output_path))
	            if sr != SAMPLE_RATE:
	                to_target = torchaudio.transforms.Resample(sr, SAMPLE_RATE)
	                waveform = to_target(waveform)
	                torchaudio.save(str(output_path), waveform, SAMPLE_RATE)
	            log.info(f" [enhance] DeepFilterNet3 done")
	    except Exception as e:
	        log.warning(f" [enhance] DeepFilterNet3 failed (continuing): {e}")
	        # Nothing was enhanced at all: hand back the input untouched
	        if stage_input == wav_path:
	            return wav_path
	        shutil.copy2(str(stage_input), str(output_path))

	    # Drop the intermediate file once a final output exists
	    if vf_output.exists() and output_path.exists():
	        vf_output.unlink()

	    if output_path.exists():
	        return output_path
	    return wav_path


	# ============================================================
	# STEP 4: Speaker Diarization (pyannote 3.1)
	# ============================================================
	def diarize_audio(wav_path: Path, num_speakers: int = 2,
	                  hf_token: Optional[str] = None) -> Dict[str, List[Dict]]:
	    """
	    Run speaker diarization using pyannote/speaker-diarization-3.1.

	    Returns dict: {speaker_label: [{start, end, duration}, ...], ...}
	    (times in seconds, rounded to milliseconds).

	    Falls back to ``_diarize_simple`` when no token is available, when
	    pyannote is not installed, or when the pyannote pipeline fails for any
	    other reason.

	    NOTE: Requires accepting model licenses on HuggingFace:
	    - https://huggingface.co/pyannote/segmentation-3.0
	    - https://huggingface.co/pyannote/speaker-diarization-3.1
	    """
	    token = hf_token or os.environ.get("HF_TOKEN")
	    if not token:
	        log.warning(" [diarize] No HF_TOKEN found. pyannote requires auth.")
	        log.warning(" [diarize] Set HF_TOKEN env var or pass --hf-token")
	        # Fall back to simple-diarizer (no auth needed)
	        return _diarize_simple(wav_path, num_speakers)

	    try:
	        # Imported inside the try so a missing pyannote install triggers the
	        # fallback instead of aborting the pipeline.
	        from pyannote.audio import Pipeline

	        pipeline = Pipeline.from_pretrained(
	            "pyannote/speaker-diarization-3.1",
	            use_auth_token=token
	        )
	        if pipeline is None:
	            # NOTE(review): pyannote appears to return None (rather than
	            # raise) when the gated model cannot be fetched - confirm.
	            raise RuntimeError(
	                "pipeline could not be loaded (check token and model license)")
	        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	        pipeline.to(device)

	        # Load at 16kHz for diarization
	        waveform, sr = torchaudio.load(str(wav_path))
	        if sr != DIARIZE_SR:
	            waveform = torchaudio.transforms.Resample(sr, DIARIZE_SR)(waveform)

	        diarization = pipeline(
	            {"waveform": waveform, "sample_rate": DIARIZE_SR},
	            num_speakers=num_speakers
	        )

	        speakers = {}
	        for turn, _, speaker in diarization.itertracks(yield_label=True):
	            speakers.setdefault(speaker, []).append({
	                "start": round(turn.start, 3),
	                "end": round(turn.end, 3),
	                "duration": round(turn.end - turn.start, 3),
	            })

	        log.info(f" [diarize] Found {len(speakers)} speakers (pyannote)")
	        return speakers

	    except Exception as e:
	        log.warning(f" [diarize] pyannote failed: {e}")
	        log.info(" [diarize] Falling back to simple-diarizer (no auth)")
	        return _diarize_simple(wav_path, num_speakers)


	def _diarize_simple(wav_path: Path, num_speakers: int = 2) -> Dict[str, List[Dict]]:
	    """Diarize with simple-diarizer (ECAPA embeddings, 'sc' clustering); no auth.

	    Returns the same ``{label: [{start, end, duration}, ...]}`` mapping as
	    ``diarize_audio``. If simple-diarizer is unavailable or fails, the whole
	    file is reported as a single ``SPEAKER_0`` turn.
	    """
	    try:
	        from simple_diarizer.diarizer import Diarizer

	        diarizer = Diarizer(embed_model='ecapa', cluster_method='sc')
	        turns = diarizer.diarize(str(wav_path), num_speakers=num_speakers)

	        by_speaker = {}
	        for turn in turns:
	            name = str(turn['label'])
	            entry = {
	                "start": round(turn['start'], 3),
	                "end": round(turn['end'], 3),
	                "duration": round(turn['end'] - turn['start'], 3),
	            }
	            by_speaker.setdefault(name, []).append(entry)

	        log.info(f" [diarize] Found {len(by_speaker)} speakers (simple-diarizer)")
	        return by_speaker

	    except Exception as e:
	        log.error(f" [diarize] simple-diarizer also failed: {e}")
	        # Last resort: one speaker spanning the entire file
	        import librosa
	        total = librosa.get_duration(path=str(wav_path))
	        whole_file = {"start": 0.0, "end": round(total, 3), "duration": round(total, 3)}
	        return {"SPEAKER_0": [whole_file]}


	def select_target_speaker(speakers: Dict[str, List[Dict]],
	                          target_speaker: Optional[str] = None) -> str:
	    """Select which speaker to extract.

	    Args:
	        speakers: ``{label: [{"duration": seconds, ...}, ...]}`` from diarization.
	        target_speaker: label to use if it is present in ``speakers``.

	    Returns:
	        ``target_speaker`` when it exists in ``speakers``; otherwise the label
	        with the largest total speaking time.

	    Raises:
	        ValueError: if ``speakers`` is empty (nothing to select from).
	    """
	    if target_speaker and target_speaker in speakers:
	        return target_speaker

	    if not speakers:
	        # Previously surfaced as an opaque "max() arg is an empty sequence".
	        raise ValueError("select_target_speaker: diarization returned no speakers")

	    if target_speaker:
	        # A requested label that does not exist should not go unnoticed.
	        log.warning(f" [diarize] Requested speaker {target_speaker!r} not found "
	                    f"(available: {sorted(speakers)}); using dominant speaker")

	    # Pick speaker with most total duration
	    durations = {
	        spk: sum(s["duration"] for s in segs)
	        for spk, segs in speakers.items()
	    }
	    best = max(durations, key=durations.get)
	    log.info(f" [diarize] Selected speaker: {best} "
	             f"({durations[best]/60:.1f} min / "
	             f"{sum(durations.values())/60:.1f} min total)")
	    return best


	# ============================================================
	# STEP 5: VAD Segmentation (Silero-VAD)
	# ============================================================
	def segment_with_vad(wav_path: Path, speaker_segments: List[Dict],
	                     output_dir: Path) -> List[Dict]:
	    """
	    Fine-grained VAD segmentation within speaker turns.

	    Takes diarization segments for one speaker and:
	    1. Extracts audio for that speaker
	    2. Runs Silero-VAD to find speech boundaries
	    3. Splits long segments, merges short ones
	    4. Exports individual utterance WAV files (3-20s each), resampled to
	       SAMPLE_RATE and peak-normalized to -3 dBFS

	    Args:
	        wav_path: source audio file.
	        speaker_segments: ``[{"start": sec, "end": sec, ...}, ...]`` turns.
	        output_dir: directory receiving ``<stem>_uttNNNNN.wav`` files.

	    Returns list of {path, filename, start, end, duration} for each utterance.
	    """
	    vad_sr = 16000  # rate the VAD is run at
	    output_dir.mkdir(parents=True, exist_ok=True)

	    # Load full audio
	    waveform, sr = torchaudio.load(str(wav_path))
	    if waveform.shape[0] > 1:
	        waveform = waveform.mean(dim=0, keepdim=True)

	    # Resample to 16kHz for Silero-VAD
	    if sr != vad_sr:
	        waveform_16k = torchaudio.transforms.Resample(sr, vad_sr)(waveform)
	    else:
	        waveform_16k = waveform

	    # Build the output resampler once instead of once per utterance.
	    to_target = None
	    if sr != SAMPLE_RATE:
	        to_target = torchaudio.transforms.Resample(sr, SAMPLE_RATE)

	    # Load Silero-VAD
	    vad_model, vad_utils = torch.hub.load(
	        repo_or_dir='snakers4/silero-vad',
	        model='silero_vad',
	        force_reload=False,
	        trust_repo=True,
	    )
	    get_speech_timestamps = vad_utils[0]

	    min_turn_samples = int(MIN_SEGMENT_SEC * vad_sr)
	    target_peak = 10 ** (-3 / 20) # -3 dBFS
	    utterances = []
	    utt_idx = 0

	    for seg in speaker_segments:
	        # Extract segment audio (at 16kHz for VAD)
	        start_sample_16k = int(seg["start"] * vad_sr)
	        end_sample_16k = int(seg["end"] * vad_sr)
	        seg_audio = waveform_16k[0, start_sample_16k:end_sample_16k]

	        if len(seg_audio) < min_turn_samples:
	            continue

	        # Run VAD on this segment
	        try:
	            speech_ts = get_speech_timestamps(
	                seg_audio,
	                vad_model,
	                sampling_rate=vad_sr,
	                min_speech_duration_ms=500,
	                min_silence_duration_ms=300,
	                speech_pad_ms=100,
	                return_seconds=False,
	            )
	        except Exception as e:
	            # Best effort: treat the whole turn as speech, but say so.
	            log.warning(f" [vad] VAD failed on turn at {seg['start']}s, "
	                        f"using whole turn: {e}")
	            speech_ts = [{"start": 0, "end": len(seg_audio)}]

	        if not speech_ts:
	            continue

	        # Merge close VAD segments and enforce duration limits
	        merged = _merge_vad_segments(speech_ts, sr=vad_sr)

	        for vad_seg in merged:
	            # Duration comes from the integer sample offsets: subtracting two
	            # float second offsets can turn an exact 20 s chunk into
	            # 20.000000000000004 s and get it rejected below.
	            duration = (vad_seg["end"] - vad_seg["start"]) / vad_sr
	            if duration < MIN_SEGMENT_SEC or duration > MAX_SEGMENT_SEC:
	                continue

	            # Convert back to original audio timestamps
	            vad_start_sec = seg["start"] + vad_seg["start"] / vad_sr
	            vad_end_sec = vad_start_sec + duration

	            # Extract from original audio at its native sample rate
	            start_sample = int(vad_start_sec * sr)
	            end_sample = int(vad_end_sec * sr)
	            utt_audio = waveform[:, start_sample:end_sample]
	            if utt_audio.shape[-1] == 0:
	                # Timestamps beyond the end of the file: nothing to export.
	                continue

	            # Resample to target if needed
	            if to_target is not None:
	                utt_audio = to_target(utt_audio)

	            # Normalize volume (peak normalize to -3 dBFS)
	            peak = utt_audio.abs().max()
	            if peak > 0:
	                utt_audio = utt_audio * (target_peak / peak)

	            # Save
	            utt_name = f"{wav_path.stem}_utt{utt_idx:05d}.wav"
	            utt_path = output_dir / utt_name
	            torchaudio.save(str(utt_path), utt_audio, SAMPLE_RATE)

	            utterances.append({
	                "path": str(utt_path),
	                "filename": utt_name,
	                "start": round(vad_start_sec, 3),
	                "end": round(vad_end_sec, 3),
	                "duration": round(duration, 3),
	            })
	            utt_idx += 1

	    log.info(f" [vad] Extracted {len(utterances)} utterances "
	             f"({sum(u['duration'] for u in utterances)/60:.1f} min)")
	    return utterances


	def _merge_vad_segments(segments: List[Dict], sr: int = 16000,
	gap_ms: int = 500) -> List[Dict]:
	"""Merge VAD segments that are close together."""
	if not segments:
	return []

	gap_samples = int(gap_ms * sr / 1000)
	merged = [{"start": segments[0]["start"], "end": segments[0]["end"]}]

	for seg in segments[1:]:
	if seg["start"] - merged[-1]["end"] < gap_samples:
	merged[-1]["end"] = seg["end"]
	else:
	merged.append({"start": seg["start"], "end": seg["end"]})

	# Split segments that are too long
	final = []
	for seg in merged:
	duration_sec = (seg["end"] - seg["start"]) / sr
	if duration_sec > MAX_SEGMENT_SEC:
	# Split into chunks at MAX_SEGMENT_SEC boundaries
	chunk_samples = int(MAX_SEGMENT_SEC * sr)
	pos = seg["start"]
	while pos < seg["end"]:
	end = min(pos + chunk_samples, seg["end"])
	if (end - pos) / sr >= MIN_SEGMENT_SEC:
	final.append({"start": pos, "end": end})
	pos = end
	else:
	final.append(seg)

	return final


	# ============================================================
	# STEP 6: ASR Transcription (Whisper large-v3)
	# ============================================================
	def transcribe_utterances(utterances: List[Dict],
	                          model_size: str = "large-v3") -> List[Dict]:
	    """
	    Transcribe utterances using Whisper, forcing Sinhala (``language="si"``).

	    Uses faster-whisper (CTranslate2 backend) if available,
	    falls back to openai-whisper.

	    Each utterance dict (needs ``"path"`` and ``"filename"``) is updated in
	    place with ``"text"`` and ``"language_prob"``; a failed utterance gets
	    ``""`` and ``0.0``. The same list is returned. If neither backend is
	    installed the list is returned unchanged.
	    """
	    # Try faster-whisper first (2-4x faster)
	    try:
	        from faster_whisper import WhisperModel

	        log.info(f" [asr] Loading faster-whisper {model_size}...")
	        device = "cuda" if torch.cuda.is_available() else "cpu"
	        compute_type = "float16" if device == "cuda" else "int8"
	        model = WhisperModel(model_size, device=device, compute_type=compute_type)

	        for utt in tqdm(utterances, desc="Transcribing"):
	            try:
	                segments, info = model.transcribe(
	                    utt["path"],
	                    language="si",
	                    beam_size=5,
	                    best_of=5,
	                    temperature=0.0,
	                    condition_on_previous_text=False,
	                    vad_filter=False, # we already did VAD
	                )
	                text = " ".join(seg.text.strip() for seg in segments)
	                utt["text"] = text.strip()
	                utt["language_prob"] = info.language_probability
	            except Exception as e:
	                utt["text"] = ""
	                utt["language_prob"] = 0.0
	                log.warning(f" [asr] Failed on {utt['filename']}: {e}")

	        return utterances

	    except ImportError:
	        pass

	    # Fallback: openai-whisper
	    try:
	        import whisper

	        log.info(f" [asr] Loading whisper {model_size}...")
	        device = "cuda" if torch.cuda.is_available() else "cpu"
	        model = whisper.load_model(model_size, device=device)

	        for utt in tqdm(utterances, desc="Transcribing"):
	            try:
	                result = model.transcribe(
	                    utt["path"],
	                    language="si",
	                    beam_size=5,
	                    best_of=5,
	                    temperature=0.0,
	                    condition_on_previous_text=False,
	                    no_speech_threshold=0.6,
	                )
	                utt["text"] = result["text"].strip()
	                # openai-whisper reports the language as a plain code string,
	                # not a probability mapping; calling .get() on it raised and
	                # the handler below then blanked every transcript. The
	                # language is forced here, so record full confidence when the
	                # result confirms it.
	                utt["language_prob"] = 1.0 if result.get("language") == "si" else 0.0
	            except Exception as e:
	                utt["text"] = ""
	                utt["language_prob"] = 0.0
	                log.warning(f" [asr] Failed on {utt['filename']}: {e}")

	        return utterances

	    except ImportError:
	        log.error(" [asr] Neither faster-whisper nor openai-whisper installed!")
	        log.error(" Install: pip install faster-whisper (recommended)")
	        log.error(" or: pip install openai-whisper")
	        return utterances


	# ============================================================
	# STEP 7: Quality Filtering
	# ============================================================
	def compute_snr(wav_path: str) -> float:
	    """Compute approximate SNR (dB) using RMS energy thresholding.

	    Frames at or below the 20th percentile of frame RMS count as noise, the
	    rest as speech; the result is 20*log10(mean speech RMS / mean noise RMS).

	    Returns:
	        0.0 when no frame rises above the noise floor (flat or silent audio,
	        which would otherwise yield NaN and slip past threshold checks);
	        40.0 when the noise floor is effectively zero.
	    """
	    import librosa
	    y, _ = librosa.load(wav_path, sr=SAMPLE_RATE, mono=True)
	    rms = librosa.feature.rms(y=y, frame_length=2048, hop_length=512)[0]
	    threshold = np.percentile(rms, 20)
	    noise = rms[rms <= threshold]
	    speech = rms[rms > threshold]

	    if speech.size == 0:
	        # Every frame sits on the floor: there is no signal above the noise.
	        return 0.0

	    if noise.size > 0:
	        noise_level = np.mean(noise)
	        if noise_level > 1e-10:
	            return float(20 * np.log10(np.mean(speech) / noise_level))
	    return 40.0


	def compute_pitch_stats(wav_path: str) -> Tuple[float, float]:
	    """Return ``(mean, std)`` of the pYIN f0 track (50-500 Hz search range).

	    Only frames with a defined (non-NaN) f0 are used; ``(0.0, 0.0)`` is
	    returned when there are none.
	    """
	    import librosa

	    samples, rate = librosa.load(wav_path, sr=SAMPLE_RATE, mono=True)
	    f0, _, _ = librosa.pyin(samples, fmin=50, fmax=500, sr=rate)
	    voiced_f0 = f0[np.logical_not(np.isnan(f0))]

	    if len(voiced_f0) == 0:
	        return 0.0, 0.0
	    return float(np.mean(voiced_f0)), float(np.std(voiced_f0))


	def compute_speaking_rate(text: str, duration: float) -> float:
	    """Return counted characters per second of ``duration``.

	    Every code point of ``text`` is counted except whitespace and the ASCII
	    punctuation ``!?.,;:"'()-`` (so combining signs count individually).
	    Returns 0.0 when ``duration`` is not positive.
	    """
	    skipped = set("!?.,;:\"'()-")
	    counted = sum(1 for ch in text if ch.strip() and ch not in skipped)
	    return counted / duration if duration > 0 else 0.0


	def filter_utterances(utterances: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
	    """
	    Apply quality filters based on IndicVoices-R and Emilia-Pipe thresholds.

	    Each utterance dict (needs ``"path"`` and ``"duration"``; ``"text"`` and
	    ``"language_prob"`` come from transcription) is annotated in place with
	    ``snr_db``, ``pitch_mean_hz``, ``pitch_std_hz``, ``speech_ratio`` and, when
	    text is present, ``speaking_rate``. Rejected utterances additionally get
	    ``reject_reasons``.

	    Returns (kept, rejected) lists. An empty input yields two empty lists.
	    """
	    import librosa

	    kept = []
	    rejected = []

	    for utt in tqdm(utterances, desc="Quality filtering"):
	        reasons = []
	        text = utt.get("text") or ""  # tolerate a missing or None transcript

	        # Skip empty transcriptions
	        if not text.strip():
	            reasons.append("empty_text")

	        # Skip very low language probability
	        if utt.get("language_prob", 0) < 0.5:
	            reasons.append(f"low_lang_prob={utt.get('language_prob', 0):.2f}")

	        # Duration check (should already be filtered, but double-check)
	        if utt["duration"] < MIN_SEGMENT_SEC or utt["duration"] > MAX_SEGMENT_SEC:
	            reasons.append(f"duration={utt['duration']:.1f}s")

	        # SNR check
	        try:
	            snr = compute_snr(utt["path"])
	            utt["snr_db"] = round(snr, 1)
	            if snr < SNR_THRESHOLD:
	                reasons.append(f"low_snr={snr:.1f}dB")
	        except Exception:
	            utt["snr_db"] = 0.0
	            reasons.append("snr_failed")

	        # Pitch check (detect multi-speaker leakage or non-speech).
	        # A failed pitch analysis records zeros but does not reject.
	        try:
	            pitch_mean, pitch_std = compute_pitch_stats(utt["path"])
	            utt["pitch_mean_hz"] = round(pitch_mean, 1)
	            utt["pitch_std_hz"] = round(pitch_std, 1)
	            if pitch_mean > PITCH_MEAN_MAX:
	                reasons.append(f"high_pitch={pitch_mean:.0f}Hz")
	            if pitch_std > PITCH_STD_MAX:
	                reasons.append(f"high_pitch_var={pitch_std:.0f}Hz")
	        except Exception:
	            utt["pitch_mean_hz"] = 0.0
	            utt["pitch_std_hz"] = 0.0

	        # Speaking rate check
	        if text:
	            rate = compute_speaking_rate(text, utt["duration"])
	            utt["speaking_rate"] = round(rate, 1)
	            if rate > SPEAKING_RATE_MAX:
	                reasons.append(f"fast_speech={rate:.1f}c/s")
	            if rate < 1.0 and utt["duration"] > 3.0:
	                reasons.append(f"slow_speech={rate:.1f}c/s")

	        # Speech ratio (check for silence-heavy segments).
	        # NOTE(review): the threshold is the 20th percentile of this same
	        # utterance's RMS, so roughly 80% of frames exceed it by construction
	        # and the check only fires when many frames tie at the floor. An
	        # absolute energy threshold may be what was intended - confirm.
	        try:
	            y, _ = librosa.load(utt["path"], sr=SAMPLE_RATE, mono=True)
	            rms = librosa.feature.rms(y=y, frame_length=2048, hop_length=512)[0]
	            threshold = np.percentile(rms, 20)
	            speech_ratio = float(np.sum(rms > threshold) / len(rms))
	            utt["speech_ratio"] = round(speech_ratio, 3)
	            if speech_ratio < MIN_SPEECH_RATIO:
	                reasons.append(f"low_speech_ratio={speech_ratio:.2f}")
	        except Exception:
	            utt["speech_ratio"] = 0.0

	        if reasons:
	            utt["reject_reasons"] = reasons
	            rejected.append(utt)
	        else:
	            kept.append(utt)

	    # Guard the percentage: an empty batch used to raise ZeroDivisionError.
	    total = len(utterances)
	    kept_pct = len(kept) / total * 100 if total else 0.0
	    log.info(f" [filter] Kept {len(kept)}/{total} "
	             f"({kept_pct:.1f}%)")

	    # Log rejection stats
	    if rejected:
	        reason_counts = {}
	        for utt in rejected:
	            for reason in utt.get("reject_reasons", []):
	                key = reason.split("=")[0]
	                reason_counts[key] = reason_counts.get(key, 0) + 1
	        log.info(f" [filter] Rejection reasons: {reason_counts}")

	    return kept, rejected


	# ============================================================
	# STEP 8: Export as LJSpeech Format
	# ============================================================
	def export_dataset(utterances: List[Dict], output_dir: Path,
	                   val_split: float = 0.05):
	    """
	    Export as LJSpeech-format dataset for Coqui-TTS FastPitch training.

	    Creates:
	    output_dir/
	        wavs/ - all WAV files (22050 Hz, mono)
	        metadata.csv - full dataset: filename|text|normalized_text
	        metadata_train.csv
	        metadata_val.csv
	        dataset_stats.json - corpus statistics

	    Args:
	        utterances: dicts with ``"path"``, ``"duration"`` and ``"text"``
	            (``"pitch_mean_hz"`` is used for statistics when present).
	        output_dir: dataset root; created if missing.
	        val_split: fraction of rows written to the validation file.

	    Utterances without text, or whose audio is available neither at the
	    source path nor in ``wavs/``, are skipped. Statistics describe the
	    exported rows only.

	    Returns:
	        The statistics dict that is also written to ``dataset_stats.json``.
	    """
	    import random
	    import shutil

	    output_dir.mkdir(parents=True, exist_ok=True)
	    wavs_dir = output_dir / "wavs"
	    wavs_dir.mkdir(exist_ok=True)

	    metadata = []
	    exported = []

	    for i, utt in enumerate(tqdm(utterances, desc="Exporting")):
	        # "|" is the field separator and rows are newline-delimited, so neither
	        # may appear inside the text; whitespace runs collapse to one space.
	        text = " ".join((utt.get("text") or "").replace("|", " ").split())
	        if not text:
	            continue

	        new_name = f"si_{i:06d}"
	        new_path = wavs_dir / f"{new_name}.wav"
	        src = Path(utt["path"])
	        if src.exists():
	            # Always overwrite: a file left by an earlier run under the same
	            # sequential name may hold different audio than this text.
	            shutil.copy2(str(src), str(new_path))
	        elif not new_path.exists():
	            continue

	        # Normalize text: basic Sinhala text cleaning
	        normalized = _normalize_sinhala_text(text)
	        metadata.append(f"{new_name}|{text}|{normalized}")
	        exported.append(utt)

	    # Shuffle and split with a private RNG (same order as seeding the global
	    # generator with 42, without disturbing it for other code).
	    random.Random(42).shuffle(metadata)

	    n_total = len(metadata)
	    if n_total > 1:
	        # At least one validation row, but never an empty training set.
	        n_val = min(max(1, int(n_total * val_split)), n_total - 1)
	    else:
	        n_val = 0
	    val_lines = metadata[:n_val]
	    train_lines = metadata[n_val:]

	    # Write metadata files (an empty list gives an empty file)
	    for filename, lines in (("metadata.csv", metadata),
	                            ("metadata_train.csv", train_lines),
	                            ("metadata_val.csv", val_lines)):
	        with open(output_dir / filename, "w", encoding="utf-8") as f:
	            f.write("".join(line + "\n" for line in lines))

	    # Compute corpus statistics over what was actually exported
	    durations = [u["duration"] for u in exported]
	    has_data = bool(durations)
	    stats = {
	        "total_utterances": n_total,
	        "train_utterances": len(train_lines),
	        "val_utterances": len(val_lines),
	        "total_hours": round(sum(durations) / 3600, 2),
	        "mean_duration_sec": round(float(np.mean(durations)), 2) if has_data else 0.0,
	        "median_duration_sec": round(float(np.median(durations)), 2) if has_data else 0.0,
	        "min_duration_sec": round(min(durations), 2) if has_data else 0.0,
	        "max_duration_sec": round(max(durations), 2) if has_data else 0.0,
	        "sample_rate": SAMPLE_RATE,
	    }

	    # Pitch stats across corpus
	    pitches = [u.get("pitch_mean_hz", 0) for u in exported if u.get("pitch_mean_hz", 0) > 0]
	    if pitches:
	        stats["corpus_pitch_mean_hz"] = round(float(np.mean(pitches)), 1)
	        stats["corpus_pitch_std_hz"] = round(float(np.std(pitches)), 1)

	    with open(output_dir / "dataset_stats.json", "w", encoding="utf-8") as f:
	        json.dump(stats, f, indent=2)

	    log.info(f"\n{'='*60}")
	    log.info(f"DATASET EXPORTED")
	    log.info(f"{'='*60}")
	    log.info(f" Location: {output_dir}")
	    log.info(f" Total: {stats['total_utterances']} utterances")
	    log.info(f" Train: {stats['train_utterances']}")
	    log.info(f" Val: {stats['val_utterances']}")
	    log.info(f" Duration: {stats['total_hours']} hours")
	    log.info(f" Mean length: {stats['mean_duration_sec']}s")
	    if 'corpus_pitch_mean_hz' in stats:
	        log.info(f" Pitch mean: {stats['corpus_pitch_mean_hz']} Hz")
	        log.info(f" Pitch std: {stats['corpus_pitch_std_hz']} Hz")
	    log.info(f"{'='*60}")

	    return stats


	def _normalize_sinhala_text(text: str) -> str:
	"""
	Basic Sinhala text normalization for TTS.

	- Unicode NFC normalization (canonical decomposition → composition)
	- Remove zero-width characters (except ZWJ which forms conjuncts)
	- Normalize punctuation
	- Collapse whitespace
	"""
	import unicodedata

	# NFC normalization (critical for Brahmic scripts)
	text = unicodedata.normalize('NFC', text)

	# Remove zero-width non-joiner (ZWNJ) but keep ZWJ (conjunct former)
	text = text.replace('\u200C', '') # ZWNJ
	# Keep \u200D (ZWJ) — it's part of Sinhala conjunct consonants like ක්‍ෂ

	# Normalize quotation marks
	text = text.replace('"', '"').replace('"', '"')
	text = text.replace(''', "'").replace(''', "'")

	# Replace semicolons and colons with commas (IndicTTS convention)
	text = text.replace(';', ',').replace(':', ',')

	# Remove parentheses (but keep content)
	text = text.replace('(', '').replace(')', '')
	text = text.replace('[', '').replace(']', '')

	# Collapse whitespace
	text = ' '.join(text.split())

	return text.strip()


	# ============================================================
	# MAIN
	# ============================================================
	def parse_args():
	    """Parse the command line (``sys.argv``) and return the argparse namespace.

	    Exactly one input source is required: ``--video-list``, ``--video-id`` or
	    ``--audio-dir``. ``--batch-size`` must be at least 1, because it is used
	    as the step when the video list is cut into batches; invalid values are
	    rejected here with a usage error.
	    """
	    def positive_int(raw: str) -> int:
	        # A non-numeric value raises ValueError, which argparse reports itself.
	        value = int(raw)
	        if value < 1:
	            raise argparse.ArgumentTypeError(f"must be >= 1, got {value}")
	        return value

	    parser = argparse.ArgumentParser(
	        description="Sinhala TTS Data Pipeline",
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	    )

	    # Input sources (pick one)
	    input_group = parser.add_mutually_exclusive_group(required=True)
	    input_group.add_argument("--video-list", type=str,
	                             help="JSON file with video list (from list_unlimited_history.py)")
	    input_group.add_argument("--video-id", type=str,
	                             help="Single YouTube video ID")
	    input_group.add_argument("--audio-dir", type=str,
	                             help="Directory of pre-downloaded audio files")

	    # Output
	    parser.add_argument("--output-dir", type=str, default="sinhala_tts_dataset",
	                        help="Output directory (default: sinhala_tts_dataset)")

	    # Pipeline options
	    parser.add_argument("--num-speakers", type=int, default=2,
	                        help="Expected number of speakers per video (default: 2)")
	    parser.add_argument("--target-speaker", type=str, default=None,
	                        help="Speaker label to extract (default: auto-select dominant)")
	    parser.add_argument("--whisper-model", type=str, default="large-v3",
	                        choices=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"],
	                        help="Whisper model size (default: large-v3)")
	    parser.add_argument("--hf-token", type=str, default=None,
	                        help="HuggingFace token for pyannote (optional)")

	    # Skip options
	    parser.add_argument("--skip-separation", action="store_true",
	                        help="Skip source separation step")
	    parser.add_argument("--skip-enhancement", action="store_true",
	                        help="Skip audio enhancement step")
	    parser.add_argument("--skip-diarization", action="store_true",
	                        help="Skip diarization (treat all audio as one speaker)")
	    parser.add_argument("--skip-transcription", action="store_true",
	                        help="Skip Whisper transcription (need pre-existing transcripts)")

	    # Control
	    parser.add_argument("--resume", action="store_true",
	                        help="Resume from checkpoint (skip completed steps)")
	    parser.add_argument("--max-videos", type=int, default=None,
	                        help="Process only first N videos (for testing)")
	    parser.add_argument("--batch-size", type=positive_int, default=10,
	                        help="Process videos in batches of N (default: 10)")
	    parser.add_argument("--use-unlimited-history-only", action="store_true",
	                        help="Only use Unlimited History videos from the list")

	    return parser.parse_args()


	def _resolve_videos(args: argparse.Namespace) -> List[Dict]:
	    """Build the list of video entries selected by the CLI arguments.

	    Exactly one input source is used, checked in this order:
	    ``--video-list`` (JSON file), ``--video-id`` (single video),
	    ``--audio-dir`` (folder of ``*.wav`` files).

	    Args:
	        args: Parsed command-line arguments.

	    Returns:
	        A list of dicts with at least an ``"id"`` key. Entries built from
	        ``--audio-dir`` also carry ``"_local_path"``. The list is empty
	        when no input option was given or nothing was found.
	    """
	    # Start from an empty list so the caller can always test `not videos`,
	    # even when none of the input options was supplied.
	    videos: List[Dict] = []

	    if args.video_list:
	        # Read as UTF-8 explicitly, matching how this script writes its own
	        # JSON outputs, instead of relying on the platform default encoding.
	        with open(args.video_list, encoding="utf-8") as f:
	            data = json.load(f)

	        if args.use_unlimited_history_only:
	            videos = data.get("unlimited_history", [])
	            log.info(f"Using {len(videos)} Unlimited History videos")
	        else:
	            videos = data.get("unlimited_history", []) + data.get("other", [])
	            log.info(f"Using all {len(videos)} videos")

	        if args.max_videos:
	            videos = videos[:args.max_videos]
	            log.info(f"Limited to first {args.max_videos} videos")

	    elif args.video_id:
	        videos = [{"id": args.video_id, "title": args.video_id}]

	    elif args.audio_dir:
	        audio_dir = Path(args.audio_dir)
	        videos = [
	            {"id": wav.stem, "title": wav.stem, "_local_path": str(wav)}
	            for wav in sorted(audio_dir.glob("*.wav"))
	        ]
	        log.info(f"Found {len(videos)} audio files in {audio_dir}")

	    return videos


	def _diarize_with_cache(vid_id: str, audio_path: Path, state_dir: Path,
	                        state, args: argparse.Namespace):
	    """Return diarization results for one video, reusing the cached JSON on resume.

	    The cached file ``<state_dir>/<vid_id>_diarization.json`` is loaded only
	    when ``--resume`` is set, the step is marked done and the file exists.
	    In every other case diarization is run, and the result is written to
	    that file and the step marked done, so a later resume can reuse it.

	    Args:
	        vid_id: Video identifier used for the checkpoint and cache file name.
	        audio_path: Audio file handed to ``diarize_audio``.
	        state_dir: Directory holding the per-video diarization JSON files.
	        state: Checkpoint tracker providing ``is_done`` / ``mark_done``.
	        args: Parsed command-line arguments.

	    Returns:
	        Whatever ``diarize_audio`` produced (or its JSON round-trip).
	    """
	    diar_file = state_dir / f"{vid_id}_diarization.json"

	    if args.resume and state.is_done(vid_id, "diarization") and diar_file.exists():
	        with open(diar_file, encoding="utf-8") as f:
	            return json.load(f)

	    speakers = diarize_audio(audio_path, args.num_speakers, args.hf_token)

	    # Persist the result even when we got here because the cache file was
	    # missing, otherwise every resumed run would diarize this video again.
	    diar_file.parent.mkdir(parents=True, exist_ok=True)
	    with open(diar_file, "w", encoding="utf-8") as f:
	        json.dump(speakers, f, indent=2)
	    state.mark_done(vid_id, "diarization")
	    return speakers


	def _process_video(v: Dict, args: argparse.Namespace, state,
	                   output_dir: Path) -> List[Dict]:
	    """Run pipeline steps 1-6 for a single video entry.

	    Steps: download (or use a local file), source separation, enhancement,
	    diarization, VAD segmentation and transcription. Steps are skipped
	    according to the ``--skip-*`` flags and, with ``--resume``, according
	    to the checkpoints recorded in ``state``.

	    Args:
	        v: Video entry with an ``"id"`` key and optional ``"title"`` and
	            ``"_local_path"`` keys.
	        args: Parsed command-line arguments.
	        state: Checkpoint tracker providing ``is_done`` / ``mark_done``.
	        output_dir: Root directory under which the working folders live.

	    Returns:
	        The utterances produced for this video, or an empty list when no
	        audio could be obtained (the video is then not marked complete).
	    """
	    raw_dir = output_dir / "raw_audio"
	    separated_dir = output_dir / "separated"
	    enhanced_dir = output_dir / "enhanced"
	    segments_dir = output_dir / "segments"

	    vid_id = v["id"]
	    # Fall back to the id when the title is missing or null, so the
	    # slice below cannot fail on None.
	    title = v.get("title") or vid_id
	    log.info(f"\n--- Processing: {title[:60]} ({vid_id}) ---")

	    # Step 1: Download (or use local file)
	    if "_local_path" in v:
	        wav_path = Path(v["_local_path"])
	    elif args.resume and state.is_done(vid_id, "download"):
	        wav_path = raw_dir / f"{vid_id}.wav"
	    else:
	        wav_path = download_video(vid_id, raw_dir)
	        if wav_path:
	            state.mark_done(vid_id, "download")

	    if not wav_path or not wav_path.exists():
	        log.warning(f" Skipping {vid_id}: no audio")
	        return []

	    current_audio = wav_path

	    # Step 2: Source separation
	    if not args.skip_separation:
	        if args.resume and state.is_done(vid_id, "separation"):
	            current_audio = separated_dir / f"{vid_id}_vocals.wav"
	            if not current_audio.exists():
	                # Checkpoint says done but the file is gone: use the raw audio.
	                current_audio = wav_path
	        else:
	            result = separate_vocals(wav_path, separated_dir)
	            if result:
	                current_audio = result
	                state.mark_done(vid_id, "separation")

	    # Step 3: Audio enhancement
	    if not args.skip_enhancement:
	        if args.resume and state.is_done(vid_id, "enhancement"):
	            enh_path = enhanced_dir / f"{current_audio.stem}_enhanced.wav"
	            if enh_path.exists():
	                current_audio = enh_path
	        else:
	            result = enhance_audio(current_audio, enhanced_dir)
	            if result:
	                current_audio = result
	                state.mark_done(vid_id, "enhancement")

	    # Step 4: Speaker diarization
	    if not args.skip_diarization:
	        speakers = _diarize_with_cache(
	            vid_id, current_audio, output_dir / ".state", state, args
	        )
	        target = select_target_speaker(speakers, args.target_speaker)
	        speaker_segments = speakers[target]
	    else:
	        # No diarization: treat the full audio as one speaker segment.
	        # Imported lazily so librosa is only loaded when this path runs.
	        import librosa
	        dur = librosa.get_duration(path=str(current_audio))
	        speaker_segments = [
	            {"start": 0.0, "end": round(dur, 3), "duration": round(dur, 3)}
	        ]

	    # Step 5: VAD segmentation
	    utterances = segment_with_vad(current_audio, speaker_segments, segments_dir / vid_id)

	    # Step 6: ASR transcription
	    if not args.skip_transcription and utterances:
	        utterances = transcribe_utterances(utterances, args.whisper_model)

	    state.mark_done(vid_id, "complete")
	    return utterances


	def main():
	    """Command-line entry point: run the full extraction pipeline.

	    Parses the arguments, resolves the input videos, processes them in
	    batches (steps 1-6 per video), then filters all collected utterances
	    (step 7) and exports the kept ones as a dataset (step 8). Rejected
	    utterances and the full manifest of kept utterances are written as
	    JSON files into the output directory.

	    Exits with status 1 when there are no videos to process.
	    """
	    args = parse_args()

	    output_dir = Path(args.output_dir)
	    dataset_dir = output_dir / "dataset"

	    state = PipelineState(str(output_dir / ".state"))

	    log.info("=" * 60)
	    log.info("Sinhala TTS Data Pipeline")
	    log.info("=" * 60)
	    log.info(f"Output: {output_dir}")
	    log.info(f"Device: {'CUDA' if torch.cuda.is_available() else 'CPU'}")
	    if torch.cuda.is_available():
	        log.info(f"GPU: {torch.cuda.get_device_name(0)}")
	    log.info("")

	    # ---- Resolve input ----
	    videos = _resolve_videos(args)
	    if not videos:
	        log.error("No videos to process!")
	        sys.exit(1)

	    # ---- Process in batches ----
	    all_utterances = []
	    # A zero step would make range() raise and a negative one would skip
	    # every video, so clamp the batch size to at least 1.
	    batch_size = max(1, args.batch_size)
	    total_batches = (len(videos) + batch_size - 1) // batch_size

	    for batch_num, batch_start in enumerate(range(0, len(videos), batch_size), start=1):
	        batch = videos[batch_start:batch_start + batch_size]

	        log.info(f"\n{'='*60}")
	        log.info(f"BATCH {batch_num}/{total_batches} ({len(batch)} videos)")
	        log.info(f"{'='*60}")

	        for v in batch:
	            all_utterances.extend(_process_video(v, args, state, output_dir))

	        log.info(f" Total utterances so far: {len(all_utterances)} "
	                 f"({sum(u['duration'] for u in all_utterances)/3600:.1f}h)")

	    # ---- Step 7: Quality filtering ----
	    log.info(f"\n{'='*60}")
	    log.info(f"QUALITY FILTERING ({len(all_utterances)} utterances)")
	    log.info(f"{'='*60}")

	    if not all_utterances:
	        log.error("No utterances extracted! Check logs above for errors.")
	        return

	    kept, rejected = filter_utterances(all_utterances)

	    # Save rejected for inspection
	    with open(output_dir / "rejected_utterances.json", "w", encoding="utf-8") as f:
	        json.dump(rejected, f, indent=2, ensure_ascii=False)

	    # ---- Step 8: Export dataset ----
	    stats = export_dataset(kept, dataset_dir)

	    # Save full manifest
	    with open(output_dir / "full_manifest.json", "w", encoding="utf-8") as f:
	        json.dump(kept, f, indent=2, ensure_ascii=False)

	    log.info(f"\n{'='*60}")
	    log.info("PIPELINE COMPLETE")
	    log.info(f"{'='*60}")
	    log.info(f" Raw utterances: {len(all_utterances)}")
	    log.info(f" After filtering: {len(kept)}")
	    log.info(f" Rejected: {len(rejected)}")
	    log.info(f" Dataset: {dataset_dir}")
	    log.info(f" Next step: python scripts/train_fastpitch.py --dataset {dataset_dir}")


	# Run the pipeline only when this file is executed as a script, not when
	# it is imported as a module.
	if __name__ == "__main__":
	main()