sinhala-tts / scripts /cloud_pipeline.py

feat(macos): implement Apple Silicon optimizations and switch to wav2vec2 ASR

1a2a2b3 about 1 month ago

33.9 kB

	#!/usr/bin/env python3
	"""
	=============================================================
	Sinhala TTS - Phase 2: Cloud GPU Processing Pipeline
	=============================================================
	Runs on HF Jobs. Reads raw audio from HF dataset repo,
	processes through the full pipeline, and pushes results back.

	Pipeline:
	1. Download raw audio from HF dataset repo
	2. Source separation (HTDemucs → vocals only)
	3. Audio enhancement (VoiceFixer + DeepFilterNet3)
	4. Speaker diarization (pyannote 3.1 / simple-diarizer fallback)
	5. VAD segmentation (Silero-VAD, 3-20s chunks)
	6. ASR transcription (Whisper large-v3)
	7. Quality filtering
	8. Export as LJSpeech-format dataset → push to Hub

	Usage (on HF Jobs - configured via hf_jobs tool):
	python scripts/cloud_pipeline.py \
	--source-repo outlawmold/sinhala-tts-raw-audio \
	--output-repo outlawmold/sinhala-tts-dataset \
	--batch-size 5
	=============================================================
	"""

	import os
	import sys
	import json
	import argparse
	import logging
	import shutil
	import tempfile
	import warnings
	from pathlib import Path
	from typing import Optional, Dict, List, Tuple

	import numpy as np
	import torch
	import torchaudio
	import soundfile as sf
	from tqdm import tqdm

	# Silence every warning category process-wide (applies to all imported libraries too).
	warnings.filterwarnings("ignore")

	# ============================================================
	# CONFIG
	# ============================================================
	# Sample rate (Hz) of every exported utterance WAV.
	SAMPLE_RATE = 22050
	# Sample rate (Hz) the waveform is resampled to before pyannote diarization.
	DIARIZE_SR = 16000
	# Utterances shorter than MIN or longer than MAX seconds are dropped
	# (checked during VAD segmentation and again during quality filtering).
	MIN_SEGMENT_SEC = 3.0
	MAX_SEGMENT_SEC = 20.0

	# Quality thresholds (IndicVoices-R + Emilia-Pipe)
	# Minimum estimated SNR in dB; lower values are rejected by filter_utterances.
	SNR_THRESHOLD = 25.0
	# Upper bounds (Hz) on the mean and standard deviation of the voiced F0 track.
	PITCH_MEAN_MAX = 350.0
	PITCH_STD_MAX = 150.0
	# Maximum speaking rate, in non-punctuation characters per second.
	SPEAKING_RATE_MAX = 30.0
	# Minimum fraction of RMS frames that must exceed the per-file 20th percentile.
	MIN_SPEECH_RATIO = 0.5

	# Log to stdout and to a file under /app.
	# NOTE(review): logging.FileHandler opens its file when constructed, so importing
	# this module presumably requires /app to exist and be writable — confirm for
	# environments other than the job container.
	logging.basicConfig(
	level=logging.INFO,
	format='%(asctime)s [%(levelname)s] %(message)s',
	datefmt='%H:%M:%S',
	handlers=[
	logging.StreamHandler(sys.stdout),
	logging.FileHandler('/app/pipeline.log'),
	]
	)
	# Module-wide logger used by every pipeline step.
	log = logging.getLogger("sinhala-tts-cloud")


	# ============================================================
	# HUB I/O
	# ============================================================
	def get_api():
	    """Build and return a new ``HfApi`` client.

	    ``huggingface_hub`` is imported inside the function so the module can be
	    imported without the dependency being loaded up front.
	    """
	    import huggingface_hub

	    return huggingface_hub.HfApi()


	def download_raw_audio(source_repo: str, work_dir: Path, video_ids: Optional[List[str]] = None) -> List[Path]:
	    """Download raw ``.wav`` files from the ``audio/`` folder of a HF dataset repo.

	    Args:
	        source_repo: Dataset repo id that holds the raw audio.
	        work_dir: Working directory; files end up in ``work_dir / "raw"``.
	        video_ids: Optional list of file stems to restrict the download to.

	    Returns:
	        Local paths of the files that are available after the call (already
	        present or freshly downloaded). Files that fail to download are
	        logged and left out.
	    """
	    from huggingface_hub import hf_hub_download

	    api = get_api()
	    audio_dir = work_dir / "raw"
	    audio_dir.mkdir(parents=True, exist_ok=True)

	    # The tree listing can contain folder entries as well as files, and folder
	    # entries do not expose ``rfilename``. Read the repo path defensively so a
	    # sub-folder under ``audio/`` cannot abort the whole listing.
	    wav_files = []
	    for entry in api.list_repo_tree(source_repo, repo_type="dataset", path_in_repo="audio"):
	        repo_path = getattr(entry, "path", None) or getattr(entry, "rfilename", None)
	        if repo_path and repo_path.endswith(".wav"):
	            wav_files.append(repo_path)
	    log.info(f"Found {len(wav_files)} audio files in {source_repo}")

	    # Filter to specific video IDs if requested
	    if video_ids:
	        wanted = set(video_ids)
	        wav_files = [p for p in wav_files if Path(p).stem in wanted]
	        log.info(f"Filtered to {len(wav_files)} requested videos")

	    downloaded = []
	    for repo_path in wav_files:
	        local_path = audio_dir / Path(repo_path).name
	        if local_path.exists():
	            # Already fetched by an earlier run; reuse it.
	            downloaded.append(local_path)
	            continue
	        try:
	            dl_path = hf_hub_download(
	                repo_id=source_repo,
	                filename=repo_path,
	                repo_type="dataset",
	                local_dir=str(work_dir / "_hub_cache"),
	            )
	            shutil.copy2(dl_path, str(local_path))
	            downloaded.append(local_path)
	        except Exception as e:
	            # Best effort: one bad file must not stop the remaining downloads.
	            log.error(f"Failed to download {repo_path}: {e}")

	    log.info(f"Downloaded {len(downloaded)} audio files")
	    return downloaded


	def load_processing_state(output_repo: str) -> dict:
	    """Load ``processing_state.json`` from the output dataset repo.

	    Returns:
	        The stored state merged over the default keys ``completed_videos``,
	        ``total_utterances`` and ``total_hours``, so callers can index those
	        keys even when the stored file is older or partial. If the file cannot
	        be fetched or parsed, a fresh default state is returned and the reason
	        is logged.
	    """
	    default_state = {"completed_videos": [], "total_utterances": 0, "total_hours": 0.0}
	    api = get_api()
	    try:
	        path = api.hf_hub_download(
	            repo_id=output_repo,
	            filename="processing_state.json",
	            repo_type="dataset",
	        )
	        with open(path, encoding="utf-8") as f:
	            stored = json.load(f)
	        # A non-mapping payload raises here and falls through to the default.
	        return {**default_state, **stored}
	    except Exception as e:
	        # Still best effort (a first run has no state file yet), but say why,
	        # because starting from an empty state re-processes every video.
	        log.warning(f"Could not load processing state from {output_repo} ({e}); starting fresh")
	        return default_state


	def save_processing_state(output_repo: str, state: dict):
	    """Serialize ``state`` as JSON and upload it as ``processing_state.json``.

	    The commit message summarizes the number of completed videos and the
	    total hours recorded in ``state``.
	    """
	    client = get_api()
	    payload = json.dumps(state, indent=2).encode("utf-8")
	    summary = f"Update state: {len(state['completed_videos'])} videos, {state['total_hours']:.1f}h"
	    client.upload_file(
	        path_or_fileobj=payload,
	        path_in_repo="processing_state.json",
	        repo_id=output_repo,
	        repo_type="dataset",
	        commit_message=summary,
	    )


	def upload_utterances_batch(
	    utterances: List[dict],
	    output_repo: str,
	    video_id: str,
	):
	    """Upload processed utterances (WAV + metadata) for one video in one commit.

	    Each utterance whose ``path`` exists locally is added under ``wavs/``; the
	    full utterance list is added as ``metadata/<video_id>.json``.

	    Returns:
	        ``True`` if the commit succeeded, ``False`` if it failed. The failure
	        is logged rather than raised, as before; the return value lets callers
	        avoid recording a video as done when nothing was pushed.
	    """
	    from huggingface_hub import CommitOperationAdd

	    api = get_api()

	    # One add-operation per WAV that is actually present on disk.
	    operations = [
	        CommitOperationAdd(
	            path_in_repo=f"wavs/{wav_path.name}",
	            path_or_fileobj=str(wav_path),
	        )
	        for wav_path in (Path(utt["path"]) for utt in utterances)
	        if wav_path.exists()
	    ]

	    # Also upload per-video metadata
	    meta_bytes = json.dumps(utterances, indent=2, ensure_ascii=False).encode("utf-8")
	    operations.append(
	        CommitOperationAdd(
	            path_in_repo=f"metadata/{video_id}.json",
	            path_or_fileobj=meta_bytes,
	        )
	    )

	    try:
	        api.create_commit(
	            repo_id=output_repo,
	            repo_type="dataset",
	            operations=operations,
	            commit_message=f"Add {len(utterances)} utterances from {video_id}",
	        )
	    except Exception as e:
	        log.error(f" [upload] Failed to push {video_id}: {e}")
	        return False

	    log.info(f" [upload] Pushed {len(utterances)} utterances for {video_id}")
	    return True


	def upload_final_dataset(
	    all_utterances: List[dict],
	    dataset_dir: Path,
	    output_repo: str,
	    stats: dict,
	):
	    """Push the metadata CSVs, the stats JSON and a generated README in one commit.

	    Only the CSV files that exist in ``dataset_dir`` are included. A failed
	    commit is logged, not raised. ``all_utterances`` is accepted for interface
	    compatibility but is not read here.
	    """
	    from huggingface_hub import CommitOperationAdd

	    api = get_api()

	    # Metadata CSVs that were actually written by the export step.
	    csv_names = ("metadata.csv", "metadata_train.csv", "metadata_val.csv")
	    operations = [
	        CommitOperationAdd(path_in_repo=name, path_or_fileobj=str(dataset_dir / name))
	        for name in csv_names
	        if (dataset_dir / name).exists()
	    ]

	    # Dataset statistics, serialized in memory.
	    operations.append(
	        CommitOperationAdd(
	            path_in_repo="dataset_stats.json",
	            path_or_fileobj=json.dumps(stats, indent=2).encode("utf-8"),
	        )
	    )

	    # Dataset card.
	    card_text = _generate_dataset_readme(stats)
	    operations.append(
	        CommitOperationAdd(
	            path_in_repo="README.md",
	            path_or_fileobj=card_text.encode("utf-8"),
	        )
	    )

	    try:
	        # The commit message is built inside the try so that a stats dict
	        # missing a key is logged like any other push failure.
	        api.create_commit(
	            repo_id=output_repo,
	            repo_type="dataset",
	            operations=operations,
	            commit_message=f"Final dataset: {stats['total_utterances']} utterances, {stats['total_hours']}h",
	        )
	        log.info(f"Final dataset pushed to {output_repo}")
	    except Exception as err:
	        log.error(f"Failed to push final dataset: {err}")


	def _generate_dataset_readme(stats: dict) -> str:
	return f"""---
	language:
	- si
	license: cc-by-4.0
	task_categories:
	- text-to-speech
	- automatic-speech-recognition
	pretty_name: Sinhala TTS Dataset (Unlimited History)
	size_categories:
	- 10K<n<100K
	tags:
	- sinhala
	- tts
	- speech
	---

	# Sinhala TTS Dataset

	Clean, segmented Sinhala speech from the "Unlimited History" YouTube series by @sunchare.

	## Dataset Statistics

	\| Metric \| Value \|
	\|---\|---\|
	\| Total utterances \| {stats.get('total_utterances', 'N/A')} \|
	\| Training set \| {stats.get('train_utterances', 'N/A')} \|
	\| Validation set \| {stats.get('val_utterances', 'N/A')} \|
	\| Total hours \| {stats.get('total_hours', 'N/A')} \|
	\| Mean duration \| {stats.get('mean_duration_sec', 'N/A')}s \|
	\| Sample rate \| {stats.get('sample_rate', 22050)} Hz \|

	## Processing Pipeline

	Raw YouTube audio → HTDemucs (source separation) → VoiceFixer + DeepFilterNet3 (enhancement) →
	pyannote 3.1 (diarization) → Silero-VAD (segmentation) → Whisper large-v3 (transcription) →
	Quality filtering (SNR≥25dB, pitch, speaking rate)

	## Format

	LJSpeech-compatible:
	- `wavs/` — mono WAV files at 22050 Hz
	- `metadata.csv` — `filename\|text\|normalized_text`
	- `metadata_train.csv` / `metadata_val.csv` — train/val splits

	## Source

	[NU1's VLOG (@sunchare)](https://www.youtube.com/@sunchare) - "Unlimited History" series on Sri Lankan history.
	"""


	# ============================================================
	# PROCESSING STEPS (adapted from data_pipeline.py)
	# ============================================================

	def separate_vocals(wav_path: Path, output_dir: Path) -> Path:
	    """Extract the vocal stem with HTDemucs and save it as mono at SAMPLE_RATE.

	    Returns the cached ``<stem>_vocals.wav`` if it already exists. On any
	    failure the problem is logged and the original ``wav_path`` is returned.
	    """
	    vocals_path = output_dir / f"{wav_path.stem}_vocals.wav"
	    if vocals_path.exists():
	        return vocals_path

	    try:
	        from demucs.pretrained import get_model
	        from demucs.apply import apply_model

	        separator = get_model("htdemucs")
	        separator.eval()

	        # Prefer CUDA, then Apple MPS, then CPU.
	        if torch.cuda.is_available():
	            backend = "cuda"
	        elif torch.backends.mps.is_available():
	            backend = "mps"
	        else:
	            backend = "cpu"
	        device = torch.device(backend)
	        separator.to(device)

	        audio, in_rate = torchaudio.load(str(wav_path))
	        # Duplicate a mono channel so the model receives two channels.
	        if audio.shape[0] == 1:
	            audio = audio.repeat(2, 1)
	        if in_rate != separator.samplerate:
	            to_model_rate = torchaudio.transforms.Resample(in_rate, separator.samplerate)
	            audio = to_model_rate(audio)

	        batch = audio.unsqueeze(0).to(device)

	        # split=True processes the audio in chunks to avoid OOM on long files.
	        with torch.no_grad():
	            stems = apply_model(separator, batch, device=device, split=True, overlap=0.25)

	        # Stem order: drums, bass, other, vocals.
	        mono_vocals = stems[0, 3].mean(dim=0, keepdim=True)

	        if separator.samplerate != SAMPLE_RATE:
	            to_target_rate = torchaudio.transforms.Resample(separator.samplerate, SAMPLE_RATE)
	            mono_vocals = to_target_rate(mono_vocals)

	        output_dir.mkdir(parents=True, exist_ok=True)
	        torchaudio.save(str(vocals_path), mono_vocals.cpu(), SAMPLE_RATE)
	        log.info(f" [separation] Done: {vocals_path.name}")
	        return vocals_path

	    except Exception as err:
	        log.warning(f" [separation] Failed ({err}), using original audio")
	        return wav_path


	def enhance_audio(wav_path: Path, output_dir: Path) -> Path:
	    """Run VoiceFixer, then DeepFilterNet3, and return the enhanced file.

	    Both stages are best effort. If DeepFilterNet3 fails, the VoiceFixer
	    result (when there is one) is copied to ``<stem>_enhanced.wav``; if
	    neither stage produced anything, the input path is returned unchanged.
	    A cached ``<stem>_enhanced.wav`` is returned immediately.
	    """
	    enhanced_path = output_dir / f"{wav_path.stem}_enhanced.wav"
	    if enhanced_path.exists():
	        return enhanced_path

	    output_dir.mkdir(parents=True, exist_ok=True)
	    stage_input = wav_path

	    # ---- Stage 1: VoiceFixer ----
	    try:
	        from voicefixer import VoiceFixer

	        fixer = VoiceFixer()
	        restored_path = output_dir / f"{wav_path.stem}_vf.wav"
	        fixer.restore(
	            input=str(stage_input),
	            output=str(restored_path),
	            cuda=torch.cuda.is_available(),
	            mode=0,
	        )
	        if restored_path.exists():
	            stage_input = restored_path
	        log.info(" [enhance] VoiceFixer done")
	    except Exception as err:
	        log.warning(f" [enhance] VoiceFixer failed: {err}")

	    # ---- Stage 2: DeepFilterNet3 ----
	    try:
	        from df.enhance import enhance, init_df, load_audio, save_audio

	        df_model, df_state, _ = init_df()
	        noisy, _ = load_audio(str(stage_input), sr=df_state.sr())
	        denoised = enhance(df_model, df_state, noisy)
	        save_audio(str(enhanced_path), denoised, df_state.sr())

	        if enhanced_path.exists():
	            # Bring the DeepFilterNet output to the pipeline sample rate.
	            audio, rate = torchaudio.load(str(enhanced_path))
	            if rate != SAMPLE_RATE:
	                audio = torchaudio.transforms.Resample(rate, SAMPLE_RATE)(audio)
	                torchaudio.save(str(enhanced_path), audio, SAMPLE_RATE)
	            log.info(" [enhance] DeepFilterNet3 done")
	        elif stage_input != wav_path:
	            shutil.copy2(str(stage_input), str(enhanced_path))
	    except Exception as err:
	        log.warning(f" [enhance] DeepFilterNet3 failed: {err}")
	        if stage_input == wav_path:
	            # Neither stage produced output; hand back the input.
	            return wav_path
	        shutil.copy2(str(stage_input), str(enhanced_path))

	    # Remove the VoiceFixer intermediate once a final file exists.
	    leftover = output_dir / f"{wav_path.stem}_vf.wav"
	    if leftover.exists() and enhanced_path.exists() and leftover != enhanced_path:
	        leftover.unlink()

	    if enhanced_path.exists():
	        return enhanced_path
	    return wav_path


	def diarize_audio(wav_path: Path, num_speakers: int = 2) -> Dict[str, List[Dict]]:
	    """Speaker diarization with pyannote 3.1, falling back to simple-diarizer.

	    pyannote is attempted only when ``HF_TOKEN`` is set in the environment.
	    If both diarizers fail, the whole file is returned as one ``SPEAKER_0``
	    turn.

	    Args:
	        wav_path: Audio file to diarize.
	        num_speakers: Number of speakers passed to the diarizer.

	    Returns:
	        Mapping of speaker label to a list of turns, each a dict with
	        ``start``, ``end`` and ``duration`` in seconds (rounded to 3 places).
	    """
	    token = os.environ.get("HF_TOKEN")

	    def _turn(start, end):
	        # One diarization turn in the shape the rest of the pipeline expects.
	        return {
	            "start": round(start, 3),
	            "end": round(end, 3),
	            "duration": round(end - start, 3),
	        }

	    # Try pyannote first
	    if token:
	        try:
	            from pyannote.audio import Pipeline as PyannotePipeline

	            pipeline = PyannotePipeline.from_pretrained(
	                "pyannote/speaker-diarization-3.1",
	                use_auth_token=token,
	            )
	            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	            pipeline.to(device)

	            waveform, sr = torchaudio.load(str(wav_path))
	            if sr != DIARIZE_SR:
	                waveform = torchaudio.transforms.Resample(sr, DIARIZE_SR)(waveform)

	            diarization = pipeline(
	                {"waveform": waveform, "sample_rate": DIARIZE_SR},
	                num_speakers=num_speakers,
	            )

	            speakers = {}
	            for turn, _, speaker in diarization.itertracks(yield_label=True):
	                speakers.setdefault(speaker, []).append(_turn(turn.start, turn.end))

	            log.info(f" [diarize] {len(speakers)} speakers (pyannote 3.1)")
	            return speakers

	        except Exception as e:
	            log.warning(f" [diarize] pyannote failed: {e}")

	    # Fallback: simple-diarizer
	    try:
	        from simple_diarizer.diarizer import Diarizer

	        # soundfile-backed replacement for torchaudio.load, for compatibility
	        # with simple-diarizer.
	        def _fixed_load(uri, frame_offset=0, num_frames=-1, normalize=True, channels_first=True, **kwargs):
	            stop = None if num_frames == -1 else frame_offset + num_frames
	            data, samplerate = sf.read(uri, start=frame_offset, stop=stop, dtype='float32')
	            tensor = torch.from_numpy(data)
	            if tensor.ndim == 1:
	                tensor = tensor.unsqueeze(0)
	            elif channels_first:
	                tensor = tensor.T
	            return tensor, samplerate

	        # Patch only for the duration of the diarizer call and always restore
	        # the real loader, so later pipeline steps keep using torchaudio's own
	        # implementation.
	        original_load = torchaudio.load
	        torchaudio.load = _fixed_load
	        try:
	            diar = Diarizer(embed_model='ecapa', cluster_method='sc')
	            segments = diar.diarize(str(wav_path), num_speakers=num_speakers)
	        finally:
	            torchaudio.load = original_load

	        speakers = {}
	        for seg in segments:
	            speakers.setdefault(str(seg['label']), []).append(_turn(seg['start'], seg['end']))

	        log.info(f" [diarize] {len(speakers)} speakers (simple-diarizer)")
	        return speakers

	    except Exception as e:
	        log.error(f" [diarize] All diarization failed: {e}")
	        import librosa
	        # Last resort: treat the whole file as a single speaker.
	        dur = librosa.get_duration(path=str(wav_path))
	        return {"SPEAKER_0": [{"start": 0.0, "end": round(dur, 3), "duration": round(dur, 3)}]}


	def select_target_speaker(speakers: Dict[str, List[Dict]]) -> str:
	    """Return the label of the speaker with the most total speaking time.

	    Args:
	        speakers: Mapping of speaker label to turns, each with a ``duration``.

	    Raises:
	        ValueError: If ``speakers`` is empty (e.g. the diarizer found no
	            speech), with a message that names the cause.
	    """
	    if not speakers:
	        raise ValueError("no speakers to choose from: diarization returned an empty result")

	    durations = {spk: sum(s["duration"] for s in segs) for spk, segs in speakers.items()}
	    best = max(durations, key=durations.get)
	    log.info(f" [diarize] Target: {best} ({durations[best]/60:.1f}min / {sum(durations.values())/60:.1f}min)")
	    return best


	def segment_with_vad(wav_path: Path, speaker_segments: List[Dict], output_dir: Path) -> List[Dict]:
	    """Cut the target speaker's turns into utterances with Silero-VAD.

	    Each speaker turn is run through VAD at 16 kHz, nearby speech regions are
	    merged/chunked by ``_merge_vad_segments``, and every region lasting between
	    MIN_SEGMENT_SEC and MAX_SEGMENT_SEC is written as a peak-normalized
	    (-3 dBFS) WAV at SAMPLE_RATE.

	    Args:
	        wav_path: Source audio file.
	        speaker_segments: Turns with ``start``/``end`` in seconds.
	        output_dir: Directory that receives the utterance WAVs.

	    Returns:
	        One dict per written utterance with ``path``, ``filename``, ``start``,
	        ``end`` and ``duration`` (seconds, rounded to 3 places).
	    """
	    output_dir.mkdir(parents=True, exist_ok=True)
	    vad_sr = 16000

	    waveform, sr = torchaudio.load(str(wav_path))
	    if waveform.shape[0] > 1:
	        waveform = waveform.mean(dim=0, keepdim=True)

	    if sr != vad_sr:
	        waveform_16k = torchaudio.transforms.Resample(sr, vad_sr)(waveform)
	    else:
	        waveform_16k = waveform

	    # Build the output resampler once instead of once per utterance.
	    to_target_rate = torchaudio.transforms.Resample(sr, SAMPLE_RATE) if sr != SAMPLE_RATE else None

	    vad_model, vad_utils = torch.hub.load(
	        'snakers4/silero-vad', 'silero_vad', force_reload=False, trust_repo=True,
	    )
	    get_speech_timestamps = vad_utils[0]

	    utterances = []
	    utt_idx = 0

	    for seg in speaker_segments:
	        s16 = int(seg["start"] * vad_sr)
	        e16 = int(seg["end"] * vad_sr)
	        seg_audio = waveform_16k[0, s16:e16]

	        # Turns shorter than the minimum utterance length cannot yield anything.
	        if len(seg_audio) < int(MIN_SEGMENT_SEC * vad_sr):
	            continue

	        try:
	            speech_ts = get_speech_timestamps(
	                seg_audio, vad_model, sampling_rate=vad_sr,
	                min_speech_duration_ms=500, min_silence_duration_ms=300,
	                speech_pad_ms=100, return_seconds=False,
	            )
	        except Exception:
	            # If VAD fails, fall back to treating the whole turn as speech.
	            speech_ts = [{"start": 0, "end": len(seg_audio)}]

	        if not speech_ts:
	            continue

	        merged = _merge_vad_segments(speech_ts, sr=vad_sr)

	        for vad_seg in merged:
	            vad_start_sec = seg["start"] + vad_seg["start"] / vad_sr
	            vad_end_sec = seg["start"] + vad_seg["end"] / vad_sr
	            # Derive the duration from the sample count, not from the two
	            # offset timestamps: subtracting offset floats can turn an exact
	            # MAX_SEGMENT_SEC chunk into 20.000000000000004 s and drop it.
	            duration = (vad_seg["end"] - vad_seg["start"]) / vad_sr

	            if duration < MIN_SEGMENT_SEC or duration > MAX_SEGMENT_SEC:
	                continue

	            start_sample = int(vad_start_sec * sr)
	            end_sample = int(vad_end_sec * sr)
	            utt_audio = waveform[:, start_sample:end_sample]

	            if to_target_rate is not None:
	                utt_audio = to_target_rate(utt_audio)

	            # Peak-normalize to -3 dBFS; all-zero audio is left untouched.
	            peak = utt_audio.abs().max()
	            if peak > 0:
	                utt_audio = utt_audio * (10 ** (-3 / 20) / peak)

	            utt_name = f"{wav_path.stem}_utt{utt_idx:05d}.wav"
	            utt_path = output_dir / utt_name
	            torchaudio.save(str(utt_path), utt_audio, SAMPLE_RATE)

	            utterances.append({
	                "path": str(utt_path),
	                "filename": utt_name,
	                "start": round(vad_start_sec, 3),
	                "end": round(vad_end_sec, 3),
	                "duration": round(duration, 3),
	            })
	            utt_idx += 1

	    log.info(f" [vad] {len(utterances)} utterances ({sum(u['duration'] for u in utterances)/60:.1f}min)")
	    return utterances


	def _merge_vad_segments(segments, sr=16000, gap_ms=500):
	    """Merge VAD regions separated by short gaps, then cap their length.

	    Regions (``start``/``end`` in samples) whose gap to the previous region is
	    under ``gap_ms`` are joined. Any joined region longer than
	    MAX_SEGMENT_SEC is cut into consecutive MAX_SEGMENT_SEC pieces; a piece
	    shorter than MIN_SEGMENT_SEC is discarded. Regions within the limit are
	    passed through unchanged.
	    """
	    if not segments:
	        return []

	    max_gap = int(gap_ms * sr / 1000)

	    # Pass 1: join neighbours that are closer than the allowed gap.
	    first, *rest = segments
	    joined = [{"start": first["start"], "end": first["end"]}]
	    for current in rest:
	        previous = joined[-1]
	        if current["start"] - previous["end"] < max_gap:
	            previous["end"] = current["end"]
	            continue
	        joined.append({"start": current["start"], "end": current["end"]})

	    # Pass 2: split anything longer than the maximum utterance length.
	    result = []
	    for region in joined:
	        length_sec = (region["end"] - region["start"]) / sr
	        if not length_sec > MAX_SEGMENT_SEC:
	            result.append(region)
	            continue

	        step = int(MAX_SEGMENT_SEC * sr)
	        cursor = region["start"]
	        while cursor < region["end"]:
	            piece_end = min(cursor + step, region["end"])
	            if (piece_end - cursor) / sr >= MIN_SEGMENT_SEC:
	                result.append({"start": cursor, "end": piece_end})
	            cursor = piece_end
	    return result


	def transcribe_utterances(utterances: List[Dict], model_size: str = "large-v3") -> List[Dict]:
	    """Transcribe each utterance with faster-whisper (language fixed to ``si``).

	    Adds ``text`` and ``language_prob`` to every utterance dict in place. An
	    utterance whose transcription fails gets an empty text and probability
	    0.0, and the failure is logged. If faster-whisper cannot be imported, the
	    utterances are returned without those keys.

	    Args:
	        utterances: Dicts with at least a ``path`` to the audio file.
	        model_size: Whisper model name passed to ``WhisperModel``.

	    Returns:
	        The same list that was passed in.
	    """
	    try:
	        from faster_whisper import WhisperModel
	        device = "cuda" if torch.cuda.is_available() else "cpu"
	        compute_type = "float16" if device == "cuda" else "int8"
	        log.info(f" [asr] Loading faster-whisper {model_size} on {device}...")
	        model = WhisperModel(model_size, device=device, compute_type=compute_type)

	        for utt in tqdm(utterances, desc="Transcribing", leave=False):
	            try:
	                segments, info = model.transcribe(
	                    utt["path"], language="si", beam_size=5, best_of=5,
	                    temperature=0.0, condition_on_previous_text=False, vad_filter=False,
	                )
	                # ``segments`` is consumed here, inside the try, so errors
	                # raised while iterating it are handled per utterance too.
	                utt["text"] = " ".join(seg.text.strip() for seg in segments).strip()
	                utt["language_prob"] = info.language_probability
	            except Exception as e:
	                # Keep going, but leave a trace: these utterances are later
	                # rejected as empty_text and the cause would otherwise be lost.
	                log.warning(f" [asr] Transcription failed for {utt.get('path')}: {e}")
	                utt["text"] = ""
	                utt["language_prob"] = 0.0
	        return utterances
	    except ImportError:
	        log.error(" [asr] faster-whisper not installed!")
	        return utterances


	def compute_snr(wav_path: str) -> float:
	    """Estimate the SNR (dB) of a file from its frame-wise RMS energy.

	    Frames at or below the 20th RMS percentile are treated as noise, the rest
	    as speech; the result is ``20 * log10(mean(speech) / mean(noise))``.

	    Returns:
	        The estimate in dB. ``40.0`` when the noise floor is effectively zero,
	        and ``0.0`` when no frame rises above the noise threshold (for example
	        a constant-level signal). The latter used to produce NaN, which passes
	        a ``snr < threshold`` check and is not valid JSON.
	    """
	    import librosa
	    y, sr = librosa.load(wav_path, sr=SAMPLE_RATE, mono=True)
	    rms = librosa.feature.rms(y=y, frame_length=2048, hop_length=512)[0]
	    threshold = np.percentile(rms, 20)
	    noise = rms[rms <= threshold]
	    speech = rms[rms > threshold]
	    if len(noise) > 0 and np.mean(noise) > 1e-10:
	        if len(speech) == 0:
	            # Nothing is louder than the noise floor: speech level == noise level.
	            return 0.0
	        return float(20 * np.log10(np.mean(speech) / np.mean(noise)))
	    return 40.0


	def compute_pitch_stats(wav_path: str) -> Tuple[float, float]:
	    """Return ``(mean, std)`` of the voiced F0 track in Hz, via ``librosa.pyin``.

	    Unvoiced frames (NaN in the F0 track) are ignored. Returns ``(0.0, 0.0)``
	    when no voiced frame is found.
	    """
	    import librosa

	    samples, rate = librosa.load(wav_path, sr=SAMPLE_RATE, mono=True)
	    pitch_track, _voiced_flag, _voiced_prob = librosa.pyin(samples, fmin=50, fmax=500, sr=rate)
	    voiced = pitch_track[~np.isnan(pitch_track)]
	    if len(voiced) == 0:
	        return 0.0, 0.0
	    return float(np.mean(voiced)), float(np.std(voiced))


	def filter_utterances(utterances: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
	    """Split utterances into kept and rejected according to the quality checks.

	    Checks, in order: non-empty text, language probability >= 0.5, duration
	    within [MIN_SEGMENT_SEC, MAX_SEGMENT_SEC], SNR, pitch mean/std, speaking
	    rate (characters per second) and speech ratio. The measured values are
	    written back onto each utterance dict; rejected utterances also receive a
	    ``reject_reasons`` list.

	    Returns:
	        ``(kept, rejected)``.
	    """
	    import librosa

	    ignored_chars = "!?.,;:\"'()-"
	    accepted, discarded = [], []

	    for utt in tqdm(utterances, desc="Filtering", leave=False):
	        problems = []

	        # --- transcript / language / length ---
	        if not utt.get("text", "").strip():
	            problems.append("empty_text")
	        if utt.get("language_prob", 0) < 0.5:
	            problems.append(f"low_lang_prob={utt.get('language_prob', 0):.2f}")
	        length = utt["duration"]
	        if length < MIN_SEGMENT_SEC or length > MAX_SEGMENT_SEC:
	            problems.append(f"duration={length:.1f}s")

	        # --- signal-to-noise ratio ---
	        try:
	            snr_value = compute_snr(utt["path"])
	            utt["snr_db"] = round(snr_value, 1)
	            if snr_value < SNR_THRESHOLD:
	                problems.append(f"low_snr={snr_value:.1f}dB")
	        except Exception:
	            utt["snr_db"] = 0.0
	            problems.append("snr_failed")

	        # --- pitch statistics (a failure here is not a rejection reason) ---
	        try:
	            pitch_mean, pitch_std = compute_pitch_stats(utt["path"])
	            utt["pitch_mean_hz"] = round(pitch_mean, 1)
	            utt["pitch_std_hz"] = round(pitch_std, 1)
	            if pitch_mean > PITCH_MEAN_MAX:
	                problems.append(f"high_pitch={pitch_mean:.0f}Hz")
	            if pitch_std > PITCH_STD_MAX:
	                problems.append(f"high_pitch_var={pitch_std:.0f}Hz")
	        except Exception:
	            utt["pitch_mean_hz"] = 0.0
	            utt["pitch_std_hz"] = 0.0

	        # --- speaking rate in non-space, non-punctuation characters per second ---
	        if utt.get("text"):
	            n_chars = sum(1 for ch in utt["text"] if ch.strip() and ch not in ignored_chars)
	            cps = n_chars / utt["duration"] if utt["duration"] > 0 else 0
	            utt["speaking_rate"] = round(cps, 1)
	            if cps > SPEAKING_RATE_MAX:
	                problems.append(f"fast_speech={cps:.1f}c/s")
	            if cps < 1.0 and utt["duration"] > 3.0:
	                problems.append(f"slow_speech={cps:.1f}c/s")

	        # --- fraction of frames above the file's own 20th RMS percentile ---
	        # NOTE(review): because the threshold is a percentile of the same
	        # frames, this ratio looks like it will sit near 0.8 for most files —
	        # confirm the check does what was intended.
	        try:
	            samples, _ = librosa.load(utt["path"], sr=SAMPLE_RATE, mono=True)
	            frame_rms = librosa.feature.rms(y=samples, frame_length=2048, hop_length=512)[0]
	            floor = np.percentile(frame_rms, 20)
	            ratio = float(np.sum(frame_rms > floor) / len(frame_rms))
	            utt["speech_ratio"] = round(ratio, 3)
	            if ratio < MIN_SPEECH_RATIO:
	                problems.append(f"low_speech_ratio={ratio:.2f}")
	        except Exception:
	            utt["speech_ratio"] = 0.0

	        if not problems:
	            accepted.append(utt)
	            continue
	        utt["reject_reasons"] = problems
	        discarded.append(utt)

	    kept_pct = len(accepted) / max(1, len(utterances)) * 100
	    log.info(f" [filter] Kept {len(accepted)}/{len(utterances)} ({kept_pct:.1f}%)")
	    if discarded:
	        # Tally rejections by reason name (the part before "=").
	        tally = {}
	        for utt in discarded:
	            for reason in utt.get("reject_reasons", []):
	                key = reason.split("=")[0]
	                tally[key] = tally.get(key, 0) + 1
	        log.info(f" [filter] Rejections: {tally}")

	    return accepted, discarded


	def normalize_sinhala_text(text: str) -> str:
	    """Normalize a transcript for the ``normalized_text`` metadata column.

	    Applies NFC normalization, removes U+200C, converts curly quotes to
	    straight ones, turns ``;`` and ``:`` into commas, removes round and square
	    brackets, and collapses every run of whitespace to a single space.
	    """
	    import unicodedata

	    # All substitutions are single-character, so one translate pass covers them.
	    char_map = {
	        0x200C: None,
	        0x201C: '"',
	        0x201D: '"',
	        0x2018: "'",
	        0x2019: "'",
	        ord(';'): ',',
	        ord(':'): ',',
	        ord('('): None,
	        ord(')'): None,
	        ord('['): None,
	        ord(']'): None,
	    }
	    cleaned = unicodedata.normalize('NFC', text).translate(char_map)
	    return ' '.join(cleaned.split()).strip()


	def export_dataset(utterances: List[Dict], output_dir: Path, val_split: float = 0.05) -> dict:
	    """Export utterances in LJSpeech format and return summary statistics.

	    Writes ``wavs/si_NNNNNN.wav`` (copied from each utterance's ``path`` when
	    that file exists), the pipe-delimited ``metadata.csv`` with rows of
	    ``name|text|normalized_text``, the train/val splits, and
	    ``dataset_stats.json``. Utterances without text are skipped.

	    Args:
	        utterances: Utterance dicts with ``path``, ``duration`` and ``text``.
	        output_dir: Destination directory (created if needed).
	        val_split: Fraction of rows placed in the validation split; at least
	            one row when any rows exist.

	    Returns:
	        The statistics dict that is also written to ``dataset_stats.json``.

	    NOTE(review): rows are named ``si_NNNNNN`` while upload_utterances_batch
	    pushes WAVs under their original ``<video>_uttNNNNN.wav`` names and
	    upload_final_dataset pushes only the CSVs — confirm the names in the
	    uploaded metadata match the uploaded audio.
	    """
	    import random

	    output_dir.mkdir(parents=True, exist_ok=True)
	    wavs_dir = output_dir / "wavs"
	    wavs_dir.mkdir(exist_ok=True)

	    metadata = []
	    exported = []
	    for i, utt in enumerate(tqdm(utterances, desc="Exporting", leave=False)):
	        text = utt.get("text", "").strip()
	        if not text:
	            continue
	        # The row format is delimiter-based: a literal pipe or a line break in
	        # the transcript would corrupt the row, so replace them with spaces.
	        text = text.replace("|", " ").replace("\r", " ").replace("\n", " ")

	        name = f"si_{i:06d}"
	        new_path = wavs_dir / f"{name}.wav"
	        src = Path(utt["path"])
	        if src.exists() and not new_path.exists():
	            shutil.copy2(str(src), str(new_path))

	        # Plain "|" separators: "\|" is not a valid escape and would write a
	        # backslash into every field boundary.
	        metadata.append(f"{name}|{text}|{normalize_sinhala_text(text)}")
	        exported.append(utt)

	    # Seeded local RNG: same shuffle as random.seed(42) + random.shuffle, but
	    # the process-wide random state is left untouched.
	    random.Random(42).shuffle(metadata)
	    # Never claim more validation rows than exist (the empty case gives 0).
	    n_val = min(len(metadata), max(1, int(len(metadata) * val_split)))

	    (output_dir / "metadata.csv").write_text("\n".join(metadata) + "\n", encoding="utf-8")
	    (output_dir / "metadata_train.csv").write_text("\n".join(metadata[n_val:]) + "\n", encoding="utf-8")
	    (output_dir / "metadata_val.csv").write_text("\n".join(metadata[:n_val]) + "\n", encoding="utf-8")

	    # Statistics describe the rows that were exported, not skipped utterances.
	    durations = [u["duration"] for u in exported]
	    stats = {
	        "total_utterances": len(metadata),
	        "train_utterances": len(metadata) - n_val,
	        "val_utterances": n_val,
	        "total_hours": round(sum(durations) / 3600, 2),
	        "mean_duration_sec": round(float(np.mean(durations)), 2) if durations else 0.0,
	        "median_duration_sec": round(float(np.median(durations)), 2) if durations else 0.0,
	        "min_duration_sec": round(min(durations), 2) if durations else 0.0,
	        "max_duration_sec": round(max(durations), 2) if durations else 0.0,
	        "sample_rate": SAMPLE_RATE,
	    }
	    pitches = [u.get("pitch_mean_hz", 0) for u in exported if u.get("pitch_mean_hz", 0) > 0]
	    if pitches:
	        stats["corpus_pitch_mean_hz"] = round(float(np.mean(pitches)), 1)
	        stats["corpus_pitch_std_hz"] = round(float(np.std(pitches)), 1)

	    (output_dir / "dataset_stats.json").write_text(json.dumps(stats, indent=2), encoding="utf-8")

	    log.info(f"\n{'='*60}")
	    log.info(f"DATASET: {stats['total_utterances']} utts, {stats['total_hours']}h")
	    log.info(f" Train: {stats['train_utterances']}, Val: {stats['val_utterances']}")
	    log.info(f" Duration: {stats['mean_duration_sec']}s mean, {stats['median_duration_sec']}s median")
	    log.info(f"{'='*60}")
	    return stats


	# ============================================================
	# MAIN PIPELINE
	# ============================================================
	def process_one_video(
	    wav_path: Path,
	    work_dir: Path,
	    whisper_model: str,
	    num_speakers: int,
	    skip_separation: bool,
	    skip_enhancement: bool,
	) -> Tuple[List[Dict], List[Dict]]:
	    """Run one raw audio file through the whole pipeline.

	    Stages: optional source separation, optional enhancement, diarization
	    and target-speaker selection, VAD segmentation, transcription, quality
	    filtering.

	    Returns:
	        ``(kept_utterances, rejected_utterances)``; two empty lists when
	        segmentation yields no utterances.
	    """
	    # Stage 1: source separation (optional)
	    if skip_separation:
	        audio = wav_path
	    else:
	        audio = separate_vocals(wav_path, work_dir / "separated")

	    # Stage 2: enhancement (optional)
	    if not skip_enhancement:
	        audio = enhance_audio(audio, work_dir / "enhanced")

	    # Stage 3: diarization; keep only the dominant speaker's turns
	    by_speaker = diarize_audio(audio, num_speakers)
	    target_turns = by_speaker[select_target_speaker(by_speaker)]

	    # Stage 4: VAD segmentation into per-video segment directory
	    segments_dir = work_dir / "segments" / wav_path.stem
	    candidates = segment_with_vad(audio, target_turns, segments_dir)
	    if not candidates:
	        return [], []

	    # Stage 5: transcription
	    transcribed = transcribe_utterances(candidates, whisper_model)

	    # Stage 6: quality filtering
	    accepted, discarded = filter_utterances(transcribed)
	    return accepted, discarded


	def main():
	    """Command-line entry point: process pending videos and publish the dataset.

	    Flow: parse arguments, create the output repo, load the resume state,
	    download raw audio, process each video not yet completed (uploading its
	    utterances and saving state after each one), then export and upload the
	    final dataset and the list of rejected utterances.

	    NOTE(review): ``--batch-size`` is parsed but not used below, and the final
	    export covers only the utterances kept in this run — confirm both are
	    intended for resumed runs.
	    """
	    parser = argparse.ArgumentParser(description="Sinhala TTS Cloud Pipeline (Phase 2)")
	    parser.add_argument("--source-repo", required=True, help="HF dataset repo with raw audio")
	    parser.add_argument("--output-repo", required=True, help="HF dataset repo for processed output")
	    parser.add_argument("--whisper-model", default="large-v3")
	    parser.add_argument("--num-speakers", type=int, default=2)
	    parser.add_argument("--batch-size", type=int, default=5, help="Videos per processing batch")
	    parser.add_argument("--max-videos", type=int, default=None)
	    parser.add_argument("--skip-separation", action="store_true")
	    parser.add_argument("--skip-enhancement", action="store_true")
	    parser.add_argument("--video-ids", type=str, default=None, help="Comma-separated video IDs to process")
	    args = parser.parse_args()

	    work_dir = Path("/app/work")
	    work_dir.mkdir(parents=True, exist_ok=True)
	    dataset_dir = work_dir / "dataset"

	    log.info("=" * 60)
	    log.info("Sinhala TTS Cloud Pipeline (Phase 2)")
	    log.info("=" * 60)
	    log.info(f"Source: {args.source_repo}")
	    log.info(f"Output: {args.output_repo}")
	    log.info(f"Device: {'CUDA — ' + torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

	    # Create output repo
	    api = get_api()
	    api.create_repo(repo_id=args.output_repo, repo_type="dataset", exist_ok=True)

	    # Load state
	    state = load_processing_state(args.output_repo)
	    completed = set(state["completed_videos"])
	    # Totals recorded by earlier runs; this run's numbers are added on top so
	    # a resumed run does not overwrite the cumulative figures.
	    prior_utterances = state.get("total_utterances", 0)
	    prior_hours = state.get("total_hours", 0.0)
	    log.info(f"Already completed: {len(completed)} videos")

	    # Download raw audio
	    video_ids = args.video_ids.split(",") if args.video_ids else None
	    raw_files = download_raw_audio(args.source_repo, work_dir, video_ids)

	    # Filter out already completed
	    raw_files = [f for f in raw_files if f.stem not in completed]
	    if args.max_videos:
	        raw_files = raw_files[:args.max_videos]
	    log.info(f"To process: {len(raw_files)} videos")

	    if not raw_files:
	        log.info("Nothing to process!")
	        return

	    all_kept = []
	    all_rejected = []

	    for i, wav_path in enumerate(raw_files):
	        vid_id = wav_path.stem
	        log.info(f"\n[{i+1}/{len(raw_files)}] Processing: {vid_id}")

	        try:
	            kept, rejected = process_one_video(
	                wav_path, work_dir, args.whisper_model, args.num_speakers,
	                args.skip_separation, args.skip_enhancement,
	            )

	            # Upload utterances for this video. An explicit False result means
	            # the push failed: leave the video unrecorded so a later run
	            # retries it. (Any other result, including None, counts as done.)
	            if kept and upload_utterances_batch(kept, args.output_repo, vid_id) is False:
	                log.error(f" FAILED: utterances for {vid_id} were not uploaded; leaving it for a later run")
	                continue

	            all_kept.extend(kept)
	            all_rejected.extend(rejected)

	            # Update state
	            run_hours = sum(u["duration"] for u in all_kept) / 3600
	            state["completed_videos"].append(vid_id)
	            state["total_utterances"] = prior_utterances + len(all_kept)
	            state["total_hours"] = round(prior_hours + run_hours, 2)
	            save_processing_state(args.output_repo, state)

	            log.info(f" TOTAL so far: {state['total_utterances']} utterances, {state['total_hours']}h")

	        except Exception as e:
	            # log.exception records the traceback through the configured handlers.
	            log.exception(f" FAILED: {e}")
	            continue

	        # Cleanup this video's intermediate files to save disk. Kept in its
	        # own try block so a cleanup problem is not reported as a failed video
	        # after its state has already been saved.
	        try:
	            for subdir in ("separated", "enhanced"):
	                d = work_dir / subdir
	                if d.exists():
	                    for f in d.rglob(f"{vid_id}*"):
	                        # Only files can be unlinked; skip matching directories.
	                        if f.is_file():
	                            f.unlink(missing_ok=True)
	            # Kept segment WAVs stay on disk because the final export copies
	            # them; rejected ones are no longer needed.
	            for utt in rejected:
	                Path(utt["path"]).unlink(missing_ok=True)
	            wav_path.unlink(missing_ok=True)
	        except OSError as e:
	            log.warning(f" Cleanup for {vid_id} incomplete: {e}")

	    # Export final dataset
	    if all_kept:
	        log.info(f"\n{'='*60}")
	        log.info(f"EXPORTING FINAL DATASET")
	        log.info(f"{'='*60}")

	        stats = export_dataset(all_kept, dataset_dir)
	        upload_final_dataset(all_kept, dataset_dir, args.output_repo, stats)

	        # Also upload rejected for inspection
	        rej_bytes = json.dumps(all_rejected, indent=2, ensure_ascii=False).encode("utf-8")
	        api.upload_file(
	            path_or_fileobj=rej_bytes,
	            path_in_repo="rejected_utterances.json",
	            repo_id=args.output_repo,
	            repo_type="dataset",
	            commit_message=f"Rejected: {len(all_rejected)} utterances",
	        )

	    log.info(f"\n{'='*60}")
	    log.info(f"PIPELINE COMPLETE")
	    log.info(f"{'='*60}")
	    log.info(f" Processed: {len(state['completed_videos'])} videos")
	    log.info(f" Kept: {len(all_kept)} utterances")
	    log.info(f" Rejected: {len(all_rejected)} utterances")
	    log.info(f" Total hours: {state['total_hours']}")
	    log.info(f" Output: https://huggingface.co/datasets/{args.output_repo}")


	if __name__ == "__main__":
	main()