sinhala-tts / process /raw-extract /split_manager.py
outlawmold's picture
Implement deterministic quality gates
ea99d10
#!/usr/bin/env python3
"""
Deterministic train/validation/test split assignment.
The policy groups clips before splitting to avoid leakage. Speaker clusters
take priority when present; otherwise video_id is used as the grouping key.
"""
from __future__ import annotations
import hashlib
from collections import Counter
from typing import Any, Dict, List
def _stable_fraction(value: str) -> float:
    """Map *value* to a reproducible float in the closed interval [0.0, 1.0].

    The first 8 bytes of the SHA-256 digest of the UTF-8 encoded text are
    read as a big-endian unsigned integer and divided by the largest 64-bit
    unsigned value. Because the result depends only on the text itself, the
    same string always produces the same fraction from one run to the next.
    """
    leading_bytes = hashlib.sha256(value.encode("utf-8")).digest()[:8]
    numerator = int.from_bytes(leading_bytes, byteorder="big")
    return numerator / float(0xFFFFFFFFFFFFFFFF)
def _split_for_group(group_key: str, seed: str, train_ratio: float, val_ratio: float) -> str:
    """Pick the split name for one group from its seeded hash position.

    The group is placed on the unit interval by hashing ``"{seed}:{group_key}"``.
    Positions below ``train_ratio`` map to ``"train"``, positions below
    ``train_ratio + val_ratio`` map to ``"val"``, and everything else maps
    to ``"test"``.
    """
    position = _stable_fraction(f"{seed}:{group_key}")
    # Upper bounds are cumulative, so the buckets are checked in order.
    buckets = (
        ("train", train_ratio),
        ("val", train_ratio + val_ratio),
    )
    for split_name, upper_bound in buckets:
        if position < upper_bound:
            return split_name
    return "test"
def _group_key(row: Dict[str, Any]) -> str:
    """Return the leakage-avoidance grouping key for a clip row.

    A non-blank ``speaker_cluster_id`` wins and yields ``"speaker:<id>"``.
    Otherwise the key is ``"video:<video_id>"``, with ``"unknown"`` standing
    in for a missing or blank ``video_id``.

    Identifiers are stringified and stripped before they are judged blank,
    and only ``None`` or empty/whitespace-only text counts as missing. A
    numeric ``0`` is therefore a real identifier (same as the string ``"0"``),
    and a whitespace-only ``video_id`` falls back to ``"unknown"`` instead of
    producing the bare key ``"video:"``.
    """

    def _clean(value: Any) -> str:
        # Test for None explicitly: a truthiness check would also discard
        # legitimate identifiers such as the integer 0.
        return "" if value is None else str(value).strip()

    speaker = _clean(row.get("speaker_cluster_id"))
    if speaker:
        return f"speaker:{speaker}"

    # Apply the fallback after stripping so blank text also maps to "unknown".
    video_id = _clean(row.get("video_id")) or "unknown"
    return f"video:{video_id}"
def assign_splits(rows: List[Dict[str, Any]], config: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Mutate and return rows with deterministic split metadata.

    Config keys read (with defaults):
        enable_split_assignment (True): when falsy, rows keep any existing
            ``split`` value (``"unsplit"`` if absent) and are tagged with
            ``split_policy = "disabled"``.
        split_train_ratio (0.90), split_val_ratio (0.05): fractions of the
            hash space given to train and validation; the rest goes to test.
        split_seed ("sinhala_tts_llm_cc_v1"): text mixed into the group hash.

    When enabled, every row receives ``split``, ``split_group_key`` and
    ``split_policy``. All rows sharing a group key get the same split.

    Returns:
        The same ``rows`` list object, modified in place.

    Raises:
        ValueError: if the ratios do not satisfy ``train_ratio > 0``,
            ``val_ratio >= 0`` and ``train_ratio + val_ratio < 1.0``.
            NaN ratios are rejected as well.
    """
    if not config.get("enable_split_assignment", True):
        for row in rows:
            row["split"] = row.get("split", "unsplit")
            row["split_policy"] = "disabled"
        return rows

    train_ratio = float(config.get("split_train_ratio", 0.90))
    val_ratio = float(config.get("split_val_ratio", 0.05))
    # State the requirement positively. Every comparison against NaN is
    # False, so a NaN ratio fails this check; a chain of negated comparisons
    # would let it through and silently route every group to "test".
    ratios_ok = train_ratio > 0 and val_ratio >= 0 and train_ratio + val_ratio < 1.0
    if not ratios_ok:
        raise ValueError("Invalid split ratios: require train_ratio > 0 and train_ratio + val_ratio < 1.0")

    seed = str(config.get("split_seed", "sinhala_tts_llm_cc_v1"))
    # Cache one decision per group so each group is hashed only once.
    assignments: Dict[str, str] = {}
    for row in rows:
        key = _group_key(row)
        if key not in assignments:
            assignments[key] = _split_for_group(key, seed, train_ratio, val_ratio)
        row["split"] = assignments[key]
        row["split_group_key"] = key
        row["split_policy"] = "speaker_cluster_else_video_hash_v1"
    return rows
def split_distribution(rows: List[Dict[str, Any]]) -> Dict[str, int]:
    """Count how many rows carry each split label.

    Labels are stringified; a row whose ``split`` is missing or falsy is
    counted under ``"unknown"``. Keys appear in first-seen order.
    """
    tally: Counter = Counter()
    for row in rows:
        label = row.get("split", "unknown") or "unknown"
        tally[str(label)] += 1
    return dict(tally)