#!/usr/bin/env python3
"""
Deterministic train/validation/test split assignment.
The policy groups clips before splitting to avoid leakage. Speaker clusters
take priority when present; otherwise video_id is used as the grouping key.
"""
from __future__ import annotations
import hashlib
from collections import Counter
from typing import Any, Dict, List
def _stable_fraction(value: str) -> float:
    """Map ``value`` to a deterministic float in the closed interval [0, 1].

    The string is UTF-8 encoded and hashed with SHA-256; the leading 64 bits
    of the digest are read as an unsigned big-endian integer and scaled by
    ``float(0xFFFFFFFFFFFFFFFF)``.
    """
    leading_bytes = hashlib.sha256(value.encode("utf-8")).digest()[:8]
    # The first 8 digest bytes are the same 64 bits as the first 16 hex chars.
    numerator = int.from_bytes(leading_bytes, "big")
    denominator = float(0xFFFFFFFFFFFFFFFF)
    return numerator / denominator
def _split_for_group(group_key: str, seed: str, train_ratio: float, val_ratio: float) -> str:
    """Pick the split name for a group from its seeded hash position.

    The string ``"{seed}:{group_key}"`` is mapped to a fraction by
    ``_stable_fraction``. Fractions below ``train_ratio`` give ``"train"``,
    fractions below ``train_ratio + val_ratio`` give ``"val"``, and anything
    else gives ``"test"``.
    """
    position = _stable_fraction(f"{seed}:{group_key}")
    # Upper bounds are checked in order; the first one above `position` wins.
    upper_bounds = (
        ("train", train_ratio),
        ("val", train_ratio + val_ratio),
    )
    for split_name, upper in upper_bounds:
        if position < upper:
            return split_name
    return "test"
def _group_key(row: Dict[str, Any]) -> str:
    """Return the grouping key used to keep related clips in the same split.

    ``speaker_cluster_id`` takes priority and yields ``"speaker:<id>"``.
    When it is missing, ``None``, or blank, the key is ``"video:<video_id>"``,
    with ``"unknown"`` standing in for a missing or blank ``video_id``.

    Only ``None`` and empty/whitespace-only values count as missing, so a
    falsy but valid identifier such as the integer ``0`` is kept.
    """

    def _clean(value: Any) -> str:
        # Normalise to a stripped string; None is the only "absent" marker.
        return "" if value is None else str(value).strip()

    speaker = _clean(row.get("speaker_cluster_id"))
    if speaker:
        return f"speaker:{speaker}"
    # Apply the fallback after stripping so a whitespace-only video_id does
    # not produce the empty key "video:".
    video_id = _clean(row.get("video_id")) or "unknown"
    return f"video:{video_id}"
def assign_splits(rows: List[Dict[str, Any]], config: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Mutate and return rows with deterministic split metadata.

    Config keys read (with defaults):
        enable_split_assignment (True): when falsy, each row keeps its
            existing ``split`` (or gets ``"unsplit"``) and is tagged with
            ``split_policy = "disabled"``.
        split_train_ratio (0.90), split_val_ratio (0.05): split fractions.
        split_seed ("sinhala_tts_llm_cc_v1"): seed string passed to the
            group hash.

    When enabled, every row receives ``split``, ``split_group_key`` and
    ``split_policy``. Rows that share a group key always share a split.

    Returns:
        The same ``rows`` list, modified in place.

    Raises:
        ValueError: if the ratios are not finite numbers satisfying
            ``train_ratio > 0``, ``val_ratio >= 0`` and
            ``train_ratio + val_ratio < 1.0``.
    """
    if not config.get("enable_split_assignment", True):
        for row in rows:
            row["split"] = row.get("split", "unsplit")
            row["split_policy"] = "disabled"
        return rows

    train_ratio = float(config.get("split_train_ratio", 0.90))
    val_ratio = float(config.get("split_val_ratio", 0.05))
    # Written as a positive condition: every comparison with NaN is False, so
    # a "reject if bad" check would let NaN through and silently send every
    # group to the test split.
    ratios_valid = train_ratio > 0 and val_ratio >= 0 and train_ratio + val_ratio < 1.0
    if not ratios_valid:
        raise ValueError("Invalid split ratios: require train_ratio > 0 and train_ratio + val_ratio < 1.0")

    seed = str(config.get("split_seed", "sinhala_tts_llm_cc_v1"))
    # Cache one decision per group so each group is hashed only once.
    assignments: Dict[str, str] = {}
    for row in rows:
        key = _group_key(row)
        if key not in assignments:
            assignments[key] = _split_for_group(key, seed, train_ratio, val_ratio)
        row["split"] = assignments[key]
        row["split_group_key"] = key
        row["split_policy"] = "speaker_cluster_else_video_hash_v1"
    return rows
def split_distribution(rows: List[Dict[str, Any]]) -> Dict[str, int]:
    """Count rows per split label.

    Each row's ``split`` value is converted with ``str``; a missing or falsy
    value is counted under ``"unknown"``. Returns a plain dict mapping label
    to row count, in first-seen order.
    """
    tally = Counter()
    for record in rows:
        label = record.get("split") or "unknown"
        tally[str(label)] += 1
    return dict(tally)
|