| |
| """ |
| Parse YouTube CC JSON3 files into normalized caption rows. |
| |
| Supports YouTube's auto-generated caption JSON3 format which contains |
| word-level or phrase-level timing information. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import json |
| import logging |
| import re |
| import unicodedata |
| from pathlib import Path |
| from statistics import median |
| from typing import List, Dict, Any, Optional, Tuple |
|
|
# Module-wide logger, registered under a fixed name.
log = logging.getLogger("caption_parser")


# Timing bounds, all in milliseconds, used when estimating where a caption ends.
# _MIN_SEGMENT_MS: lower clamp for the estimated tail after the last segment,
# and also the minimum length a caption row is stretched to.
_MIN_SEGMENT_MS = 160
# _MAX_SEGMENT_MS: upper clamp for the estimated tail after the last segment.
_MAX_SEGMENT_MS = 1200
# _DEFAULT_SEGMENT_MS: tail used when an event has too few distinct segment
# start times to estimate one from their spacing.
_DEFAULT_SEGMENT_MS = 420
|
|
|
|
def normalize_sinhala_text(raw_text: str, version: str = "sinhala_cc_norm_v1") -> str:
    """Normalize Sinhala caption text for TTS training.

    Steps:
    1. Unicode NFC normalization
    2. Strip leading/trailing whitespace and collapse whitespace runs
    3. Strip common caption artifacts: music notes and any ``(...)`` or
       ``[...]`` annotation (sound effects, speaker cues)
    4. Normalize punctuation spacing: no whitespace before ``.,;:!?`` and a
       single space after it
    5. Collapse whitespace again and trim

    Numerals and punctuation runs are kept intact: no space is inserted
    inside ``3.14``, ``1,000`` or ``10:30``, nor between consecutive marks
    such as ``...`` or ``?!``.

    Args:
        raw_text: Caption text as it appears in the caption file.
        version: Label of the normalization scheme. It is accepted for API
            compatibility but is not consulted by this function.

    Returns:
        The normalized text; an empty string when nothing but artifacts and
        whitespace was present.
    """
    text = unicodedata.normalize("NFC", raw_text)
    text = re.sub(r"\s+", " ", text.strip())

    # Caption artifacts: music notes, then parenthesised and bracketed cues.
    text = re.sub(r"[♪♫]+", "", text)
    text = re.sub(r"\([^)]*\)", "", text)
    text = re.sub(r"\[[^\]]*\]", "", text)

    # Pull punctuation back onto the preceding word.
    text = re.sub(r"\s+([.,;:!?])", r"\1", text)
    # Add a space after punctuation that runs straight into the next token,
    # except when (a) the next character is itself punctuation, so "..." and
    # "?!" stay together, or (b) the mark sits between two digits, so
    # decimals, thousands separators and clock times stay whole.
    text = re.sub(
        r"([.,;:!?])(?=[^\s.,;:!?])(?!(?<=\d[.,:])\d)",
        r"\1 ",
        text,
    )

    # Artifact removal can leave doubled or dangling spaces behind.
    text = re.sub(r"\s+", " ", text)
    return text.strip()
|
|
|
|
def parse_youtube_json3(json3_path: Path) -> List[Dict[str, Any]]:
    """Parse a YouTube JSON3 caption file into normalized caption rows.

    The file is read as UTF-8 JSON with a top-level ``events`` list. Each
    event may carry ``tStartMs``, ``dDurationMs`` and a ``segs`` list whose
    entries hold ``utf8`` text and an optional ``tOffsetMs`` relative to the
    event start. Missing or null fields are treated as empty / zero.

    One row is produced per event that still has text after normalization.
    A row starts at its earliest non-blank segment. Its end is estimated
    from the spacing of the segment starts, capped by the event duration,
    stretched to at least ``_MIN_SEGMENT_MS`` and finally kept strictly
    before the start of the next later event.

    Args:
        json3_path: Path of the JSON3 caption file.

    Returns:
        Rows in event order, each a dict with ``caption_id`` (contiguous,
        starting at 0), ``start_sec``, ``end_sec``, ``duration_sec``,
        ``raw_text`` and ``normalized_text``.

    Raises:
        Nothing is caught here: errors from reading the file, decoding the
        JSON or converting malformed timing values propagate to the caller.
    """
    data = json.loads(json3_path.read_text(encoding="utf-8"))
    # A present-but-null "events" is handled like a missing one, in line
    # with the `or` fallbacks applied to every other optional field.
    events = data.get("events") or []

    def _extract_segments(event: Dict[str, Any]) -> Tuple[List[Tuple[int, str]], int]:
        """Return (absolute start ms, text) for non-blank segments, plus the event start."""
        event_start_ms = int(event.get("tStartMs", 0) or 0)
        out: List[Tuple[int, str]] = []
        for seg in event.get("segs", []) or []:
            text = str(seg.get("utf8", "") or "")
            # Whitespace-only segments (e.g. bare newlines) carry no caption.
            if not text.strip():
                continue
            offset_ms = int(seg.get("tOffsetMs", 0) or 0)
            out.append((event_start_ms + offset_ms, text))
        return out, event_start_ms

    def _estimate_tail_ms(seg_starts_ms: List[int]) -> int:
        """Estimate how long the last segment lasts from the median segment gap."""
        gaps = [
            later - earlier
            for earlier, later in zip(seg_starts_ms, seg_starts_ms[1:])
            if later > earlier
        ]
        if not gaps:
            return _DEFAULT_SEGMENT_MS
        return max(_MIN_SEGMENT_MS, min(_MAX_SEGMENT_MS, int(median(gaps))))

    # Extract every event exactly once; both passes below reuse the result.
    extracted = [_extract_segments(event) for event in events]

    # Start time each event contributes as a boundary for earlier captions:
    # its earliest non-blank segment, or the event start when it has none.
    boundary_starts_ms = [
        min(start for start, _ in segments) if segments else event_start_ms
        for segments, event_start_ms in extracted
    ]

    rows: List[Dict[str, Any]] = []
    for index, (segments, event_start_ms) in enumerate(extracted):
        if not segments:
            continue

        raw_text = "".join(text for _, text in segments)
        normalized = normalize_sinhala_text(raw_text)
        if not normalized:
            continue

        seg_starts_ms = sorted(start for start, _ in segments)
        start_ms = seg_starts_ms[0]
        duration_ms = int(events[index].get("dDurationMs", 0) or 0)

        # First later event that begins strictly after this caption starts.
        next_start_ms: Optional[int] = None
        for later in range(index + 1, len(boundary_starts_ms)):
            if boundary_starts_ms[later] > start_ms:
                next_start_ms = boundary_starts_ms[later]
                break

        end_ms = seg_starts_ms[-1] + _estimate_tail_ms(seg_starts_ms)
        if duration_ms > 0:
            # Never run past the duration the event itself declares.
            end_ms = min(end_ms, event_start_ms + duration_ms)
        # Enforce the minimum caption length ...
        end_ms = max(end_ms, start_ms + _MIN_SEGMENT_MS)
        # ... but avoiding overlap with the next caption always wins.
        if next_start_ms is not None:
            end_ms = min(end_ms, next_start_ms - 1)

        # The next event starts 1 ms later: no room for a caption.
        if end_ms <= start_ms:
            continue

        rows.append({
            "caption_id": len(rows),
            "start_sec": round(start_ms / 1000.0, 3),
            "end_sec": round(end_ms / 1000.0, 3),
            "duration_sec": round((end_ms - start_ms) / 1000.0, 3),
            "raw_text": raw_text,
            "normalized_text": normalized,
        })

    log.info("Parsed %d caption rows from %s", len(rows), json3_path.name)
    return rows
|
|
|
|
def parse_captions_for_video(
    video_id: str,
    cc_dir: Path,
    sub_lang: str = "si",
) -> Optional[List[Dict[str, Any]]]:
    """Find and parse the caption file for a given video ID.

    Filename patterns are tried in order: ``<id>.<lang>.json3``,
    ``<id>.<lang>-orig.json3``, then ``<id>.<lang>.*.json3``. Within one
    pattern, files without an ``-orig`` marker are preferred and ties are
    broken by name so the choice is reproducible. If a file fails to parse,
    a warning is logged and the next candidate is tried.

    Args:
        video_id: Video identifier that prefixes the caption filename.
        cc_dir: Directory that holds the caption files.
        sub_lang: Caption language code used in the filename.

    Returns:
        The rows from the first candidate that parses, or ``None`` when no
        candidate exists or none could be parsed.
    """
    patterns = [
        f"{video_id}.{sub_lang}.json3",
        f"{video_id}.{sub_lang}-orig.json3",
        f"{video_id}.{sub_lang}.*.json3",
    ]

    def _preference(path: Path) -> Tuple[bool, str]:
        # Look for "-orig" only after the video ID, which may itself contain
        # that substring; False sorts first, so non-orig files lead.
        return ("-orig" in path.name[len(video_id):], path.name)

    found_any = False
    for pattern in patterns:
        for target in sorted(cc_dir.glob(pattern), key=_preference):
            found_any = True
            try:
                return parse_youtube_json3(target)
            except Exception as e:
                # Best effort: one unreadable file must not hide a usable
                # sibling, so log it and move on to the next candidate.
                log.warning("Failed to parse %s: %s", target, e)

    if found_any:
        log.warning(
            "No parseable caption file for video %s in %s", video_id, cc_dir
        )
    else:
        log.warning("No caption file found for video %s in %s", video_id, cc_dir)
    return None
|
|
|
|
def save_caption_rows(
    video_id: str,
    rows: List[Dict[str, Any]],
    output_dir: Path,
) -> Path:
    """Write caption rows for one video to ``<video_id>_captions.jsonl``.

    The output directory is created if needed. Each row becomes one JSON
    line holding ``video_id`` followed by the row's caption fields, with
    non-ASCII text written unescaped.

    Returns:
        Path of the written JSONL file.
    """
    # Fields copied from each row, in the order they appear in the output.
    row_fields = (
        "caption_id",
        "start_sec",
        "end_sec",
        "duration_sec",
        "raw_text",
        "normalized_text",
    )

    output_dir.mkdir(parents=True, exist_ok=True)
    out_path = output_dir / f"{video_id}_captions.jsonl"

    with out_path.open("w", encoding="utf-8") as sink:
        for entry in rows:
            payload: Dict[str, Any] = {"video_id": video_id}
            payload.update((field, entry[field]) for field in row_fields)
            sink.write(json.dumps(payload, ensure_ascii=False) + "\n")

    log.info(f"Saved {len(rows)} caption rows to {out_path}")
    return out_path
|
|