""" Parse JSON transcript files into timed segments. Expected JSON format (one file per audio): { "video_id": "...", "title": "...", "transcript": [ {"start": 1.605, "duration": 1.557, "text": "خير يا بيريوم؟"}, {"start": 4.301, "duration": 3.45, "text": "مصادرنا بتؤكد إن فيه\nمؤامرة اغتيال ضد حضرتك."}, ... ] } Each entry carries: - start : float — start time in seconds - duration : float — length of this entry in seconds - text : str — transcript text (may contain \\n within a single entry) """ from __future__ import annotations import json import re from dataclasses import dataclass from pathlib import Path from typing import List # --------------------------------------------------------------------------- # Data classes # --------------------------------------------------------------------------- @dataclass class TranscriptEntry: start: float # start time in seconds end: float # end time in seconds (= start + duration) text: str # normalized transcript text @dataclass class TranscriptSegment: segment_id: int start: float # seconds — used to slice the audio end: float # seconds — used to slice the audio text: str # full normalized text for this segment source_audio: str # stem of the original audio file (for traceability) # --------------------------------------------------------------------------- # Compiled regular expressions used by normalize_arabic() # --------------------------------------------------------------------------- # Arabic diacritics (tashkeel / harakat) — full Unicode range # U+0610–U+061A : Arabic honorifics and signs # U+064B–U+065F : Standard harakat (fathah, dammah, kasrah, tanwin, shadda, sukun …) # U+0670 : Superscript alef # U+06D6–U+06E4, U+06E7, U+06E8, U+06EA–U+06ED : Extended Arabic marks _DIACRITICS_RE = re.compile( r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED]" ) # Alef variants with hamza or madda → bare alef ا # أ (U+0623) إ (U+0625) آ (U+0622) ٱ (U+0671) _ALEF_RE = re.compile(r"[أإآٱ]") # Hamza on waw → plain waw (ؤ U+0624 → و U+0648) # Hamza on ya → plain ya (ئ U+0626 → ي U+064A) # These are written inconsistently in Egyptian informal text; # Whisper tends to output the base letter without hamza. _HAMZA_WAW_RE = re.compile(r"ؤ") _HAMZA_YA_RE = re.compile(r"ئ") # Tatweel / Kashida (U+0640) — decorative elongation, carries no phonetic value _TATWEEL_RE = re.compile(r"\u0640") # Dialogue dash at the very start of a string (speaker-turn marker) _DIALOGUE_DASH_RE = re.compile(r"^\s*[-–—]\s*") # Arabic punctuation characters and common Western punctuation that appear # in transcript files but are never spoken. We remove them so that labels # and Whisper predictions can be compared without punctuation mismatch. # Kept intentionally narrow — only characters Whisper never outputs for Arabic. _PUNCTUATION_RE = re.compile( r"[،؛؟!\"\'«»\(\)\[\]\{\}\.\,\:\;\-–—…]" ) # Collapse any run of two or more spaces into one _MULTI_SPACE_RE = re.compile(r" {2,}") # --------------------------------------------------------------------------- # Text normalization # --------------------------------------------------------------------------- def normalize_arabic(text: str) -> str: """ Normalize Arabic text for use as a Whisper fine-tuning target. Steps applied (in order): 1. Remove tashkeel (diacritics) — Whisper never outputs them; training with diacritics in labels penalizes correct predictions. 2. Remove tatweel/kashida (U+0640) — decorative character, not spoken. 3. Unify Alef variants (أ إ آ ٱ → ا) — same phoneme written differently. 4. Normalize hamza-on-waw (ؤ → و) and hamza-on-ya (ئ → ي) — Egyptian informal writing often omits the hamza; Whisper follows this convention. 5. Strip dialogue dashes at line start — transcription-tool artifacts. 6. Remove punctuation marks — never present in Whisper's Arabic output. 7. Collapse extra whitespace left behind by the previous steps. """ text = _DIACRITICS_RE.sub("", text) text = _TATWEEL_RE.sub("", text) text = _ALEF_RE.sub("ا", text) text = _HAMZA_WAW_RE.sub("و", text) text = _HAMZA_YA_RE.sub("ي", text) text = _DIALOGUE_DASH_RE.sub("", text) text = _PUNCTUATION_RE.sub("", text) text = _MULTI_SPACE_RE.sub(" ", text) return text.strip() # --------------------------------------------------------------------------- # Parsing # --------------------------------------------------------------------------- def parse_transcript_file(path: Path | str) -> List[TranscriptEntry]: """ Read a JSON transcript file and return a list of TranscriptEntry objects sorted by start time. Each JSON entry is expected to have: "start" : number — start time in seconds "duration" : number — length of this entry in seconds "text" : string — transcript text (may contain internal \\n) Internal newlines inside a single entry's text (e.g. a two-line subtitle) are replaced with a space before normalization — they represent a single continuous utterance, not separate dialogue turns. Empty entries (text becomes empty after normalization) are silently skipped. """ path = Path(path) with path.open(encoding="utf-8") as fh: data = json.load(fh) entries: List[TranscriptEntry] = [] for item in data["transcript"]: start = float(item["start"]) end = start + float(item["duration"]) # Internal \n within one JSON entry = line-wrapped subtitle, not a new speaker. raw_text = item["text"].replace("\n", " ") text = normalize_arabic(raw_text) if not text: # skip entries that are empty after normalization continue entries.append(TranscriptEntry(start=start, end=end, text=text)) entries.sort(key=lambda e: e.start) return entries # --------------------------------------------------------------------------- # Segmentation # --------------------------------------------------------------------------- def build_segments( entries: List[TranscriptEntry], source_audio: str, max_duration: float = 30.0, min_duration: float = 1.0, ) -> List[TranscriptSegment]: """ Group TranscriptEntry objects into contiguous TranscriptSegments. Goals: - Each segment is as long as possible to give the model rich context. - No segment exceeds max_duration seconds (default 30 s) — this keeps every audio chunk inside Whisper's 30-second encoder window. - Segments shorter than min_duration seconds (default 1 s) are discarded. How it works (greedy grouping): - Walk through entries in order. - Before adding entry i, check whether doing so would push the segment duration (from seg_start to entry.end) over max_duration. - If yes: seal the current segment at the previous entry's end, start a new segment beginning at entry.start, then add entry i to the new one. - After the loop, seal whatever remains. """ if not entries: return [] segments: List[TranscriptSegment] = [] seg_id = 0 seg_start = entries[0].start seg_texts: List[str] = [] seg_last_end = entries[0].end for entry in entries: if seg_texts and (entry.end - seg_start) > max_duration: if (seg_last_end - seg_start) >= min_duration: segments.append(TranscriptSegment( segment_id=seg_id, start=seg_start, end=seg_last_end, text=" ".join(seg_texts), source_audio=source_audio, )) seg_id += 1 seg_start = entry.start seg_texts = [] seg_texts.append(entry.text) seg_last_end = entry.end # Seal the final segment if seg_texts and (seg_last_end - seg_start) >= min_duration: segments.append(TranscriptSegment( segment_id=seg_id, start=seg_start, end=seg_last_end, text=" ".join(seg_texts), source_audio=source_audio, )) return segments