Spaces:
Sleeping
Sleeping
"""
Parse JSON transcript files into timed segments.

Expected JSON format (one file per audio):
{
    "video_id": "...",
    "title": "...",
    "transcript": [
        {"start": 1.605, "duration": 1.557, "text": "خير يا بيرلين ؟"},
        {"start": 4.301, "duration": 3.45, "text": "مصادرنا بتؤكد إن فيه\nمؤامرة اغتيال ضد حضرتك."},
        ...
    ]
}

Each entry carries:
- start    : float — start time in seconds
- duration : float — length of this entry in seconds
- text     : str   — transcript text (may contain \\n within a single entry)
"""
from __future__ import annotations

import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import List
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass
class TranscriptEntry:
    """One normalized transcript line with absolute start/end times."""

    start: float  # start time in seconds
    end: float    # end time in seconds (= start + duration)
    text: str     # normalized transcript text
@dataclass
class TranscriptSegment:
    """A group of consecutive transcript entries that maps to one audio slice."""

    segment_id: int
    start: float       # seconds; used to slice the audio
    end: float         # seconds; used to slice the audio
    text: str          # full normalized text for this segment
    source_audio: str  # stem of the original audio file (for traceability)
# ---------------------------------------------------------------------------
# Compiled regular expressions used by normalize_arabic()
# ---------------------------------------------------------------------------
# All non-ASCII characters are written as \uXXXX escapes so the patterns
# survive any re-encoding of this source file.

# Arabic diacritics (tashkeel / harakat):
#   U+0610-U+061A : Arabic honorifics and signs
#   U+064B-U+065F : standard harakat (fathah, dammah, kasrah, tanwin, shadda, sukun, ...)
#   U+0670        : superscript alef
#   U+06D6-U+06DC, U+06DF-U+06E4, U+06E7, U+06E8, U+06EA-U+06ED : extended Arabic marks
_DIACRITICS_RE = re.compile(
    r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED]"
)

# Alef variants with hamza or madda -> bare alef (U+0627):
#   U+0623 (hamza above), U+0625 (hamza below), U+0622 (madda), U+0671 (wasla)
_ALEF_RE = re.compile(r"[\u0623\u0625\u0622\u0671]")

# Hamza on waw -> plain waw (U+0624 -> U+0648)
# Hamza on ya  -> plain ya  (U+0626 -> U+064A)
# These are written inconsistently in Egyptian informal text;
# Whisper tends to output the base letter without hamza.
_HAMZA_WAW_RE = re.compile(r"\u0624")
_HAMZA_YA_RE = re.compile(r"\u0626")

# Tatweel / kashida (U+0640): decorative elongation, carries no phonetic value.
_TATWEEL_RE = re.compile(r"\u0640")

# Dialogue dash at the very start of a string (speaker-turn marker):
# ASCII hyphen, en dash (U+2013) or em dash (U+2014).
_DIALOGUE_DASH_RE = re.compile(r"^\s*[-\u2013\u2014]\s*")

# Arabic punctuation (comma U+060C, semicolon U+061B, question mark U+061F) and
# common Western punctuation that appear in transcript files but are never
# spoken. We remove them so that labels and Whisper predictions can be compared
# without punctuation mismatch. Kept intentionally narrow.
_PUNCTUATION_RE = re.compile(
    r"[\u060C\u061B\u061F!\"\'\u00AB\u00BB\(\)\[\]\{\}\.\,\:\;\-\u2013\u2014\u2026]"
)

# Collapse any run of two or more spaces into one.
_MULTI_SPACE_RE = re.compile(r" {2,}")


# ---------------------------------------------------------------------------
# Text normalization
# ---------------------------------------------------------------------------

def normalize_arabic(text: str) -> str:
    """
    Normalize Arabic text for use as a Whisper fine-tuning target.

    Steps applied (in order):
      1. Remove tashkeel (diacritics).
      2. Remove tatweel/kashida (U+0640).
      3. Unify alef variants (U+0623, U+0625, U+0622, U+0671 -> U+0627).
      4. Replace hamza-on-waw (U+0624) with waw (U+0648) and hamza-on-ya
         (U+0626) with ya (U+064A).
      5. Strip one leading dialogue dash (with surrounding whitespace).
      6. Remove the punctuation marks listed in _PUNCTUATION_RE.
      7. Collapse runs of spaces left behind by the previous steps.

    Returns the normalized string with leading/trailing whitespace stripped;
    the result may be empty.
    """
    text = _DIACRITICS_RE.sub("", text)
    text = _TATWEEL_RE.sub("", text)
    text = _ALEF_RE.sub("\u0627", text)        # bare alef
    text = _HAMZA_WAW_RE.sub("\u0648", text)   # plain waw
    text = _HAMZA_YA_RE.sub("\u064A", text)    # plain ya
    # The dash must be stripped before punctuation removal so that the
    # whitespace following a leading dash is consumed together with it.
    text = _DIALOGUE_DASH_RE.sub("", text)
    text = _PUNCTUATION_RE.sub("", text)
    text = _MULTI_SPACE_RE.sub(" ", text)
    return text.strip()
# ---------------------------------------------------------------------------
# Parsing
# ---------------------------------------------------------------------------
def parse_transcript_file(path: Path | str) -> List[TranscriptEntry]:
    """
    Read a JSON transcript file and return its entries sorted by start time.

    The file must contain a top-level "transcript" list whose items each have:
        "start"    : number, start time in seconds
        "duration" : number, length of this entry in seconds
        "text"     : string, transcript text (may contain internal newlines)

    Internal newlines inside a single entry's text (e.g. a two-line subtitle)
    are replaced with a space before normalization: they represent a single
    continuous utterance, not separate dialogue turns.

    Entries whose text is empty after normalization are silently skipped.

    Raises:
        KeyError: if "transcript" or one of the per-entry keys is missing.
        OSError / json.JSONDecodeError: if the file cannot be read or parsed.
    """
    path = Path(path)
    with path.open(encoding="utf-8") as fh:
        data = json.load(fh)

    entries: List[TranscriptEntry] = []
    for item in data["transcript"]:
        start = float(item["start"])
        end = start + float(item["duration"])

        # Internal \n within one JSON entry = line-wrapped subtitle, not a new speaker.
        raw_text = item["text"].replace("\n", " ")
        text = normalize_arabic(raw_text)
        if not text:  # skip entries that are empty after normalization
            continue

        entries.append(TranscriptEntry(start=start, end=end, text=text))

    entries.sort(key=lambda e: e.start)
    return entries
# ---------------------------------------------------------------------------
# Segmentation
# ---------------------------------------------------------------------------
def build_segments(
    entries: List[TranscriptEntry],
    source_audio: str,
    max_duration: float = 30.0,
    min_duration: float = 1.0,
) -> List[TranscriptSegment]:
    """
    Group TranscriptEntry objects into contiguous TranscriptSegments.

    Goals:
      - Each segment is as long as possible to give the model rich context.
      - Adding an entry never pushes an existing segment past max_duration
        seconds (default 30 s, the size of Whisper's encoder window).
      - Segments shorter than min_duration seconds (default 1 s) are discarded.

    How it works (greedy grouping):
      - Walk through entries in the order given (expected sorted by start).
      - Before adding an entry, check whether doing so would push the segment
        duration (from seg_start to entry.end) over max_duration.
      - If yes: seal the current segment at the previous entry's end, start a
        new segment beginning at entry.start, then add the entry to it.
      - After the loop, seal whatever remains.

    Note: the first entry of a segment is always accepted, so a single entry
    that is itself longer than max_duration yields a segment longer than
    max_duration.

    Returns the kept segments; segment_id values are consecutive from 0.
    """
    if not entries:
        return []

    segments: List[TranscriptSegment] = []
    seg_id = 0
    seg_start = entries[0].start
    seg_texts: List[str] = []
    seg_last_end = entries[0].end

    for entry in entries:
        # `seg_texts and ...` skips the length check for the first entry of a
        # segment, so every segment contains at least one entry.
        if seg_texts and (entry.end - seg_start) > max_duration:
            if (seg_last_end - seg_start) >= min_duration:
                segments.append(TranscriptSegment(
                    segment_id=seg_id,
                    start=seg_start,
                    end=seg_last_end,
                    text=" ".join(seg_texts),
                    source_audio=source_audio,
                ))
                seg_id += 1
            # Start a fresh segment at the current entry, whether or not the
            # previous one was long enough to keep.
            seg_start = entry.start
            seg_texts = []

        seg_texts.append(entry.text)
        seg_last_end = entry.end

    # Seal the final segment.
    if seg_texts and (seg_last_end - seg_start) >= min_duration:
        segments.append(TranscriptSegment(
            segment_id=seg_id,
            start=seg_start,
            end=seg_last_end,
            text=" ".join(seg_texts),
            source_audio=source_audio,
        ))

    return segments