Speach-To-Text / src /data_preparation /parse_transcripts.py
MIP-Tech's picture
Deploy to HF Spaces
0db822c
"""
Parse JSON transcript files into timed segments.
Expected JSON format (one file per audio):
{
"video_id": "...",
"title": "...",
"transcript": [
{"start": 1.605, "duration": 1.557, "text": "خير يا بيريوم؟"},
{"start": 4.301, "duration": 3.45, "text": "مصادرنا بتؤكد إن فيه\nمؤامرة اغتيال ضد حضرتك."},
...
]
}
Each entry carries:
- start : float — start time in seconds
- duration : float — length of this entry in seconds
- text : str — transcript text (may contain \\n within a single entry)
"""
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import List
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass
class TranscriptEntry:
    """One transcript line together with the time span it covers."""

    start: float  # start time in seconds
    end: float  # end time in seconds (= start + duration)
    text: str  # normalized transcript text
@dataclass
class TranscriptSegment:
    """A run of consecutive transcript entries merged into one audio/text pair."""

    segment_id: int  # 0-based sequential id assigned by build_segments()
    start: float  # seconds - used to slice the audio
    end: float  # seconds - used to slice the audio
    text: str  # full normalized text for this segment
    source_audio: str  # stem of the original audio file (for traceability)
# ---------------------------------------------------------------------------
# Compiled regular expressions used by normalize_arabic()
# ---------------------------------------------------------------------------
# NOTE: every non-ASCII character below is written as a \uXXXX escape so the
# patterns stay correct even if this file is re-saved or displayed with the
# wrong encoding.

# Arabic diacritics (tashkeel / harakat)
#   U+0610..U+061A : Arabic honorifics and signs
#   U+064B..U+065F : Standard harakat (fathah, dammah, kasrah, tanwin, shadda, sukun ...)
#   U+0670         : Superscript alef
#   U+06D6..U+06DC, U+06DF..U+06E4, U+06E7, U+06E8, U+06EA..U+06ED : Extended Arabic marks
#   (U+06DD and U+06DE lie inside that span but are not matched by the pattern.)
_DIACRITICS_RE = re.compile(
    r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED]"
)
# Alef variants with hamza or madda -> bare alef (U+0627)
#   U+0623 alef with hamza above, U+0625 alef with hamza below,
#   U+0622 alef with madda, U+0671 alef wasla
_ALEF_RE = re.compile(r"[\u0623\u0625\u0622\u0671]")
# Hamza on waw -> plain waw (U+0624 -> U+0648)
# Hamza on ya  -> plain ya  (U+0626 -> U+064A)
# These are written inconsistently in Egyptian informal text;
# Whisper tends to output the base letter without hamza.
_HAMZA_WAW_RE = re.compile(r"\u0624")
_HAMZA_YA_RE = re.compile(r"\u0626")
# Tatweel / Kashida (U+0640) - decorative elongation, carries no phonetic value
_TATWEEL_RE = re.compile(r"\u0640")
# Dialogue dash (hyphen, en dash U+2013, em dash U+2014) at the very start of
# the string (speaker-turn marker). There is no MULTILINE flag, so only the
# beginning of the whole string is matched.
_DIALOGUE_DASH_RE = re.compile(r"^\s*[-\u2013\u2014]\s*")
# Arabic punctuation characters and common Western punctuation that appear
# in transcript files but are never spoken. We remove them so that labels
# and Whisper predictions can be compared without punctuation mismatch.
# Kept intentionally narrow - only characters Whisper never outputs for Arabic.
#   U+060C Arabic comma, U+061B Arabic semicolon, U+061F Arabic question mark,
#   U+00AB / U+00BB guillemets, U+2013 en dash, U+2014 em dash, U+2026 ellipsis
_PUNCTUATION_RE = re.compile(
    r"[\u060C\u061B\u061F!\"\'\u00AB\u00BB\(\)\[\]\{\}\.\,\:\;\-\u2013\u2014\u2026]"
)
# Collapse any run of two or more spaces into one
_MULTI_SPACE_RE = re.compile(r" {2,}")
# ---------------------------------------------------------------------------
# Text normalization
# ---------------------------------------------------------------------------
def normalize_arabic(text: str) -> str:
    """
    Normalize Arabic text for use as a Whisper fine-tuning target.

    Steps applied (in order):
    1. Remove tashkeel (diacritics) - Whisper never outputs them; training with
       diacritics in labels penalizes correct predictions.
    2. Remove tatweel/kashida (U+0640) - decorative character, not spoken.
    3. Unify Alef variants (U+0623, U+0625, U+0622, U+0671 -> U+0627) - same
       phoneme written differently.
    4. Normalize hamza-on-waw (U+0624 -> U+0648) and hamza-on-ya
       (U+0626 -> U+064A) - Egyptian informal writing often omits the hamza;
       Whisper follows this convention.
    5. Strip a dialogue dash at the start of the string - transcription-tool
       artifact. (Dashes elsewhere are removed by step 6.)
    6. Remove punctuation marks - never present in Whisper's Arabic output.
       Marks are deleted, not replaced by a space.
    7. Collapse runs of spaces left behind by the previous steps and trim
       leading/trailing whitespace.

    Args:
        text: Raw transcript text.

    Returns:
        The normalized text; may be the empty string if nothing but
        removable characters was present.
    """
    text = _DIACRITICS_RE.sub("", text)
    text = _TATWEEL_RE.sub("", text)
    text = _ALEF_RE.sub("\u0627", text)  # bare alef
    text = _HAMZA_WAW_RE.sub("\u0648", text)  # plain waw
    text = _HAMZA_YA_RE.sub("\u064A", text)  # plain ya
    text = _DIALOGUE_DASH_RE.sub("", text)
    text = _PUNCTUATION_RE.sub("", text)
    text = _MULTI_SPACE_RE.sub(" ", text)
    return text.strip()
# ---------------------------------------------------------------------------
# Parsing
# ---------------------------------------------------------------------------
def parse_transcript_file(path: Path | str) -> List[TranscriptEntry]:
    """
    Load one JSON transcript and convert it into TranscriptEntry objects,
    ordered by start time.

    The file must hold an object with a "transcript" list whose items have:
        "start"    : number - start time in seconds
        "duration" : number - length of this entry in seconds
        "text"     : string - transcript text (may contain internal \\n)

    A newline inside a single item's text is a line-wrapped subtitle, i.e. one
    continuous utterance, so it is turned into a space before the text is
    passed to normalize_arabic().
    Items whose text is empty after normalization are dropped without notice.

    Args:
        path: Location of the JSON transcript file (read as UTF-8).

    Returns:
        The surviving entries, sorted by their start time.

    Raises:
        KeyError: if "transcript" or one of the per-item keys is missing.
    """
    with Path(path).open(encoding="utf-8") as handle:
        payload = json.load(handle)

    parsed: List[TranscriptEntry] = []
    for record in payload["transcript"]:
        begin = float(record["start"])
        length = float(record["duration"])
        # Join wrapped subtitle lines, then normalize the whole utterance.
        cleaned = normalize_arabic(record["text"].replace("\n", " "))
        if cleaned:
            parsed.append(
                TranscriptEntry(start=begin, end=begin + length, text=cleaned)
            )

    # sorted() is stable, so entries sharing a start time keep file order.
    return sorted(parsed, key=lambda entry: entry.start)
# ---------------------------------------------------------------------------
# Segmentation
# ---------------------------------------------------------------------------
def build_segments(
    entries: List[TranscriptEntry],
    source_audio: str,
    max_duration: float = 30.0,
    min_duration: float = 1.0,
) -> List[TranscriptSegment]:
    """
    Group TranscriptEntry objects into contiguous TranscriptSegments.

    Goals:
    - Each segment is as long as possible to give the model rich context.
    - Segments are closed before they would exceed max_duration seconds
      (default 30 s) - this keeps audio chunks inside Whisper's 30-second
      encoder window.
    - Segments shorter than min_duration seconds (default 1 s) are discarded.

    How it works (greedy grouping):
    - Walk through entries in the order given (they are expected to be sorted
      by start time, as parse_transcript_file() returns them).
    - Before adding entry i, check whether doing so would push the segment
      span (from seg_start to the latest end seen so far) over max_duration.
    - If yes: seal the current segment, start a new one at entry.start, then
      add entry i to the new one.
    - After the loop, seal whatever remains.

    The segment end is the *latest* end time of any entry in the group, so an
    overlapping or nested entry that finishes early can never shorten the
    segment and cut off audio that its text still covers.

    Caveat: a single entry that is by itself longer than max_duration cannot
    be split and is still emitted as its own (over-long) segment.

    Args:
        entries: Transcript entries to group.
        source_audio: Identifier copied onto every produced segment.
        max_duration: Upper bound, in seconds, used when deciding to close a
            segment.
        min_duration: Segments whose span is below this many seconds are
            dropped.

    Returns:
        The produced segments, with segment_id numbered 0, 1, 2, ... in
        output order (dropped segments do not consume an id).
    """
    if not entries:
        return []

    segments: List[TranscriptSegment] = []

    def seal(start: float, end: float, texts: List[str]) -> None:
        # Emit the pending group unless it is empty or shorter than min_duration.
        if texts and (end - start) >= min_duration:
            segments.append(TranscriptSegment(
                segment_id=len(segments),  # ids follow output order
                start=start,
                end=end,
                text=" ".join(texts),
                source_audio=source_audio,
            ))

    seg_start = entries[0].start
    seg_end = entries[0].end
    seg_texts: List[str] = []

    for entry in entries:
        # Span the group would have if this entry joined it.
        prospective_end = max(seg_end, entry.end)
        if seg_texts and (prospective_end - seg_start) > max_duration:
            seal(seg_start, seg_end, seg_texts)
            seg_start = entry.start
            seg_end = entry.end
            seg_texts = []
        seg_texts.append(entry.text)
        # Never let an early-ending entry pull the segment end backwards.
        seg_end = max(seg_end, entry.end)

    # Seal the final segment
    seal(seg_start, seg_end, seg_texts)
    return segments