"""
Parse JSON transcript files into timed segments.
Expected JSON format (one file per audio):
{
"video_id": "...",
"title": "...",
"transcript": [
{"start": 1.605, "duration": 1.557, "text": "خير يا بيريوم؟"},
{"start": 4.301, "duration": 3.45, "text": "مصادرنا بتؤكد إن فيه\nمؤامرة اغتيال ضد حضرتك."},
...
]
}
Each entry carries:
- start : float — start time in seconds
- duration : float — length of this entry in seconds
- text : str — transcript text (may contain \\n within a single entry)
"""
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import List
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass
class TranscriptEntry:
    """A single transcript line paired with the time span it covers.

    Both timestamps are expressed in seconds.
    """

    # Moment the line begins.
    start: float
    # Moment the line finishes, i.e. start + duration.
    end: float
    # Transcript text after normalization.
    text: str
@dataclass
class TranscriptSegment:
    """A group of transcript lines merged into one audio slice.

    ``start`` and ``end`` are in seconds and delimit the audio to cut out.
    """

    # Identifier of the segment.
    segment_id: int
    # Where the audio slice begins (seconds).
    start: float
    # Where the audio slice stops (seconds).
    end: float
    # Complete normalized text spoken inside the slice.
    text: str
    # Stem of the originating audio file, kept for traceability.
    source_audio: str
# ---------------------------------------------------------------------------
# Compiled regular expressions used by normalize_arabic()
# ---------------------------------------------------------------------------
# Invisible bidirectional-control characters that subtitle tools frequently
# inject around Arabic text. They are never spoken and never visible, so they
# must not end up in a training label.
#   U+061C          : Arabic letter mark
#   U+200E, U+200F  : Left-to-right / right-to-left mark
#   U+202A–U+202E   : Directional embeddings and overrides
#   U+2066–U+2069   : Directional isolates
#   U+FEFF          : Byte-order mark / zero-width no-break space
_BIDI_CONTROL_RE = re.compile(
    r"[\u061C\u200E\u200F\u202A-\u202E\u2066-\u2069\uFEFF]"
)

# Arabic diacritics (tashkeel / harakat) and extended annotation marks
#   U+0610–U+061A : Arabic honorifics and signs
#   U+064B–U+065F : Standard harakat (fathah, dammah, kasrah, tanwin, shadda, sukun …)
#   U+0670        : Superscript alef
#   U+06D6–U+06DC, U+06DF–U+06E4, U+06E7, U+06E8, U+06EA–U+06ED : Extended Arabic marks
#     (the class skips U+06DD "end of ayah" and U+06DE "start of rub el hizb")
_DIACRITICS_RE = re.compile(
    r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED]"
)

# Alef variants with hamza or madda → bare alef ا
#   أ (U+0623)  إ (U+0625)  آ (U+0622)  ٱ (U+0671)
_ALEF_RE = re.compile(r"[أإآٱ]")

# Hamza on waw → plain waw  (ؤ U+0624 → و U+0648)
# Hamza on ya  → plain ya   (ئ U+0626 → ي U+064A)
# These are written inconsistently in Egyptian informal text;
# Whisper tends to output the base letter without hamza.
_HAMZA_WAW_RE = re.compile(r"ؤ")
_HAMZA_YA_RE = re.compile(r"ئ")

# Tatweel / Kashida (U+0640) — decorative elongation, carries no phonetic value
_TATWEEL_RE = re.compile(r"\u0640")

# Dialogue dash at the very start of a string (speaker-turn marker)
_DIALOGUE_DASH_RE = re.compile(r"^\s*[-–—]\s*")

# Arabic punctuation characters and common Western punctuation that appear
# in transcript files but are never spoken. We remove them so that labels
# and Whisper predictions can be compared without punctuation mismatch.
# Kept intentionally narrow — only characters Whisper never outputs for Arabic.
_PUNCTUATION_RE = re.compile(
    r"[،؛؟!\"\'«»\(\)\[\]\{\}\.\,\:\;\-–—…]"
)

# Any run of whitespace (spaces, tabs, CR/LF, non-breaking spaces, …).
# Each run is replaced by a single ordinary space.
_MULTI_SPACE_RE = re.compile(r"\s+")


# ---------------------------------------------------------------------------
# Text normalization
# ---------------------------------------------------------------------------
def normalize_arabic(text: str) -> str:
    """
    Normalize Arabic text for use as a Whisper fine-tuning target.

    Steps applied (in order):
    1. Remove invisible bidi control marks (RLM, LRM, embeddings, BOM …).
    2. Remove tashkeel (diacritics) — Whisper never outputs them; training with
       diacritics in labels penalizes correct predictions.
    3. Remove tatweel/kashida (U+0640) — decorative character, not spoken.
    4. Unify Alef variants (أ إ آ ٱ → ا) — same phoneme written differently.
    5. Normalize hamza-on-waw (ؤ → و) and hamza-on-ya (ئ → ي) — Egyptian
       informal writing often omits the hamza; Whisper follows this convention.
    6. Strip a dialogue dash at the start of the string — transcription-tool
       artifact.
    7. Replace punctuation marks with a space. A space (rather than nothing)
       keeps words apart when the punctuation was their only separator,
       e.g. "نعم،لا" must not become one word.
    8. Collapse every run of whitespace (including tabs, line breaks and
       non-breaking spaces) into a single space and trim both ends.

    Args:
        text: Raw transcript text.

    Returns:
        The normalized text; an empty string when nothing speakable remains.
    """
    # Bidi marks go first so they cannot hide a leading dialogue dash.
    text = _BIDI_CONTROL_RE.sub("", text)
    text = _DIACRITICS_RE.sub("", text)
    text = _TATWEEL_RE.sub("", text)
    text = _ALEF_RE.sub("ا", text)
    text = _HAMZA_WAW_RE.sub("و", text)
    text = _HAMZA_YA_RE.sub("ي", text)
    text = _DIALOGUE_DASH_RE.sub("", text)
    # Substitute a space, not "", so neighbouring words are never glued.
    text = _PUNCTUATION_RE.sub(" ", text)
    text = _MULTI_SPACE_RE.sub(" ", text)
    return text.strip()
# ---------------------------------------------------------------------------
# Parsing
# ---------------------------------------------------------------------------
def parse_transcript_file(path: Path | str) -> List[TranscriptEntry]:
    """
    Read a JSON transcript file and return a list of TranscriptEntry objects
    sorted by start time.

    Each JSON entry is expected to have:
        "start"    : number — start time in seconds
        "duration" : number — length of this entry in seconds
        "text"     : string — transcript text (may contain internal line breaks)

    Line breaks inside a single entry's text (e.g. a two-line subtitle) are
    replaced with a space before normalization — they represent a single
    continuous utterance, not separate dialogue turns. Every line-break
    convention recognized by ``str.splitlines`` is handled (LF, CR LF, CR,
    U+2028 …), so no stray control character survives in the text.

    Entries whose text is empty after normalization are silently skipped.

    Args:
        path: Location of the UTF-8 encoded JSON transcript file.

    Returns:
        The non-empty entries, ordered by ascending start time.

    Raises:
        KeyError: If "transcript", "start", "duration" or "text" is missing.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with Path(path).open(encoding="utf-8") as fh:
        data = json.load(fh)

    entries: List[TranscriptEntry] = []
    for item in data["transcript"]:
        start = float(item["start"])
        end = start + float(item["duration"])
        # A line break within one JSON entry is a wrapped subtitle, not a new
        # speaker. splitlines() also covers "\r\n" and "\r", which a plain
        # replace("\n", " ") would leave behind as a stray "\r".
        raw_text = " ".join(item["text"].splitlines())
        text = normalize_arabic(raw_text)
        if not text:  # nothing speakable left after normalization
            continue
        entries.append(TranscriptEntry(start=start, end=end, text=text))

    entries.sort(key=lambda e: e.start)
    return entries
# ---------------------------------------------------------------------------
# Segmentation
# ---------------------------------------------------------------------------
def build_segments(
    entries: List[TranscriptEntry],
    source_audio: str,
    max_duration: float = 30.0,
    min_duration: float = 1.0,
) -> List[TranscriptSegment]:
    """
    Group TranscriptEntry objects into contiguous TranscriptSegments.

    Goals:
    - Each segment is as long as possible to give the model rich context.
    - No segment exceeds max_duration seconds (default 30 s) — this keeps
      every audio chunk inside Whisper's 30-second encoder window.
    - Segments shorter than min_duration seconds (default 1 s) are discarded.

    How it works (greedy grouping):
    - Walk through entries in order (they are expected to be sorted by start).
    - An entry that is by itself longer than max_duration can never fit into
      a valid segment: the pending segment is sealed and the entry is dropped.
    - Before adding entry i, check whether doing so would push the segment
      duration (from seg_start to entry.end) over max_duration. If yes, seal
      the pending segment and start a new one at entry.start.
    - The segment end is the latest end seen so far, so an entry nested inside
      an earlier, longer entry never shortens the audio slice.
    - After the loop, seal whatever remains.

    Args:
        entries: Transcript entries ordered by start time.
        source_audio: Stem of the audio file, copied into every segment.
        max_duration: Upper bound for a segment's length in seconds.
        min_duration: Lower bound; shorter segments are discarded.

    Returns:
        The segments in chronological order with consecutive ids from 0.
    """
    segments: List[TranscriptSegment] = []
    seg_texts: List[str] = []
    seg_start = 0.0
    seg_end = 0.0

    def seal() -> None:
        """Emit the pending segment if it is long enough, then reset it."""
        if seg_texts and (seg_end - seg_start) >= min_duration:
            segments.append(TranscriptSegment(
                # Ids stay consecutive because only emitted segments count.
                segment_id=len(segments),
                start=seg_start,
                end=seg_end,
                text=" ".join(seg_texts),
                source_audio=source_audio,
            ))
        seg_texts.clear()

    for entry in entries:
        if (entry.end - entry.start) > max_duration:
            # Too long on its own: emitting it would break the max_duration
            # guarantee. Close the pending segment so it stays contiguous.
            seal()
            continue

        if seg_texts and (entry.end - seg_start) > max_duration:
            seal()

        if seg_texts:
            # Never let an overlapping/nested entry pull the end backwards.
            seg_end = max(seg_end, entry.end)
        else:
            seg_start = entry.start
            seg_end = entry.end
        seg_texts.append(entry.text)

    # Seal the final segment
    seal()
    return segments
|