File size: 8,463 Bytes
0db822c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
"""
Parse JSON transcript files into timed segments.

Expected JSON format (one file per audio):
    {
      "video_id": "...",
      "title": "...",
      "transcript": [
        {"start": 1.605, "duration": 1.557, "text": "خير يا بيريوم؟"},
        {"start": 4.301, "duration": 3.45,  "text": "مصادرنا بتؤكد إن فيه\nمؤامرة اغتيال ضد حضرتك."},
        ...
      ]
    }

Each entry carries:
  - start    : float  — start time in seconds
  - duration : float  — length of this entry in seconds
  - text     : str    — transcript text (may contain \\n within a single entry)
"""

from __future__ import annotations

import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import List


# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------

@dataclass
class TranscriptEntry:
    """A single transcript line together with the time span it covers."""
    start: float   # start time in seconds
    end: float     # end time in seconds  (= start + duration)
    text: str      # normalized transcript text


@dataclass
class TranscriptSegment:
    """A group of consecutive transcript entries merged into one audio chunk."""
    segment_id: int    # running index of the segment within its source file
    start: float       # seconds — used to slice the audio
    end: float         # seconds — used to slice the audio
    text: str          # full normalized text for this segment
    source_audio: str  # stem of the original audio file (for traceability)


# ---------------------------------------------------------------------------
# Compiled regular expressions used by normalize_arabic()
# ---------------------------------------------------------------------------

# Arabic diacritics (tashkeel / harakat) — full Unicode range
# U+0610–U+061A : Arabic honorifics and signs
# U+064B–U+065F : Standard harakat (fathah, dammah, kasrah, tanwin, shadda, sukun …)
# U+0670        : Superscript alef
# U+06D6–U+06E4, U+06E7, U+06E8, U+06EA–U+06ED : Extended Arabic marks
_DIACRITICS_RE = re.compile(
    r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED]"
)

# Alef variants with hamza or madda → bare alef ا
# أ (U+0623)  إ (U+0625)  آ (U+0622)  ٱ (U+0671)
_ALEF_RE = re.compile(r"[أإآٱ]")

# Hamza on waw → plain waw  (ؤ U+0624 → و U+0648)
# Hamza on ya  → plain ya   (ئ U+0626 → ي U+064A)
# These are written inconsistently in Egyptian informal text;
# Whisper tends to output the base letter without hamza.
_HAMZA_WAW_RE = re.compile(r"ؤ")
_HAMZA_YA_RE  = re.compile(r"ئ")

# Tatweel / Kashida (U+0640) — decorative elongation, carries no phonetic value
_TATWEEL_RE = re.compile(r"\u0640")

# Dialogue dash at the very start of a string (speaker-turn marker)
_DIALOGUE_DASH_RE = re.compile(r"^\s*[-–—]\s*")

# Arabic punctuation characters and common Western punctuation that appear
# in transcript files but are never spoken.  We remove them so that labels
# and Whisper predictions can be compared without punctuation mismatch.
# Kept intentionally narrow — only characters Whisper never outputs for Arabic.
_PUNCTUATION_RE = re.compile(
    r"[،؛؟!\"\'«»\(\)\[\]\{\}\.\,\:\;\-–—…]"
)

# Collapse any run of two or more spaces into one
_MULTI_SPACE_RE = re.compile(r" {2,}")


# ---------------------------------------------------------------------------
# Text normalization
# ---------------------------------------------------------------------------

def normalize_arabic(text: str) -> str:
    """
    Normalize Arabic text for use as a Whisper fine-tuning target.

    Steps applied (in order):
    1. Remove tashkeel (diacritics) — Whisper never outputs them; training with
       diacritics in labels penalizes correct predictions.
    2. Remove tatweel/kashida (U+0640) — decorative character, not spoken.
    3. Unify Alef variants (أ إ آ ٱ → ا) — same phoneme written differently.
    4. Normalize hamza-on-waw (ؤ → و) and hamza-on-ya (ئ → ي) — Egyptian
       informal writing often omits the hamza; Whisper follows this convention.
    5. Strip dialogue dashes at line start — transcription-tool artifacts.
    6. Remove punctuation marks — never present in Whisper's Arabic output.
    7. Collapse extra whitespace left behind by the previous steps.
    """
    text = _DIACRITICS_RE.sub("", text)
    text = _TATWEEL_RE.sub("", text)
    text = _ALEF_RE.sub("ا", text)
    text = _HAMZA_WAW_RE.sub("و", text)
    text = _HAMZA_YA_RE.sub("ي", text)
    text = _DIALOGUE_DASH_RE.sub("", text)
    text = _PUNCTUATION_RE.sub("", text)
    text = _MULTI_SPACE_RE.sub(" ", text)
    return text.strip()


# ---------------------------------------------------------------------------
# Parsing
# ---------------------------------------------------------------------------

def parse_transcript_file(path: Path | str) -> List[TranscriptEntry]:
    """
    Read a JSON transcript file and return a list of TranscriptEntry objects
    sorted by start time.

    Each JSON entry is expected to have:
      "start"    : number  — start time in seconds
      "duration" : number  — length of this entry in seconds
      "text"     : string  — transcript text (may contain internal line breaks)

    Internal line breaks inside a single entry's text (e.g. a two-line
    subtitle) are replaced with a space before normalization — they represent
    a single continuous utterance, not separate dialogue turns.  All line-break
    styles are handled (\\n, \\r\\n, \\r), so a Windows-style subtitle does not
    leave a stray carriage return in the label.

    Empty entries (text is null, or becomes empty after normalization) are
    silently skipped.

    Args:
        path: Location of the JSON transcript file (UTF-8 encoded).

    Returns:
        The non-empty entries, ordered by ascending start time.

    Raises:
        KeyError: if the top-level "transcript" key or an entry's "start",
            "duration" or "text" key is missing.
        OSError / json.JSONDecodeError: if the file cannot be read or parsed.
    """
    path = Path(path)
    with path.open(encoding="utf-8") as fh:
        data = json.load(fh)

    entries: List[TranscriptEntry] = []
    for item in data["transcript"]:
        start = float(item["start"])
        end = start + float(item["duration"])

        # A JSON null is treated as an empty caption instead of crashing on
        # the string method below; a missing key still raises KeyError.
        raw_text = item["text"] or ""

        # Line breaks within one JSON entry = line-wrapped subtitle, not a new
        # speaker.  splitlines() recognises every line-break flavour.
        flattened = " ".join(raw_text.splitlines())
        text = normalize_arabic(flattened)

        if not text:  # skip entries that are empty after normalization
            continue

        entries.append(TranscriptEntry(start=start, end=end, text=text))

    entries.sort(key=lambda e: e.start)
    return entries


# ---------------------------------------------------------------------------
# Segmentation
# ---------------------------------------------------------------------------

def build_segments(
    entries: List[TranscriptEntry],
    source_audio: str,
    max_duration: float = 30.0,
    min_duration: float = 1.0,
) -> List[TranscriptSegment]:
    """
    Group TranscriptEntry objects into contiguous TranscriptSegments.

    Goals:
    - Each segment is as long as possible to give the model rich context.
    - No segment exceeds max_duration seconds (default 30 s).
    - Segments shorter than min_duration seconds (default 1 s) are discarded.

    How it works (greedy grouping):
    - Walk through entries in order (they are expected to be sorted by start).
    - Before adding entry i, check whether doing so would push the segment
      duration (from the segment start to the latest end seen) over
      max_duration.
    - If yes: seal the current segment, start a new segment beginning at
      entry.start, then add entry i to the new one.
    - After the loop, seal whatever remains.

    Edge cases:
    - Overlapping or nested entries: the segment end is the *latest* end time
      of any entry it contains, so an entry that finishes before its
      predecessor cannot shorten the audio slice below the text it carries.
    - A single entry that is itself longer than max_duration cannot be placed
      in any valid segment; it is discarded rather than emitted oversized.

    Args:
        entries: Transcript entries ordered by start time.
        source_audio: Identifier of the audio file, copied onto every segment.
        max_duration: Upper bound (seconds) on a segment's length.
        min_duration: Lower bound (seconds) on a segment's length.

    Returns:
        The retained segments, with segment_id numbered 0, 1, 2, … in output
        order.
    """
    if not entries:
        return []

    segments: List[TranscriptSegment] = []
    seg_start = 0.0
    seg_end = 0.0
    seg_texts: List[str] = []

    def seal() -> None:
        """Emit the pending segment if its length is within both bounds."""
        if not seg_texts:
            return
        length = seg_end - seg_start
        if min_duration <= length <= max_duration:
            segments.append(TranscriptSegment(
                # Ids advance only for emitted segments, so the next id is
                # simply the number of segments produced so far.
                segment_id=len(segments),
                start=seg_start,
                end=seg_end,
                text=" ".join(seg_texts),
                source_audio=source_audio,
            ))

    for entry in entries:
        # Span the segment would cover if this entry joined it.
        prospective_end = max(seg_end, entry.end)
        if seg_texts and (prospective_end - seg_start) > max_duration:
            seal()
            seg_texts = []

        if seg_texts:
            # Never let a nested/overlapping entry pull the end backwards.
            seg_end = max(seg_end, entry.end)
        else:
            # First entry of a fresh segment defines its initial span.
            seg_start = entry.start
            seg_end = entry.end

        seg_texts.append(entry.text)

    # Seal the final segment
    seal()

    return segments