Spaces:

MIP-Tech
/

Speach-To-Text

Sleeping

File size: 8,463 Bytes

0db822c

"""
Parse JSON transcript files into timed segments.

Expected JSON format (one file per audio):
    {
      "video_id": "...",
      "title": "...",
      "transcript": [
        {"start": 1.605, "duration": 1.557, "text": "خير يا بيريوم؟"},
        {"start": 4.301, "duration": 3.45,  "text": "مصادرنا بتؤكد إن فيه\nمؤامرة اغتيال ضد حضرتك."},
        ...
      ]
    }

Each entry carries:
  - start    : float  — start time in seconds
  - duration : float  — length of this entry in seconds
  - text     : str    — transcript text (may contain \\n within a single entry)
"""

from __future__ import annotations

import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import List


# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------

@dataclass
class TranscriptEntry:
    """A single transcript line with its time span, both bounds in seconds."""

    # Time at which the line starts, in seconds.
    start: float
    # Time at which the line ends, in seconds (start + duration).
    end: float
    # Transcript text after normalization.
    text: str


@dataclass
class TranscriptSegment:
    """A run of transcript entries grouped into one audio slice."""

    # Sequential identifier of the segment.
    segment_id: int
    # Slice boundaries in seconds — these are used to cut the audio.
    start: float
    end: float
    # Complete normalized text covered by the segment.
    text: str
    # Stem of the audio file the segment came from (kept for traceability).
    source_audio: str


# ---------------------------------------------------------------------------
# Compiled regular expressions used by normalize_arabic()
# ---------------------------------------------------------------------------

# Arabic diacritics (tashkeel / harakat) and related combining marks
# U+0610–U+061A : Arabic honorifics and signs
# U+064B–U+065F : Standard harakat (fathah, dammah, kasrah, tanwin, shadda, sukun …)
# U+0670        : Superscript alef
# U+06D6–U+06DC, U+06DF–U+06E4, U+06E7, U+06E8, U+06EA–U+06ED : Extended Arabic marks
#   (U+06DD, U+06DE, U+06E5, U+06E6 and U+06E9 are not part of the class:
#    they are standalone symbols / letters rather than combining marks.)
_DIACRITICS_RE = re.compile(
    r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED]"
)

# Alef variants with hamza or madda → bare alef ا
# أ (U+0623)  إ (U+0625)  آ (U+0622)  ٱ (U+0671)
# Written with \u escapes so the pattern cannot be altered by an editor that
# re-normalizes the source file.
_ALEF_RE = re.compile(r"[\u0623\u0625\u0622\u0671]")

# Hamza on waw → plain waw  (ؤ U+0624 → و U+0648)
# Hamza on ya  → plain ya   (ئ U+0626 → ي U+064A)
# These are written inconsistently in Egyptian informal text;
# Whisper tends to output the base letter without hamza.
_HAMZA_WAW_RE = re.compile(r"\u0624")
_HAMZA_YA_RE  = re.compile(r"\u0626")

# Tatweel / Kashida (U+0640) — decorative elongation, carries no phonetic value
_TATWEEL_RE = re.compile(r"\u0640")

# Dialogue dash at the very start of a string (speaker-turn marker).
# Only the start of the whole string is matched (no MULTILINE flag); dashes
# anywhere else are removed by _PUNCTUATION_RE below.
_DIALOGUE_DASH_RE = re.compile(r"^\s*[-–—]\s*")

# Arabic punctuation characters and common Western punctuation that appear
# in transcript files but are never spoken.  We remove them so that labels
# and Whisper predictions can be compared without punctuation mismatch.
# Kept intentionally narrow — only characters Whisper never outputs for Arabic.
# NOTE: matches are deleted, not replaced by a space, so a mark sitting
# between two words with no surrounding whitespace joins those words.
_PUNCTUATION_RE = re.compile(
    r"[،؛؟!\"\'«»\(\)\[\]\{\}\.\,\:\;\-–—…]"
)

# Collapse any run of whitespace into one plain space.  \s is Unicode-aware
# for str patterns, so tabs, CR/LF and no-break spaces are covered as well
# as ordinary spaces.
_MULTI_SPACE_RE = re.compile(r"\s+")


# ---------------------------------------------------------------------------
# Text normalization
# ---------------------------------------------------------------------------

def normalize_arabic(text: str) -> str:
    """
    Normalize Arabic text for use as a Whisper fine-tuning target.

    Steps applied (in order):
    1. Remove tashkeel (diacritics) — Whisper never outputs them; training with
       diacritics in labels penalizes correct predictions.
    2. Remove tatweel/kashida (U+0640) — decorative character, not spoken.
    3. Unify Alef variants (أ إ آ ٱ → ا) — same phoneme written differently.
    4. Normalize hamza-on-waw (ؤ → و) and hamza-on-ya (ئ → ي) — Egyptian
       informal writing often omits the hamza; Whisper follows this convention.
    5. Strip a dialogue dash at the very start of the text —
       transcription-tool artifact.
    6. Remove punctuation marks — never present in Whisper's Arabic output.
    7. Collapse every run of whitespace (spaces, tabs, line breaks, no-break
       spaces) into a single space and trim both ends.

    Args:
        text: Raw transcript text.

    Returns:
        The normalized text; an empty string when nothing but removable
        characters and whitespace was present.
    """
    text = _DIACRITICS_RE.sub("", text)
    text = _TATWEEL_RE.sub("", text)
    text = _ALEF_RE.sub("\u0627", text)       # → ا
    text = _HAMZA_WAW_RE.sub("\u0648", text)  # → و
    text = _HAMZA_YA_RE.sub("\u064A", text)   # → ي
    text = _DIALOGUE_DASH_RE.sub("", text)
    text = _PUNCTUATION_RE.sub("", text)
    # Runs last so the gaps left by the removals above are collapsed too.
    text = _MULTI_SPACE_RE.sub(" ", text)
    return text.strip()


# ---------------------------------------------------------------------------
# Parsing
# ---------------------------------------------------------------------------

def parse_transcript_file(path: Path | str) -> List[TranscriptEntry]:
    """
    Read a JSON transcript file and return a list of TranscriptEntry objects
    sorted by start time.

    Each JSON entry is expected to have:
      "start"    : number  — start time in seconds
      "duration" : number  — length of this entry in seconds
      "text"     : string  — transcript text (may contain internal line breaks)

    Internal line breaks inside a single entry's text (e.g. a two-line
    subtitle) are replaced with a space before normalization — they represent
    a single continuous utterance, not separate dialogue turns.  Every line
    boundary recognised by str.splitlines() is handled (\\n, \\r\\n, \\r and
    the Unicode line/paragraph separators), so a stray carriage return from a
    CRLF-wrapped subtitle cannot end up inside the text.

    Empty entries (text becomes empty after normalization) are silently skipped.

    Args:
        path: Location of the JSON transcript file.

    Returns:
        The non-empty entries, ordered by ascending start time.

    Raises:
        OSError: the file cannot be opened.
        json.JSONDecodeError: the file is not valid JSON.
        KeyError: "transcript", or an entry's "start" / "duration" / "text"
            key, is missing.
    """
    path = Path(path)
    with path.open(encoding="utf-8") as fh:
        data = json.load(fh)

    entries: List[TranscriptEntry] = []
    for item in data["transcript"]:
        start = float(item["start"])
        end = start + float(item["duration"])

        # A line break within one JSON entry = line-wrapped subtitle, not a
        # new speaker.  splitlines() + join turns each break into one space.
        raw_text = " ".join(item["text"].splitlines())
        text = normalize_arabic(raw_text)

        if not text:  # skip entries that are empty after normalization
            continue

        entries.append(TranscriptEntry(start=start, end=end, text=text))

    entries.sort(key=lambda e: e.start)
    return entries


# ---------------------------------------------------------------------------
# Segmentation
# ---------------------------------------------------------------------------

def build_segments(
    entries: List[TranscriptEntry],
    source_audio: str,
    max_duration: float = 30.0,
    min_duration: float = 1.0,
) -> List[TranscriptSegment]:
    """
    Group TranscriptEntry objects into contiguous TranscriptSegments.

    Goals:
    - Each segment is as long as possible to give the model rich context.
    - A segment built from several entries never exceeds max_duration seconds
      (default 30 s) — this keeps the audio chunk inside Whisper's 30-second
      encoder window.  A single entry that is by itself longer than
      max_duration cannot be split here and is emitted as its own (over-long)
      segment.
    - Segments shorter than min_duration seconds (default 1 s) are discarded.

    How it works (greedy grouping):
    - Walk through entries in the order given; they are expected to be sorted
      by start time, as parse_transcript_file() returns them.
    - Before adding entry i, check whether doing so would push the segment
      duration (from seg_start to the latest end time seen) over max_duration.
    - If yes: seal the current segment, start a new segment beginning at
      entry.start, then add entry i to the new one.
    - After the loop, seal whatever remains.

    The segment end is the latest end time of any entry in the segment, so an
    entry that finishes before an earlier, overlapping entry does not pull the
    end backwards and cut the audio short.

    Args:
        entries: Transcript entries to group.
        source_audio: Stem of the originating audio file, copied into every
            segment.
        max_duration: Upper bound, in seconds, for a multi-entry segment.
        min_duration: Segments shorter than this many seconds are dropped.

    Returns:
        The kept segments, with segment_id numbered consecutively from 0.
    """
    if not entries:
        return []

    segments: List[TranscriptSegment] = []

    def _seal(start: float, end: float, texts: List[str]) -> None:
        """Append the pending segment unless it is empty or too short."""
        if texts and (end - start) >= min_duration:
            segments.append(TranscriptSegment(
                # Ids stay consecutive because discarded segments never
                # reach this point.
                segment_id=len(segments),
                start=start,
                end=end,
                text=" ".join(texts),
                source_audio=source_audio,
            ))

    seg_start = entries[0].start
    seg_last_end = entries[0].end
    seg_texts: List[str] = []

    for entry in entries:
        # End the segment would have if this entry joined it.
        prospective_end = max(seg_last_end, entry.end)
        if seg_texts and (prospective_end - seg_start) > max_duration:
            _seal(seg_start, seg_last_end, seg_texts)
            seg_start = entry.start
            seg_last_end = entry.end
            seg_texts = []

        seg_texts.append(entry.text)
        # Never move the end backwards (entries may overlap).
        seg_last_end = max(seg_last_end, entry.end)

    # Seal the final segment
    _seal(seg_start, seg_last_end, seg_texts)

    return segments