Spaces:

MIP-Tech
/

Speach-To-Text

Sleeping

App Files Files Community

Speach-To-Text / src /data_preparation /parse_transcripts.py

MIP-Tech

Deploy to HF Spaces

0db822c 23 days ago

raw

history blame contribute delete

8.46 kB

	"""
	Parse JSON transcript files into timed segments.

	Expected JSON format (one file per audio):
	{
	"video_id": "...",
	"title": "...",
	"transcript": [
	{"start": 1.605, "duration": 1.557, "text": "خير يا بيريوم؟"},
	{"start": 4.301, "duration": 3.45, "text": "مصادرنا بتؤكد إن فيه\nمؤامرة اغتيال ضد حضرتك."},
	...
	]
	}

	Each entry carries:
	- start : float — start time in seconds
	- duration : float — length of this entry in seconds
	- text : str — transcript text (may contain \\n within a single entry)
	"""

	from __future__ import annotations

	import json
	import re
	from dataclasses import dataclass
	from pathlib import Path
	from typing import List


	# ---------------------------------------------------------------------------
	# Data classes
	# ---------------------------------------------------------------------------

	@dataclass
	class TranscriptEntry:
	    """A single transcript line with its time span, expressed in seconds."""

	    # Time at which the entry begins, in seconds.
	    start: float
	    # Time at which the entry stops, in seconds (start + duration).
	    end: float
	    # Transcript text after normalization.
	    text: str


	@dataclass
	class TranscriptSegment:
	    """A group of consecutive transcript entries mapped to one audio slice."""

	    # Identifier of the segment.
	    segment_id: int
	    # Slice start in seconds — used to cut the audio.
	    start: float
	    # Slice end in seconds — used to cut the audio.
	    end: float
	    # Complete normalized text covered by this segment.
	    text: str
	    # Stem of the original audio file, kept for traceability.
	    source_audio: str


	# ---------------------------------------------------------------------------
	# Compiled regular expressions used by normalize_arabic()
	# ---------------------------------------------------------------------------

	# Arabic diacritics (tashkeel / harakat) and related combining marks.
	#   U+0610–U+061A : Arabic honorifics and signs
	#   U+064B–U+065F : standard harakat (fathah, dammah, kasrah, tanwin, shadda, sukun …)
	#   U+0670        : superscript alef
	#   U+06D6–U+06DC, U+06DF–U+06E4, U+06E7, U+06E8, U+06EA–U+06ED : extended Arabic marks
	# Note: U+06DD and U+06DE are not covered by this character class.
	_DIACRITICS_RE = re.compile(
	    r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED]"
	)

	# Alef variants carrying hamza, madda or wasla; callers replace them with bare alef (U+0627).
	#   U+0623 alef with hamza above, U+0625 alef with hamza below,
	#   U+0622 alef with madda,       U+0671 alef wasla
	_ALEF_RE = re.compile(r"[\u0623\u0625\u0622\u0671]")

	# Hamza on waw (U+0624) is replaced with plain waw (U+0648) and hamza on ya (U+0626)
	# with plain ya (U+064A).  These are written inconsistently in Egyptian
	# informal text; Whisper tends to output the base letter without hamza.
	_HAMZA_WAW_RE = re.compile(r"\u0624")
	_HAMZA_YA_RE = re.compile(r"\u0626")

	# Tatweel / kashida (U+0640) — decorative elongation, carries no phonetic value.
	_TATWEEL_RE = re.compile(r"\u0640")

	# Dialogue dash at the very start of a string (speaker-turn marker), together
	# with any whitespace around it.  The `*` quantifiers matter: without them the
	# pattern demands exactly one whitespace character on each side of the dash
	# and therefore misses the common "- text" and "-text" forms.
	_DIALOGUE_DASH_RE = re.compile(r"^\s*[-–—]\s*")

	# Arabic punctuation characters and common Western punctuation that appear
	# in transcript files but are never spoken.  They are removed so that labels
	# and Whisper predictions can be compared without punctuation mismatch.
	# Kept intentionally narrow — only characters Whisper never outputs for Arabic.
	_PUNCTUATION_RE = re.compile(
	    r"[،؛؟!\"\'«»\(\)\[\]\{\}\.\,\:\;\-–—…]"
	)

	# Any run of whitespace (spaces, tabs, CR/LF, no-break space, …).  Substituting
	# a single space collapses the run; a lone space is replaced by itself, so
	# already-clean text is unaffected.
	_MULTI_SPACE_RE = re.compile(r"\s+")


	# ---------------------------------------------------------------------------
	# Text normalization
	# ---------------------------------------------------------------------------

	def normalize_arabic(text: str) -> str:
	    """
	    Return *text* cleaned up for use as a Whisper fine-tuning label.

	    The module-level patterns are applied one after another, in this order:
	      1. _DIACRITICS_RE    -> ""   tashkeel is dropped; Whisper never outputs it,
	                                   so keeping it would penalize correct predictions.
	      2. _TATWEEL_RE       -> ""   kashida is decorative and not spoken.
	      3. _ALEF_RE          -> bare alef; the variants are the same phoneme.
	      4. _HAMZA_WAW_RE     -> waw, then _HAMZA_YA_RE -> ya; Egyptian informal
	                                   writing often omits the hamza.
	      5. _DIALOGUE_DASH_RE -> ""   speaker-turn dash at the start of the string.
	      6. _PUNCTUATION_RE   -> ""   punctuation marks are not spoken.
	      7. _MULTI_SPACE_RE   -> " "  whitespace runs left behind are collapsed.

	    Leading and trailing whitespace is stripped from the final result.
	    """
	    # Order matters: the whitespace collapse has to run after the removals
	    # that can leave gaps behind.
	    substitutions = (
	        (_DIACRITICS_RE, ""),
	        (_TATWEEL_RE, ""),
	        (_ALEF_RE, "ا"),
	        (_HAMZA_WAW_RE, "و"),
	        (_HAMZA_YA_RE, "ي"),
	        (_DIALOGUE_DASH_RE, ""),
	        (_PUNCTUATION_RE, ""),
	        (_MULTI_SPACE_RE, " "),
	    )
	    cleaned = text
	    for pattern, replacement in substitutions:
	        cleaned = pattern.sub(replacement, cleaned)
	    return cleaned.strip()


	# ---------------------------------------------------------------------------
	# Parsing
	# ---------------------------------------------------------------------------

	def parse_transcript_file(path: Path | str) -> List[TranscriptEntry]:
	    """
	    Read a JSON transcript file and return its entries sorted by start time.

	    The file must hold an object with a "transcript" list whose items have:
	        "start"    : number — start time in seconds
	        "duration" : number — length of this entry in seconds
	        "text"     : string — transcript text (may contain line breaks)

	    Line breaks inside a single entry's text (a line-wrapped subtitle) are
	    replaced with a space before normalization: they belong to one
	    continuous utterance, not to separate dialogue turns.  All line-break
	    styles recognised by str.splitlines (LF, CRLF, CR, ...) are handled, so
	    a carriage return from a Windows-style subtitle cannot leak into the text.

	    Entries whose text is empty after normalization are silently skipped.

	    Args:
	        path: Location of the JSON transcript file.

	    Returns:
	        TranscriptEntry objects ordered by ascending start time.

	    Raises:
	        OSError: the file cannot be opened.
	        ValueError: the file is not valid JSON, or a start/duration value
	            cannot be converted to float.
	        KeyError: "transcript", "start", "duration" or "text" is missing.
	    """
	    with Path(path).open(encoding="utf-8") as fh:
	        data = json.load(fh)

	    entries: List[TranscriptEntry] = []
	    for item in data["transcript"]:
	        start = float(item["start"])
	        end = start + float(item["duration"])

	        # A line break within one JSON entry is a wrapped subtitle line, not a
	        # new speaker; join the pieces with spaces (extra spaces are collapsed
	        # by the normalizer).
	        raw_text = " ".join(item["text"].splitlines())
	        text = normalize_arabic(raw_text)

	        if text:  # drop entries that normalize to nothing
	            entries.append(TranscriptEntry(start=start, end=end, text=text))

	    entries.sort(key=lambda e: e.start)
	    return entries


	# ---------------------------------------------------------------------------
	# Segmentation
	# ---------------------------------------------------------------------------

	def build_segments(
	    entries: List[TranscriptEntry],
	    source_audio: str,
	    max_duration: float = 30.0,
	    min_duration: float = 1.0,
	) -> List[TranscriptSegment]:
	    """
	    Greedily pack consecutive transcript entries into TranscriptSegments.

	    Entries are consumed in the order given; callers are expected to pass
	    them sorted by start time.  An entry joins the currently open segment
	    unless that would stretch the segment (measured from the segment start
	    to its latest end time) beyond max_duration.  In that case the open
	    segment is sealed and the entry opens a new segment at its own start.
	    Whatever is still open after the last entry is sealed as well.

	    A sealed segment is kept only if min_duration <= duration <= max_duration:
	      - shorter segments are discarded;
	      - a segment can exceed max_duration only when it consists of a single
	        entry that is itself longer than max_duration.  It is discarded too,
	        so that no returned segment exceeds max_duration (the default of
	        30 s keeps every audio chunk inside Whisper's 30-second window).

	    The end of a segment is the latest end time among its entries, so an
	    entry nested inside an earlier, longer entry cannot pull the end
	    backwards and cut off audio.

	    Args:
	        entries: Transcript entries, ordered by start time.
	        source_audio: Stem of the original audio file, copied to each segment.
	        max_duration: Upper bound on a segment's length in seconds.
	        min_duration: Lower bound on a segment's length in seconds.

	    Returns:
	        The kept segments in the order they were sealed; segment_id counts
	        up from 0 over kept segments only.
	    """
	    if not entries:
	        return []

	    segments: List[TranscriptSegment] = []

	    def seal(start: float, end: float, texts: List[str]) -> None:
	        # Emit the segment only if it is non-empty and its length is in bounds.
	        if texts and min_duration <= (end - start) <= max_duration:
	            segments.append(TranscriptSegment(
	                segment_id=len(segments),
	                start=start,
	                end=end,
	                text=" ".join(texts),
	                source_audio=source_audio,
	            ))

	    seg_start = entries[0].start
	    seg_end = entries[0].end
	    seg_texts: List[str] = []

	    for entry in entries:
	        # Would the open segment become too long if this entry were added?
	        if seg_texts and (max(seg_end, entry.end) - seg_start) > max_duration:
	            seal(seg_start, seg_end, seg_texts)
	            seg_start = entry.start
	            seg_end = entry.end
	            seg_texts = []

	        seg_texts.append(entry.text)
	        seg_end = max(seg_end, entry.end)

	    # Seal the final segment.
	    seal(seg_start, seg_end, seg_texts)
	    return segments