Spaces:

hetchyy
/

quranic-universal-aligner

Running on Zero

App Files Files Community

quranic-universal-aligner / src /alignment /special_segments.py

hetchyy

Upload folder using huggingface_hub

602b5d3 verified 14 days ago

raw

history blame contribute delete

18 kB

	"""
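	Phoneme-based detection of special (non-verse) recitation segments.

	Two groups are covered: the openers Isti'adha and Basmala, and short transition
	phrases (Amin, Takbir, Tahmeed, Tasleem, Sadaqa).

	The openers are common and need special handling: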
	- Isti'adha: "أَعُوذُ بِٱللَّهِ مِنَ الشَّيْطَانِ الرَّجِيم" (I seek refuge in Allah)
	- Basmala: "بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيم" (In the name of Allah)

	Detection uses phoneme edit distance for robustness against ASR errors.
	"""

	from __future__ import annotations

	from typing import List, Tuple, Optional

	# =============================================================================
	# Constants
	# =============================================================================

	from config import MAX_SPECIAL_EDIT_DISTANCE, MAX_TRANSITION_EDIT_DISTANCE
	from src.core.debug_collector import get_debug_collector

	# Reference phoneme sequences for the two recitation openers, keyed by the
	# reference name used in detection results.
	SPECIAL_PHONEMES = {
	    "Isti'adha": [
	        "ʔ", "a", "ʕ", "u:", "ð", "u",
	        "b", "i", "ll", "a:", "h", "i",
	        "m", "i", "n", "a",
	        "ʃʃ", "a", "j", "tˤ", "aˤ:", "n", "i",
	        "rˤrˤ", "aˤ", "ʒ", "i:", "m",
	    ],
	    "Basmala": [
	        "b", "i", "s", "m", "i",
	        "ll", "a:", "h", "i",
	        "rˤrˤ", "aˤ", "ħ", "m", "a:", "n", "i",
	        "rˤrˤ", "aˤ", "ħ", "i:", "m",
	    ],
	}

	# Isti'adha immediately followed by Basmala, used to recognise both openers
	# when they arrive inside a single segment.
	COMBINED_PHONEMES = [
	    *SPECIAL_PHONEMES["Isti'adha"],
	    *SPECIAL_PHONEMES["Basmala"],
	]

	# Arabic display text for each opener.
	SPECIAL_TEXT = {
	    "Isti'adha": "أَعُوذُ بِٱللَّهِ مِنَ الشَّيْطَانِ الرَّجِيم",
	    "Basmala": "بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيم",
	}

	# Transition phoneme sequences (non-Quranic phrases within recitations).
	# Several entries repeat the same phoneme runs, so the shared runs are
	# written once here and composed below.
	_TAKBIR_PHONEMES = [
	    "ʔ", "a", "lˤlˤ", "aˤ:", "h", "u",
	    "ʔ", "a", "k", "b", "a", "rˤ",
	]
	_TAHMEED_CALL_PHONEMES = [
	    "s", "a", "m", "i", "ʕ", "a",
	    "lˤlˤ", "aˤ:", "h", "u",
	    "l", "i", "m", "a", "n",
	    "ħ", "a", "m", "i", "d", "a", "h",
	]
	_TAHMEED_RESPONSE_PHONEMES = [
	    "rˤ", "aˤ", "bb", "a", "n", "a:",
	    "w", "a", "l", "a", "k", "a",
	    "l", "ħ", "a", "m", "d",
	]

	# Key order is kept as-is: matching picks the first entry with the lowest
	# distance, so the order decides ties. Every value is its own list object.
	TRANSITION_PHONEMES = {
	    "Amin": ["ʔ", "a:", "m", "i:", "n"],
	    "Takbir": [*_TAKBIR_PHONEMES],
	    # The same phrase said twice in a row.
	    "Takbir_double": [*_TAKBIR_PHONEMES, *_TAKBIR_PHONEMES],
	    "Tahmeed": [*_TAHMEED_CALL_PHONEMES],
	    # Call and response in one segment, joined by a single "u".
	    "Tahmeed_combined": [
	        *_TAHMEED_CALL_PHONEMES,
	        "u",
	        *_TAHMEED_RESPONSE_PHONEMES,
	    ],
	    "Tahmeed_response": [*_TAHMEED_RESPONSE_PHONEMES],
	    "Tasleem": [
	        "ʔ", "a", "ss", "a", "l", "a", "m", "u",
	        "ʕ", "a", "l", "a", "j", "k", "u", "m",
	        "w", "a", "rˤ", "aˤ", "ħ", "m", "a", "t", "u", "lˤlˤ", "aˤ:", "h",
	    ],
	    "Sadaqa": [
	        "sˤ", "aˤ", "d", "a", "q", "aˤ",
	        "lˤlˤ", "aˤ:", "h", "u",
	        "l", "ʕ", "a", "ðˤ", "i:", "m",
	    ],
	}

	# Arabic display text, keyed by base transition name (variants have no entry).
	TRANSITION_TEXT = {
	    "Amin": "آمِين",
	    "Takbir": "اللَّهُ أَكْبَر",
	    "Tahmeed": "سَمِعَ اللَّهُ لِمَنْ حَمِدَه",
	    "Tasleem": "ٱلسَّلَامُ عَلَيْكُمْ وَرَحْمَةُ ٱللَّه",
	    "Sadaqa": "صَدَقَ ٱللَّهُ ٱلْعَظِيم",
	}

	# All special segment reference names (for unified rendering).
	ALL_SPECIAL_REFS = {
	    "Basmala",
	    "Isti'adha",
	    "Isti'adha+Basmala",
	    "Amin",
	    "Takbir",
	    "Tahmeed",
	    "Tasleem",
	    "Sadaqa",
	}


	# =============================================================================
	# Levenshtein Distance
	# =============================================================================

	def levenshtein_distance(seq1: List[str], seq2: List[str]) -> int:
	    """
	    Return the Levenshtein edit distance between two phoneme sequences.

	    Insertions, deletions and substitutions each cost 1. Elements are
	    compared with ``==``, so a multi-character phoneme such as "u:" counts
	    as a single symbol.

	    Args:
	        seq1: First sequence (list of phonemes)
	        seq2: Second sequence (list of phonemes)

	    Returns:
	        Minimum number of single-element edits turning seq1 into seq2
	    """
	    rows, cols = len(seq1), len(seq2)

	    # With one side empty, the distance is simply the other side's length.
	    if rows == 0 or cols == 0:
	        return max(rows, cols)

	    # Only the previous row of the DP table is needed to build the next one.
	    above = list(range(cols + 1))

	    for row_no, left_symbol in enumerate(seq1, start=1):
	        current = [row_no]  # column 0: delete every symbol seen so far
	        for col_no, right_symbol in enumerate(seq2, start=1):
	            if left_symbol == right_symbol:
	                cost = above[col_no - 1]  # match: carry the diagonal
	            else:
	                cost = 1 + min(
	                    above[col_no],        # deletion
	                    current[col_no - 1],  # insertion
	                    above[col_no - 1],    # substitution
	                )
	            current.append(cost)
	        above = current

	    return above[cols]


	def phoneme_edit_distance(asr_phonemes: List[str], ref_phonemes: List[str]) -> float:
	    """
	    Return the edit distance between two phoneme sequences, scaled to [0, 1].

	    The raw Levenshtein distance is divided by the length of the longer
	    sequence.

	    Args:
	        asr_phonemes: ASR output phoneme sequence
	        ref_phonemes: Reference phoneme sequence

	    Returns:
	        0.0 for identical sequences, up to 1.0 for completely different ones.
	        If either sequence is empty the result is 1.0, so an empty segment
	        never counts as a match.
	    """
	    if asr_phonemes and ref_phonemes:
	        longest = max(len(asr_phonemes), len(ref_phonemes))
	        return levenshtein_distance(asr_phonemes, ref_phonemes) / longest

	    return 1.0


	# =============================================================================
	# Special Segment Detection
	# =============================================================================

	def detect_special_segments(
	    phoneme_texts: List[List[str]],
	    vad_segments: List,
	    segment_audios: List,
	) -> Tuple[List, List, List[Tuple[str, float, str]], int]:
	    """
	    Detect leading special segments (Takbir, Isti'adha, Basmala) using phoneme edit distance.

	    Detection order:
	    0. Try Takbir on segment 0 -> on a match, the steps below look at segment 1
	    1. Try COMBINED (Isti'adha + Basmala) on the check segment -> one combined entry
	    2. Else try Isti'adha on the check segment -> if match, try Basmala on the next one
	    3. Else try Basmala on the check segment
	    4. Else no specials (beyond Takbir, if any)

	    When get_debug_collector() returns a collector, every candidate, every
	    detection and the resulting first_quran_idx are reported to it.

	    Args:
	        phoneme_texts: List of phoneme lists from ASR, one per segment
	        vad_segments: List of VAD segments (returned unchanged)
	        segment_audios: List of audio arrays (returned unchanged)

	    Returns:
	        (vad_segments, segment_audios, special_results, first_quran_idx)

	        special_results: List of tuples (matched_text, score, ref) for compatibility
	        first_quran_idx: Index where Quran segments start (after specials)
	    """
	    # Nothing to inspect: hand the inputs straight back.
	    if not phoneme_texts or not vad_segments or not segment_audios:
	        return vad_segments, segment_audios, [], 0

	    special_results: List[Tuple[str, float, str]] = []

	    # ==========================================================================
	    # 0. Check segment 0 for Takbir (recitation opener before Isti'adha/Basmala)
	    # ==========================================================================
	    takbir_offset = 0
	    seg0_phonemes = phoneme_texts[0] if phoneme_texts[0] else []
	    takbir_name, takbir_conf = detect_transition_segment(seg0_phonemes, allowed={"Takbir"})
	    _dc = get_debug_collector()

	    def _finish(first_quran_idx: int):
	        # Single exit point so every detection path records first_quran_idx
	        # on the debug collector before returning.
	        if _dc is not None:
	            _dc.specials["first_quran_idx"] = first_quran_idx
	        return vad_segments, segment_audios, special_results, first_quran_idx

	    if takbir_name:
	        print(f"[SPECIAL] Takbir detected on segment 0 (conf={takbir_conf:.2f})")
	        if _dc is not None:
	            _dc.add_special_candidate(0, "Takbir", 1.0 - takbir_conf,
	                                      MAX_TRANSITION_EDIT_DISTANCE, True)
	            _dc.add_special_detected(0, "Takbir", takbir_conf)
	        special_results.append((TRANSITION_TEXT["Takbir"], takbir_conf, "Takbir"))
	        takbir_offset = 1
	        if len(phoneme_texts) <= 1:
	            # Takbir was the only segment: nothing left to check.
	            return _finish(takbir_offset)
	        # Re-point to the next segment for Isti'adha/Basmala detection
	        seg0_phonemes = phoneme_texts[1] if phoneme_texts[1] else []

	    # seg0_phonemes now points to the first non-Takbir segment
	    # (segment 0 if no Takbir, segment 1 if Takbir detected)
	    check_idx = takbir_offset  # Index into phoneme_texts for Isti'adha/Basmala detection

	    # ==========================================================================
	    # 1. Try COMBINED (Isti'adha + Basmala in one segment)
	    # ==========================================================================
	    combined_dist = phoneme_edit_distance(seg0_phonemes, COMBINED_PHONEMES)

	    if _dc is not None:
	        _dc.add_special_candidate(check_idx, "Combined Isti'adha+Basmala",
	                                  combined_dist, MAX_SPECIAL_EDIT_DISTANCE,
	                                  combined_dist <= MAX_SPECIAL_EDIT_DISTANCE)

	    if combined_dist <= MAX_SPECIAL_EDIT_DISTANCE:
	        print(f"[SPECIAL] Combined Isti'adha+Basmala detected (dist={combined_dist:.2f})")

	        # Return as a single combined entry — post-processing will split via MFA
	        confidence = 1.0 - combined_dist
	        combined_text = SPECIAL_TEXT["Isti'adha"] + " ۝ " + SPECIAL_TEXT["Basmala"]
	        special_results.append(
	            (combined_text, confidence, "Isti'adha+Basmala")
	        )

	        if _dc is not None:
	            _dc.add_special_detected(check_idx, "Isti'adha+Basmala", confidence)

	        return _finish(takbir_offset + 1)

	    # ==========================================================================
	    # 2. Try Isti'adha on the check segment
	    # ==========================================================================
	    istiadha_dist = phoneme_edit_distance(seg0_phonemes, SPECIAL_PHONEMES["Isti'adha"])

	    if _dc is not None:
	        _dc.add_special_candidate(check_idx, "Isti'adha", istiadha_dist,
	                                  MAX_SPECIAL_EDIT_DISTANCE,
	                                  istiadha_dist <= MAX_SPECIAL_EDIT_DISTANCE)

	    if istiadha_dist <= MAX_SPECIAL_EDIT_DISTANCE:
	        print(f"[SPECIAL] Isti'adha detected on segment {check_idx} (dist={istiadha_dist:.2f})")
	        special_results.append(
	            (SPECIAL_TEXT["Isti'adha"], 1.0 - istiadha_dist, "Isti'adha")
	        )
	        if _dc is not None:
	            _dc.add_special_detected(check_idx, "Isti'adha", 1.0 - istiadha_dist)

	        # Try Basmala on the next segment
	        next_idx = check_idx + 1
	        if next_idx < len(phoneme_texts) and phoneme_texts[next_idx]:
	            seg1_phonemes = phoneme_texts[next_idx]
	            basmala_dist = phoneme_edit_distance(seg1_phonemes, SPECIAL_PHONEMES["Basmala"])

	            if _dc is not None:
	                _dc.add_special_candidate(next_idx, "Basmala", basmala_dist,
	                                          MAX_SPECIAL_EDIT_DISTANCE,
	                                          basmala_dist <= MAX_SPECIAL_EDIT_DISTANCE)

	            if basmala_dist <= MAX_SPECIAL_EDIT_DISTANCE:
	                print(f"[SPECIAL] Basmala detected on segment {next_idx} (dist={basmala_dist:.2f})")
	                special_results.append(
	                    (SPECIAL_TEXT["Basmala"], 1.0 - basmala_dist, "Basmala")
	                )
	                if _dc is not None:
	                    _dc.add_special_detected(next_idx, "Basmala", 1.0 - basmala_dist)
	                return _finish(takbir_offset + 2)

	            print(f"[SPECIAL] No Basmala on segment {next_idx} (dist={basmala_dist:.2f})")

	        # Isti'adha alone (no usable or matching next segment).
	        return _finish(takbir_offset + 1)

	    # ==========================================================================
	    # 3. Try Basmala on the check segment
	    # ==========================================================================
	    basmala_dist = phoneme_edit_distance(seg0_phonemes, SPECIAL_PHONEMES["Basmala"])

	    if _dc is not None:
	        _dc.add_special_candidate(check_idx, "Basmala", basmala_dist,
	                                  MAX_SPECIAL_EDIT_DISTANCE,
	                                  basmala_dist <= MAX_SPECIAL_EDIT_DISTANCE)

	    if basmala_dist <= MAX_SPECIAL_EDIT_DISTANCE:
	        print(f"[SPECIAL] Basmala detected on segment {check_idx} (dist={basmala_dist:.2f})")
	        special_results.append(
	            (SPECIAL_TEXT["Basmala"], 1.0 - basmala_dist, "Basmala")
	        )
	        if _dc is not None:
	            _dc.add_special_detected(check_idx, "Basmala", 1.0 - basmala_dist)
	        return _finish(takbir_offset + 1)

	    # ==========================================================================
	    # 4. No specials detected (beyond Takbir if any)
	    # ==========================================================================
	    if takbir_offset > 0:
	        print(f"[SPECIAL] Only Takbir detected, no Isti'adha/Basmala "
	              f"(istiadha={istiadha_dist:.2f}, basmala={basmala_dist:.2f})")
	    else:
	        print(f"[SPECIAL] No special segments detected "
	              f"(istiadha={istiadha_dist:.2f}, basmala={basmala_dist:.2f})")

	    # Without Takbir, special_results is still empty and the index is 0.
	    return _finish(takbir_offset)


	def detect_inter_chapter_specials(
	    phoneme_texts: List[List[str]],
	) -> Tuple[List[Tuple[str, float, str]], int]:
	    """
	    Detect Isti'adha/Basmala at a chapter boundary from phonemes alone.

	    Candidates are tried in this order, and the first match decides the result:
	    1. Isti'adha + Basmala together in segment 0
	    2. Isti'adha in segment 0, optionally followed by Basmala in segment 1
	    3. Basmala in segment 0
	    4. Nothing

	    Args:
	        phoneme_texts: Phoneme lists for the segments starting at the boundary

	    Returns:
	        (special_results, num_consumed)
	        special_results: List of (matched_text, score, ref) tuples
	        num_consumed: Number of leading segments consumed as specials
	    """
	    first_segment = phoneme_texts[0] if phoneme_texts else None
	    if not first_segment:
	        return [], 0

	    threshold = MAX_SPECIAL_EDIT_DISTANCE

	    # 1. Both openers recited inside one segment
	    both_dist = phoneme_edit_distance(first_segment, COMBINED_PHONEMES)
	    if both_dist <= threshold:
	        print(f"[INTER-CHAPTER] Combined Isti'adha+Basmala detected (dist={both_dist:.2f})")
	        joined_text = " ۝ ".join((SPECIAL_TEXT["Isti'adha"], SPECIAL_TEXT["Basmala"]))
	        return [(joined_text, 1.0 - both_dist, "Isti'adha+Basmala")], 1

	    # 2. Isti'adha first, then possibly Basmala in the following segment
	    istiadha_dist = phoneme_edit_distance(first_segment, SPECIAL_PHONEMES["Isti'adha"])
	    if istiadha_dist <= threshold:
	        print(f"[INTER-CHAPTER] Isti'adha detected (dist={istiadha_dist:.2f})")
	        found = [(SPECIAL_TEXT["Isti'adha"], 1.0 - istiadha_dist, "Isti'adha")]

	        second_segment = phoneme_texts[1] if len(phoneme_texts) >= 2 else None
	        if second_segment:
	            follow_dist = phoneme_edit_distance(second_segment, SPECIAL_PHONEMES["Basmala"])
	            if follow_dist <= threshold:
	                print(f"[INTER-CHAPTER] Basmala detected on next segment (dist={follow_dist:.2f})")
	                found.append((SPECIAL_TEXT["Basmala"], 1.0 - follow_dist, "Basmala"))
	            else:
	                print(f"[INTER-CHAPTER] No Basmala on next segment (dist={follow_dist:.2f})")

	        # One segment is consumed per detected opener.
	        return found, len(found)

	    # 3. Basmala alone in segment 0
	    basmala_dist = phoneme_edit_distance(first_segment, SPECIAL_PHONEMES["Basmala"])
	    if basmala_dist <= threshold:
	        print(f"[INTER-CHAPTER] Basmala detected (dist={basmala_dist:.2f})")
	        return [(SPECIAL_TEXT["Basmala"], 1.0 - basmala_dist, "Basmala")], 1

	    # 4. Nothing matched
	    print(f"[INTER-CHAPTER] No special segments detected "
	          f"(istiadha={istiadha_dist:.2f}, basmala={basmala_dist:.2f})")
	    return [], 0


	# =============================================================================
	# Transition Segment Detection
	# =============================================================================

	# Variant keys of TRANSITION_PHONEMES mapped to the base name they are
	# reported under; keys not listed here are already base names.
	_TRANSITION_BASE_NAMES = {
	    "Takbir_double": "Takbir",
	    "Tahmeed_combined": "Tahmeed",
	    "Tahmeed_response": "Tahmeed",
	}


	def detect_transition_segment(
	    asr_phonemes: List[str],
	    allowed: Optional[set] = None,
	) -> Tuple[Optional[str], float]:
	    """Find the transition phrase closest to a segment, if close enough.

	    The segment is scored against every TRANSITION_PHONEMES entry by
	    normalized edit distance, and the lowest score wins (the earliest entry
	    on a tie). Variant entries such as Takbir_double take part in matching
	    but are reported under their base name (Takbir).

	    Args:
	        asr_phonemes: ASR output phoneme sequence for one segment
	        allowed: Optional set of base names to restrict detection to
	            (e.g. {"Amin"} to only check Amin)

	    Returns:
	        (name, confidence) with confidence = 1 - normalized_edit_distance
	        when the best distance is within MAX_TRANSITION_EDIT_DISTANCE;
	        otherwise (None, 0.0).
	    """
	    if not asr_phonemes:
	        return None, 0.0

	    winner = None
	    lowest = float("inf")

	    for variant, reference in TRANSITION_PHONEMES.items():
	        display_name = _TRANSITION_BASE_NAMES.get(variant, variant)
	        if allowed is None or display_name in allowed:
	            candidate = phoneme_edit_distance(asr_phonemes, reference)
	            # Strict comparison: an earlier entry keeps the win on a tie.
	            if candidate < lowest:
	                winner, lowest = display_name, candidate

	    if winner is None or lowest > MAX_TRANSITION_EDIT_DISTANCE:
	        return None, 0.0

	    return winner, 1.0 - lowest