quranic-universal-aligner / src /alignment /special_segments.py
hetchyy's picture
Upload folder using huggingface_hub
602b5d3 verified
"""
Phoneme-based special segment detection for Basmala and Isti'adha.
These are common recitation openers that need special handling:
- Isti'adha: "أَعُوذُ بِٱللَّهِ مِنَ الشَّيْطَانِ الرَّجِيم" (I seek refuge in Allah)
- Basmala: "بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيم" (In the name of Allah)
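Transition phrases inside a recitation (Amin, Takbir, Tahmeed, Tasleem, Sadaqa)
are detected the same way.
Detection uses normalized phoneme edit distance for robustness against ASR errors.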
"""
from __future__ import annotations
from typing import List, Tuple, Optional
# =============================================================================
# Constants
# =============================================================================
from config import MAX_SPECIAL_EDIT_DISTANCE, MAX_TRANSITION_EDIT_DISTANCE
from src.core.debug_collector import get_debug_collector
# Reference phoneme sequences for the two recitation openers.
# Each sequence is written as one space-separated phoneme string per phrase
# chunk and tokenised with str.split(), yielding a plain list of phonemes.
SPECIAL_PHONEMES = {
    "Isti'adha": (
        "ʔ a ʕ u: ð u b i ll a: h i "
        "m i n a ʃʃ a j tˤ aˤ: n i "
        "rˤrˤ aˤ ʒ i: m"
    ).split(),
    "Basmala": (
        "b i s m i ll a: h i rˤrˤ aˤ "
        "ħ m a: n i rˤrˤ aˤ ħ i: m"
    ).split(),
}

# Isti'adha immediately followed by Basmala, for the case where both phrases
# land in a single segment.
COMBINED_PHONEMES = [*SPECIAL_PHONEMES["Isti'adha"], *SPECIAL_PHONEMES["Basmala"]]

# Arabic display text for each opener, keyed like SPECIAL_PHONEMES.
SPECIAL_TEXT = {
    "Isti'adha": "أَعُوذُ بِٱللَّهِ مِنَ الشَّيْطَانِ الرَّجِيم",
    "Basmala": "بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيم",
}
# Transition phoneme sequences (non-Quranic phrases within recitations).
# Keys with a "_suffix" are alternative renderings of the same phrase; they
# are matched separately and reported under their base name.
TRANSITION_PHONEMES = {
    "Amin": ["ʔ", "a:", "m", "i:", "n"],
    "Takbir": [
        "ʔ", "a", "lˤlˤ", "aˤ:", "h", "u",
        "ʔ", "a", "k", "b", "a", "rˤ",
    ],
    # Takbir said twice within one segment
    "Takbir_double": [
        "ʔ", "a", "lˤlˤ", "aˤ:", "h", "u",
        "ʔ", "a", "k", "b", "a", "rˤ",
        "ʔ", "a", "lˤlˤ", "aˤ:", "h", "u",
        "ʔ", "a", "k", "b", "a", "rˤ",
    ],
    "Tahmeed": [
        "s", "a", "m", "i", "ʕ", "a",
        "lˤlˤ", "aˤ:", "h", "u",
        "l", "i", "m", "a", "n",
        "ħ", "a", "m", "i", "d", "a", "h",
    ],
    # Tahmeed followed by its response within one segment (note the linking "u")
    "Tahmeed_combined": [
        "s", "a", "m", "i", "ʕ", "a",
        "lˤlˤ", "aˤ:", "h", "u",
        "l", "i", "m", "a", "n",
        "ħ", "a", "m", "i", "d", "a", "h", "u",
        "rˤ", "aˤ", "bb", "a", "n", "a:",
        "w", "a", "l", "a", "k", "a",
        "l", "ħ", "a", "m", "d",
    ],
    # The response on its own
    "Tahmeed_response": [
        "rˤ", "aˤ", "bb", "a", "n", "a:",
        "w", "a", "l", "a", "k", "a",
        "l", "ħ", "a", "m", "d",
    ],
    "Tasleem": [
        # "salām" carries a long vowel (alif after the lam), written "a:" like
        # every other long ā in these references.
        "ʔ", "a", "ss", "a", "l", "a:", "m", "u",
        "ʕ", "a", "l", "a", "j", "k", "u", "m",
        "w", "a", "rˤ", "aˤ", "ħ", "m", "a", "t", "u", "lˤlˤ", "aˤ:", "h",
    ],
    "Sadaqa": [
        "sˤ", "aˤ", "d", "a", "q", "aˤ",
        "lˤlˤ", "aˤ:", "h", "u",
        "l", "ʕ", "a", "ðˤ", "i:", "m",
    ],
}
# Arabic display text for each transition phrase, keyed by base name
TRANSITION_TEXT = {
    "Amin": "آمِين",
    "Takbir": "اللَّهُ أَكْبَر",
    "Tahmeed": "سَمِعَ اللَّهُ لِمَنْ حَمِدَه",
    "Tasleem": "ٱلسَّلَامُ عَلَيْكُمْ وَرَحْمَةُ ٱللَّه",
    "Sadaqa": "صَدَقَ ٱللَّهُ ٱلْعَظِيم",
}

# All special segment reference names (for unified rendering): the opener
# refs plus every transition phrase that has display text.
ALL_SPECIAL_REFS = {"Basmala", "Isti'adha", "Isti'adha+Basmala"} | set(TRANSITION_TEXT)
# =============================================================================
# Levenshtein Distance
# =============================================================================
def levenshtein_distance(seq1: List[str], seq2: List[str]) -> int:
    """
    Return the Levenshtein edit distance between two sequences.

    Every insertion, deletion and substitution costs 1; equal elements cost 0.

    Args:
        seq1: First sequence (list of phonemes)
        seq2: Second sequence (list of phonemes)

    Returns:
        Minimum number of single-element edits turning seq1 into seq2
    """
    rows, cols = len(seq1), len(seq2)
    # If either side is empty, the distance is the length of the other side.
    if rows == 0 or cols == 0:
        return max(rows, cols)

    # Only the previous DP row is needed to build the next one.
    above = list(range(cols + 1))
    for row, left in enumerate(seq1, start=1):
        current = [row]  # column 0: delete the first `row` elements of seq1
        for col, right in enumerate(seq2, start=1):
            if left == right:
                cost = above[col - 1]  # match: carry the diagonal forward
            else:
                cost = 1 + min(
                    above[col],        # deletion
                    current[col - 1],  # insertion
                    above[col - 1],    # substitution
                )
            current.append(cost)
        above = current
    return above[cols]
def phoneme_edit_distance(asr_phonemes: List[str], ref_phonemes: List[str]) -> float:
    """
    Return the edit distance between two phoneme sequences, scaled to [0, 1].

    The raw Levenshtein distance is divided by the length of the longer
    sequence. An empty sequence on either side is treated as a complete
    mismatch.

    Args:
        asr_phonemes: ASR output phoneme sequence
        ref_phonemes: Reference phoneme sequence

    Returns:
        Normalized edit distance (0.0 = identical, 1.0 = completely different)
    """
    if asr_phonemes and ref_phonemes:
        longest = max(len(asr_phonemes), len(ref_phonemes))
        return levenshtein_distance(asr_phonemes, ref_phonemes) / longest
    # Nothing to compare on at least one side: report the worst score.
    return 1.0
# =============================================================================
# Special Segment Detection
# =============================================================================
def detect_special_segments(
    phoneme_texts: List[List[str]],
    vad_segments: List,
    segment_audios: List,
) -> Tuple[List, List, List[Tuple[str, float, str]], int]:
    """
    Detect opening special segments (Takbir/Isti'adha/Basmala) using phoneme edit distance.

    Detection order:
        0. Try Takbir on segment 0 → if match, steps 1-3 run on segment 1 instead
        1. Try COMBINED (Isti'adha + Basmala) on the check segment → one combined entry
        2. Else try Isti'adha on the check segment → if match, try Basmala on the next one
        3. Else try Basmala on the check segment
        4. Else no specials (beyond Takbir, if any)

    The segment and audio lists are returned unchanged by this function.

    Args:
        phoneme_texts: List of phoneme lists from ASR, one per segment
        vad_segments: List of VAD segments (passed through as-is)
        segment_audios: List of audio arrays (passed through as-is)

    Returns:
        (vad_segments, segment_audios, special_results, first_quran_idx)
        special_results: List of tuples (matched_text, score, ref) for compatibility,
            where score = 1 - normalized edit distance
        first_quran_idx: Index where Quran segments start (after specials)
    """
    if not phoneme_texts or not vad_segments or not segment_audios:
        return vad_segments, segment_audios, [], 0

    special_results: List[Tuple[str, float, str]] = []

    # ==========================================================================
    # 0. Check segment 0 for Takbir (recitation opener before Isti'adha/Basmala)
    # ==========================================================================
    takbir_offset = 0
    check_phonemes = phoneme_texts[0] if phoneme_texts[0] else []
    takbir_name, takbir_conf = detect_transition_segment(check_phonemes, allowed={"Takbir"})
    _dc = get_debug_collector()

    def _finish(first_quran_idx: int):
        # Single exit helper: every return path records where the Quran
        # segments start in the debug collector (when one is active).
        if _dc is not None:
            _dc.specials["first_quran_idx"] = first_quran_idx
        return vad_segments, segment_audios, special_results, first_quran_idx

    if takbir_name:
        print(f"[SPECIAL] Takbir detected on segment 0 (conf={takbir_conf:.2f})")
        if _dc is not None:
            _dc.add_special_candidate(0, "Takbir", 1.0 - takbir_conf,
                                      MAX_TRANSITION_EDIT_DISTANCE, True)
            _dc.add_special_detected(0, "Takbir", takbir_conf)
        special_results.append((TRANSITION_TEXT["Takbir"], takbir_conf, "Takbir"))
        takbir_offset = 1
        if len(phoneme_texts) <= 1:
            # Takbir was the only segment: nothing left to check.
            return _finish(takbir_offset)
        # Re-point to the next segment for Isti'adha/Basmala detection
        check_phonemes = phoneme_texts[1] if phoneme_texts[1] else []

    # check_phonemes now holds the first non-Takbir segment
    # (segment 0 if no Takbir, segment 1 if Takbir detected)
    check_idx = takbir_offset  # Index into phoneme_texts for Isti'adha/Basmala detection

    # ==========================================================================
    # 1. Try COMBINED (Isti'adha + Basmala in one segment)
    # ==========================================================================
    combined_dist = phoneme_edit_distance(check_phonemes, COMBINED_PHONEMES)
    if _dc is not None:
        _dc.add_special_candidate(check_idx, "Combined Isti'adha+Basmala",
                                  combined_dist, MAX_SPECIAL_EDIT_DISTANCE,
                                  combined_dist <= MAX_SPECIAL_EDIT_DISTANCE)
    if combined_dist <= MAX_SPECIAL_EDIT_DISTANCE:
        print(f"[SPECIAL] Combined Isti'adha+Basmala detected (dist={combined_dist:.2f})")
        # Return as a single combined entry — post-processing will split via MFA
        confidence = 1.0 - combined_dist
        combined_text = SPECIAL_TEXT["Isti'adha"] + " ۝ " + SPECIAL_TEXT["Basmala"]
        special_results.append(
            (combined_text, confidence, "Isti'adha+Basmala")
        )
        if _dc is not None:
            _dc.add_special_detected(check_idx, "Isti'adha+Basmala", confidence)
        return _finish(takbir_offset + 1)

    # ==========================================================================
    # 2. Try Isti'adha on the check segment
    # ==========================================================================
    istiadha_dist = phoneme_edit_distance(check_phonemes, SPECIAL_PHONEMES["Isti'adha"])
    if _dc is not None:
        _dc.add_special_candidate(check_idx, "Isti'adha", istiadha_dist,
                                  MAX_SPECIAL_EDIT_DISTANCE,
                                  istiadha_dist <= MAX_SPECIAL_EDIT_DISTANCE)
    if istiadha_dist <= MAX_SPECIAL_EDIT_DISTANCE:
        print(f"[SPECIAL] Isti'adha detected on segment {check_idx} (dist={istiadha_dist:.2f})")
        special_results.append(
            (SPECIAL_TEXT["Isti'adha"], 1.0 - istiadha_dist, "Isti'adha")
        )
        if _dc is not None:
            _dc.add_special_detected(check_idx, "Isti'adha", 1.0 - istiadha_dist)

        # Try Basmala on the next segment
        next_idx = check_idx + 1
        if next_idx < len(phoneme_texts) and phoneme_texts[next_idx]:
            next_phonemes = phoneme_texts[next_idx]
            basmala_dist = phoneme_edit_distance(next_phonemes, SPECIAL_PHONEMES["Basmala"])
            if _dc is not None:
                _dc.add_special_candidate(next_idx, "Basmala", basmala_dist,
                                          MAX_SPECIAL_EDIT_DISTANCE,
                                          basmala_dist <= MAX_SPECIAL_EDIT_DISTANCE)
            if basmala_dist <= MAX_SPECIAL_EDIT_DISTANCE:
                print(f"[SPECIAL] Basmala detected on segment {next_idx} (dist={basmala_dist:.2f})")
                special_results.append(
                    (SPECIAL_TEXT["Basmala"], 1.0 - basmala_dist, "Basmala")
                )
                if _dc is not None:
                    _dc.add_special_detected(next_idx, "Basmala", 1.0 - basmala_dist)
                return _finish(takbir_offset + 2)
            print(f"[SPECIAL] No Basmala on segment {next_idx} (dist={basmala_dist:.2f})")
        # Isti'adha alone (no following segment, or it is not a Basmala)
        return _finish(takbir_offset + 1)

    # ==========================================================================
    # 3. Try Basmala on the check segment
    # ==========================================================================
    basmala_dist = phoneme_edit_distance(check_phonemes, SPECIAL_PHONEMES["Basmala"])
    if _dc is not None:
        _dc.add_special_candidate(check_idx, "Basmala", basmala_dist,
                                  MAX_SPECIAL_EDIT_DISTANCE,
                                  basmala_dist <= MAX_SPECIAL_EDIT_DISTANCE)
    if basmala_dist <= MAX_SPECIAL_EDIT_DISTANCE:
        print(f"[SPECIAL] Basmala detected on segment {check_idx} (dist={basmala_dist:.2f})")
        special_results.append(
            (SPECIAL_TEXT["Basmala"], 1.0 - basmala_dist, "Basmala")
        )
        if _dc is not None:
            _dc.add_special_detected(check_idx, "Basmala", 1.0 - basmala_dist)
        return _finish(takbir_offset + 1)

    # ==========================================================================
    # 4. No specials detected (beyond Takbir if any)
    # ==========================================================================
    if takbir_offset > 0:
        print(f"[SPECIAL] Only Takbir detected, no Isti'adha/Basmala "
              f"(istiadha={istiadha_dist:.2f}, basmala={basmala_dist:.2f})")
    else:
        print(f"[SPECIAL] No special segments detected "
              f"(istiadha={istiadha_dist:.2f}, basmala={basmala_dist:.2f})")
    # Without a Takbir, special_results is still empty and takbir_offset is 0.
    return _finish(takbir_offset)
def detect_inter_chapter_specials(
    phoneme_texts: List[List[str]],
) -> Tuple[List[Tuple[str, float, str]], int]:
    """
    Detect Isti'adha/Basmala at a chapter boundary from phonemes alone.

    Checks, in order, against the leading segment(s) of ``phoneme_texts``:
        1. Combined Isti'adha+Basmala on segment 0
        2. Isti'adha on segment 0, then optionally Basmala on segment 1
        3. Basmala on segment 0
        4. Nothing matched

    Args:
        phoneme_texts: Phoneme lists for the segments starting at the boundary

    Returns:
        (special_results, num_consumed)
        special_results: List of (matched_text, score, ref) tuples,
            score = 1 - normalized edit distance
        num_consumed: Number of segments consumed as specials
    """
    head = phoneme_texts[0] if phoneme_texts else None
    if not head:
        return [], 0

    threshold = MAX_SPECIAL_EDIT_DISTANCE

    # 1. Both phrases recited within one segment
    both_dist = phoneme_edit_distance(head, COMBINED_PHONEMES)
    if both_dist <= threshold:
        print(f"[INTER-CHAPTER] Combined Isti'adha+Basmala detected (dist={both_dist:.2f})")
        joined_text = f"{SPECIAL_TEXT[chr(73) + 'sti' + chr(39) + 'adha']} ۝ {SPECIAL_TEXT['Basmala']}"
        return [(joined_text, 1.0 - both_dist, "Isti'adha+Basmala")], 1

    # 2. Isti'adha first, possibly followed by a Basmala segment
    istiadha_dist = phoneme_edit_distance(head, SPECIAL_PHONEMES["Isti'adha"])
    if istiadha_dist <= threshold:
        print(f"[INTER-CHAPTER] Isti'adha detected (dist={istiadha_dist:.2f})")
        found = [(SPECIAL_TEXT["Isti'adha"], 1.0 - istiadha_dist, "Isti'adha")]

        follower = phoneme_texts[1] if len(phoneme_texts) >= 2 else None
        if not follower:
            return found, 1

        follower_dist = phoneme_edit_distance(follower, SPECIAL_PHONEMES["Basmala"])
        if follower_dist <= threshold:
            print(f"[INTER-CHAPTER] Basmala detected on next segment (dist={follower_dist:.2f})")
            found.append((SPECIAL_TEXT["Basmala"], 1.0 - follower_dist, "Basmala"))
            return found, 2

        print(f"[INTER-CHAPTER] No Basmala on next segment (dist={follower_dist:.2f})")
        return found, 1

    # 3. Basmala on its own
    basmala_dist = phoneme_edit_distance(head, SPECIAL_PHONEMES["Basmala"])
    if basmala_dist <= threshold:
        print(f"[INTER-CHAPTER] Basmala detected (dist={basmala_dist:.2f})")
        return [(SPECIAL_TEXT["Basmala"], 1.0 - basmala_dist, "Basmala")], 1

    # 4. Neither phrase matched
    print(f"[INTER-CHAPTER] No special segments detected "
          f"(istiadha={istiadha_dist:.2f}, basmala={basmala_dist:.2f})")
    return [], 0
# =============================================================================
# Transition Segment Detection
# =============================================================================
# Variant keys in TRANSITION_PHONEMES mapped to the base name they are
# reported under; keys absent from this mapping are their own base name.
_TRANSITION_BASE_NAMES = {
    "Takbir_double": "Takbir",
    "Tahmeed_combined": "Tahmeed",
    "Tahmeed_response": "Tahmeed",
}


def detect_transition_segment(
    asr_phonemes: List[str],
    allowed: Optional[set] = None,
) -> Tuple[Optional[str], float]:
    """Return the closest transition phrase for a segment, if close enough.

    Every TRANSITION_PHONEMES entry is scored by normalized edit distance
    against the segment. Variant entries (e.g. Takbir_double) are scored on
    their own but reported under their base name (Takbir). The lowest
    distance wins; on a tie the entry listed first wins.

    Args:
        asr_phonemes: ASR output phoneme sequence for one segment
        allowed: Optional set of base names to restrict detection to
            (e.g. {"Amin"} to only check Amin)

    Returns:
        (name, confidence) where name is the base transition name and
        confidence = 1 - normalized_edit_distance, or (None, 0.0) when the
        segment is empty, no entry is allowed, or the best distance exceeds
        MAX_TRANSITION_EDIT_DISTANCE.
    """
    if not asr_phonemes:
        return None, 0.0

    # Collect (distance, base name) for every candidate passing the filter.
    scored = []
    for variant, reference in TRANSITION_PHONEMES.items():
        label = _TRANSITION_BASE_NAMES.get(variant, variant)
        if allowed is None or label in allowed:
            scored.append((phoneme_edit_distance(asr_phonemes, reference), label))

    if not scored:
        return None, 0.0

    # min() keeps the first of equally-scored candidates.
    winning_dist, winner = min(scored, key=lambda pair: pair[0])
    if winning_dist <= MAX_TRANSITION_EDIT_DISTANCE:
        return winner, 1.0 - winning_dist
    return None, 0.0