Spaces:
Running on Zero
Running on Zero
| """ | |
| Phoneme-based special segment detection for Basmala and Isti'adha. | |
| These are common recitation openers that need special handling: | |
| - Isti'adha: "أَعُوذُ بِٱللَّهِ مِنَ الشَّيْطَانِ الرَّجِيم" (I seek refuge in Allah) | |
| - Basmala: "بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيم" (In the name of Allah) | |
| Detection uses phoneme edit distance for robustness against ASR errors. | |
| """ | |
| from __future__ import annotations | |
| from typing import List, Tuple, Optional | |
| # ============================================================================= | |
| # Constants | |
| # ============================================================================= | |
| from config import MAX_SPECIAL_EDIT_DISTANCE, MAX_TRANSITION_EDIT_DISTANCE | |
| from src.core.debug_collector import get_debug_collector | |
# Reference phoneme sequences for the two recitation openers.
# Each sequence is written as a space-separated string and split into a list
# of phoneme tokens (multi-character tokens such as "rˤrˤ" stay intact).
SPECIAL_PHONEMES = {
    "Isti'adha": (
        "ʔ a ʕ u: ð u b i ll a: h i "
        "m i n a ʃʃ a j tˤ aˤ: n i "
        "rˤrˤ aˤ ʒ i: m"
    ).split(),
    "Basmala": (
        "b i s m i ll a: h i rˤrˤ aˤ "
        "ħ m a: n i rˤrˤ aˤ ħ i: m"
    ).split(),
}

# Combined = Isti'adha followed by Basmala (for detecting both in one segment)
COMBINED_PHONEMES = SPECIAL_PHONEMES["Isti'adha"] + SPECIAL_PHONEMES["Basmala"]

# Arabic text for display, keyed by the same names as SPECIAL_PHONEMES
SPECIAL_TEXT = {
    "Isti'adha": "أَعُوذُ بِٱللَّهِ مِنَ الشَّيْطَانِ الرَّجِيم",
    "Basmala": "بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيم",
}
# Transition phoneme sequences (non-Quranic phrases within recitations).
# Keys with a suffix (e.g. "Takbir_double") are matching variants of the
# un-suffixed phrase. Each sequence is a space-separated string split into
# phoneme tokens; insertion order of the keys is kept as-is.
TRANSITION_PHONEMES = {
    "Amin": "ʔ a: m i: n".split(),
    "Takbir": (
        "ʔ a lˤlˤ aˤ: h u "
        "ʔ a k b a rˤ"
    ).split(),
    # The Takbir phrase said twice in a row
    "Takbir_double": (
        "ʔ a lˤlˤ aˤ: h u "
        "ʔ a k b a rˤ "
        "ʔ a lˤlˤ aˤ: h u "
        "ʔ a k b a rˤ"
    ).split(),
    "Tahmeed": (
        "s a m i ʕ a "
        "lˤlˤ aˤ: h u "
        "l i m a n "
        "ħ a m i d a h"
    ).split(),
    # Tahmeed, a linking "u", then the Tahmeed_response sequence
    "Tahmeed_combined": (
        "s a m i ʕ a "
        "lˤlˤ aˤ: h u "
        "l i m a n "
        "ħ a m i d a h u "
        "rˤ aˤ bb a n a: "
        "w a l a k a "
        "l ħ a m d"
    ).split(),
    "Tahmeed_response": (
        "rˤ aˤ bb a n a: "
        "w a l a k a "
        "l ħ a m d"
    ).split(),
    "Tasleem": (
        "ʔ a ss a l a m u "
        "ʕ a l a j k u m "
        "w a rˤ aˤ ħ m a t u lˤlˤ aˤ: h"
    ).split(),
    "Sadaqa": (
        "sˤ aˤ d a q aˤ "
        "lˤlˤ aˤ: h u "
        "l ʕ a ðˤ i: m"
    ).split(),
}

# Arabic display text, keyed by base transition name (no variant suffixes)
TRANSITION_TEXT = {
    "Amin": "آمِين",
    "Takbir": "اللَّهُ أَكْبَر",
    "Tahmeed": "سَمِعَ اللَّهُ لِمَنْ حَمِدَه",
    "Tasleem": "ٱلسَّلَامُ عَلَيْكُمْ وَرَحْمَةُ ٱللَّه",
    "Sadaqa": "صَدَقَ ٱللَّهُ ٱلْعَظِيم",
}

# All special segment reference names (for unified rendering)
ALL_SPECIAL_REFS = {
    "Basmala",
    "Isti'adha",
    "Isti'adha+Basmala",
    "Amin",
    "Takbir",
    "Tahmeed",
    "Tasleem",
    "Sadaqa",
}
| # ============================================================================= | |
| # Levenshtein Distance | |
| # ============================================================================= | |
def levenshtein_distance(seq1: List[str], seq2: List[str]) -> int:
    """
    Return the Levenshtein edit distance between two token sequences.

    Tokens are compared with ``==``; every insertion, deletion or
    substitution costs 1.

    Args:
        seq1: First sequence (list of phonemes)
        seq2: Second sequence (list of phonemes)

    Returns:
        Minimum number of single-token edits turning seq1 into seq2.
    """
    rows, cols = len(seq1), len(seq2)

    # If either side is empty the distance is the length of the other side.
    if rows == 0 or cols == 0:
        return max(rows, cols)

    # Only the previous DP row is needed to compute the next one.
    previous_row = list(range(cols + 1))
    for row_idx, left in enumerate(seq1, start=1):
        current_row = [row_idx]
        for col_idx, right in enumerate(seq2, start=1):
            if left == right:
                # Matching tokens: carry the diagonal cost over unchanged.
                cost = previous_row[col_idx - 1]
            else:
                cost = 1 + min(
                    previous_row[col_idx],      # deletion
                    current_row[col_idx - 1],   # insertion
                    previous_row[col_idx - 1],  # substitution
                )
            current_row.append(cost)
        previous_row = current_row

    return previous_row[cols]
def phoneme_edit_distance(asr_phonemes: List[str], ref_phonemes: List[str]) -> float:
    """
    Return the length-normalized edit distance between two phoneme sequences.

    The raw Levenshtein distance is divided by the length of the longer
    sequence. If either sequence is empty the result is 1.0 (treated as a
    complete mismatch), including when both are empty.

    Args:
        asr_phonemes: ASR output phoneme sequence
        ref_phonemes: Reference phoneme sequence

    Returns:
        Normalized edit distance (0.0 = identical, 1.0 = completely different)
    """
    if asr_phonemes and ref_phonemes:
        longest = max(len(asr_phonemes), len(ref_phonemes))
        return levenshtein_distance(asr_phonemes, ref_phonemes) / longest
    # Nothing to compare against on at least one side.
    return 1.0
| # ============================================================================= | |
| # Special Segment Detection | |
| # ============================================================================= | |
def detect_special_segments(
    phoneme_texts: List[List[str]],
    vad_segments: List,
    segment_audios: List,
) -> Tuple[List, List, List[Tuple[str, float, str]], int]:
    """
    Detect leading special segments (Takbir, Isti'adha, Basmala) by phoneme
    edit distance.

    Detection order:
        0. Try Takbir on segment 0. If it matches and there are no further
           segments, stop; otherwise continue the checks on segment 1.
        1. Try COMBINED (Isti'adha + Basmala) on the check segment. A match
           is reported as a single "Isti'adha+Basmala" entry.
        2. Else try Isti'adha on the check segment; if it matches, also try
           Basmala on the following segment.
        3. Else try Basmala on the check segment.
        4. Else no Isti'adha/Basmala (a Takbir from step 0 is still reported).

    When a debug collector is available, every candidate, every accepted
    match and the resulting ``first_quran_idx`` are recorded on it.

    Args:
        phoneme_texts: List of phoneme lists from ASR, one per segment
        vad_segments: List of VAD segments (returned unchanged)
        segment_audios: List of audio arrays (returned unchanged)

    Returns:
        (vad_segments, segment_audios, special_results, first_quran_idx)
        special_results: List of (matched_text, score, ref) tuples, where
            score = 1 - normalized edit distance
        first_quran_idx: Index where Quran segments start (after specials)
    """
    if not phoneme_texts or not vad_segments or not segment_audios:
        return vad_segments, segment_audios, [], 0

    special_results: List[Tuple[str, float, str]] = []

    # ----------------------------------------------------------------------
    # 0. Check segment 0 for Takbir (recitation opener before Isti'adha/Basmala)
    # ----------------------------------------------------------------------
    takbir_offset = 0
    seg_phonemes = phoneme_texts[0] or []
    takbir_name, takbir_conf = detect_transition_segment(seg_phonemes, allowed={"Takbir"})
    _dc = get_debug_collector()

    def _is_match(idx, label, dist):
        """Log the candidate on the debug collector; return whether it passes."""
        matched = dist <= MAX_SPECIAL_EDIT_DISTANCE
        if _dc is not None:
            _dc.add_special_candidate(idx, label, dist,
                                      MAX_SPECIAL_EDIT_DISTANCE, matched)
        return matched

    def _accept(idx, ref, text, confidence):
        """Append an accepted special to the results and log it."""
        special_results.append((text, confidence, ref))
        if _dc is not None:
            _dc.add_special_detected(idx, ref, confidence)

    def _finish(first_quran_idx):
        """Record first_quran_idx on the debug collector and build the result."""
        if _dc is not None:
            _dc.specials["first_quran_idx"] = first_quran_idx
        return vad_segments, segment_audios, special_results, first_quran_idx

    if takbir_name:
        print(f"[SPECIAL] Takbir detected on segment 0 (conf={takbir_conf:.2f})")
        if _dc is not None:
            _dc.add_special_candidate(0, "Takbir", 1.0 - takbir_conf,
                                      MAX_TRANSITION_EDIT_DISTANCE, True)
            _dc.add_special_detected(0, "Takbir", takbir_conf)
        special_results.append((TRANSITION_TEXT["Takbir"], takbir_conf, "Takbir"))
        takbir_offset = 1
        if len(phoneme_texts) <= 1:
            # Takbir was the only segment. first_quran_idx is recorded here
            # as on every other exit path.
            return _finish(takbir_offset)
        # Re-point to the next segment for Isti'adha/Basmala detection
        seg_phonemes = phoneme_texts[1] or []

    # Index of the first non-Takbir segment (0 without Takbir, 1 with Takbir)
    check_idx = takbir_offset

    # ----------------------------------------------------------------------
    # 1. Try COMBINED (Isti'adha + Basmala in one segment)
    # ----------------------------------------------------------------------
    combined_dist = phoneme_edit_distance(seg_phonemes, COMBINED_PHONEMES)
    if _is_match(check_idx, "Combined Isti'adha+Basmala", combined_dist):
        print(f"[SPECIAL] Combined Isti'adha+Basmala detected (dist={combined_dist:.2f})")
        # Returned as a single combined entry — post-processing will split via MFA
        combined_text = SPECIAL_TEXT["Isti'adha"] + " " + SPECIAL_TEXT["Basmala"]
        _accept(check_idx, "Isti'adha+Basmala", combined_text, 1.0 - combined_dist)
        return _finish(takbir_offset + 1)

    # ----------------------------------------------------------------------
    # 2. Try Isti'adha on the check segment
    # ----------------------------------------------------------------------
    istiadha_dist = phoneme_edit_distance(seg_phonemes, SPECIAL_PHONEMES["Isti'adha"])
    if _is_match(check_idx, "Isti'adha", istiadha_dist):
        print(f"[SPECIAL] Isti'adha detected on segment {check_idx} (dist={istiadha_dist:.2f})")
        _accept(check_idx, "Isti'adha", SPECIAL_TEXT["Isti'adha"], 1.0 - istiadha_dist)

        # Try Basmala on the next segment (skipped if missing or empty)
        next_idx = check_idx + 1
        if next_idx < len(phoneme_texts) and phoneme_texts[next_idx]:
            basmala_dist = phoneme_edit_distance(
                phoneme_texts[next_idx], SPECIAL_PHONEMES["Basmala"]
            )
            if _is_match(next_idx, "Basmala", basmala_dist):
                print(f"[SPECIAL] Basmala detected on segment {next_idx} (dist={basmala_dist:.2f})")
                _accept(next_idx, "Basmala", SPECIAL_TEXT["Basmala"], 1.0 - basmala_dist)
                return _finish(takbir_offset + 2)
            print(f"[SPECIAL] No Basmala on segment {next_idx} (dist={basmala_dist:.2f})")
        return _finish(takbir_offset + 1)

    # ----------------------------------------------------------------------
    # 3. Try Basmala on the check segment
    # ----------------------------------------------------------------------
    basmala_dist = phoneme_edit_distance(seg_phonemes, SPECIAL_PHONEMES["Basmala"])
    if _is_match(check_idx, "Basmala", basmala_dist):
        print(f"[SPECIAL] Basmala detected on segment {check_idx} (dist={basmala_dist:.2f})")
        _accept(check_idx, "Basmala", SPECIAL_TEXT["Basmala"], 1.0 - basmala_dist)
        return _finish(takbir_offset + 1)

    # ----------------------------------------------------------------------
    # 4. No specials detected (beyond Takbir if any)
    # ----------------------------------------------------------------------
    if takbir_offset > 0:
        print(f"[SPECIAL] Only Takbir detected, no Isti'adha/Basmala "
              f"(istiadha={istiadha_dist:.2f}, basmala={basmala_dist:.2f})")
    else:
        print(f"[SPECIAL] No special segments detected "
              f"(istiadha={istiadha_dist:.2f}, basmala={basmala_dist:.2f})")
    return _finish(takbir_offset)
def detect_inter_chapter_specials(
    phoneme_texts: List[List[str]],
) -> Tuple[List[Tuple[str, float, str]], int]:
    """
    Detect special segments between chapters from phonemes alone.

    Checks are made in this order and stop at the first match:
        1. COMBINED (Isti'adha + Basmala) on segment 0
        2. Isti'adha on segment 0, then Basmala on segment 1 if present
        3. Basmala on segment 0
        4. Nothing detected

    Args:
        phoneme_texts: List of phoneme lists, one per segment

    Returns:
        (special_results, num_consumed)
        special_results: List of (matched_text, score, ref) tuples, where
            score = 1 - normalized edit distance
        num_consumed: Number of leading segments consumed as specials
    """
    if not (phoneme_texts and phoneme_texts[0]):
        return [], 0

    first_segment = phoneme_texts[0]
    istiadha_text = SPECIAL_TEXT["Isti'adha"]
    basmala_text = SPECIAL_TEXT["Basmala"]
    basmala_ref = SPECIAL_PHONEMES["Basmala"]

    # 1. Both openers recited in a single segment
    combined_dist = phoneme_edit_distance(first_segment, COMBINED_PHONEMES)
    if combined_dist <= MAX_SPECIAL_EDIT_DISTANCE:
        print(f"[INTER-CHAPTER] Combined Isti'adha+Basmala detected (dist={combined_dist:.2f})")
        entry = (istiadha_text + " " + basmala_text, 1.0 - combined_dist, "Isti'adha+Basmala")
        return [entry], 1

    # 2. Isti'adha alone, optionally followed by Basmala in the next segment
    istiadha_dist = phoneme_edit_distance(first_segment, SPECIAL_PHONEMES["Isti'adha"])
    if istiadha_dist <= MAX_SPECIAL_EDIT_DISTANCE:
        print(f"[INTER-CHAPTER] Isti'adha detected (dist={istiadha_dist:.2f})")
        found = [(istiadha_text, 1.0 - istiadha_dist, "Isti'adha")]
        if len(phoneme_texts) < 2 or not phoneme_texts[1]:
            return found, 1

        basmala_dist = phoneme_edit_distance(phoneme_texts[1], basmala_ref)
        if basmala_dist > MAX_SPECIAL_EDIT_DISTANCE:
            print(f"[INTER-CHAPTER] No Basmala on next segment (dist={basmala_dist:.2f})")
            return found, 1

        print(f"[INTER-CHAPTER] Basmala detected on next segment (dist={basmala_dist:.2f})")
        found.append((basmala_text, 1.0 - basmala_dist, "Basmala"))
        return found, 2

    # 3. Basmala alone on the first segment
    basmala_dist = phoneme_edit_distance(first_segment, basmala_ref)
    if basmala_dist <= MAX_SPECIAL_EDIT_DISTANCE:
        print(f"[INTER-CHAPTER] Basmala detected (dist={basmala_dist:.2f})")
        return [(basmala_text, 1.0 - basmala_dist, "Basmala")], 1

    # 4. Nothing matched
    print(f"[INTER-CHAPTER] No special segments detected "
          f"(istiadha={istiadha_dist:.2f}, basmala={basmala_dist:.2f})")
    return [], 0
| # ============================================================================= | |
| # Transition Segment Detection | |
| # ============================================================================= | |
| # Mapping from variant names to their base/display name | |
| _TRANSITION_BASE_NAMES = { | |
| "Takbir_double": "Takbir", | |
| "Tahmeed_combined": "Tahmeed", | |
| "Tahmeed_response": "Tahmeed", | |
| } | |
| def detect_transition_segment( | |
| asr_phonemes: List[str], | |
| allowed: Optional[set] = None, | |
| ) -> Tuple[Optional[str], float]: | |
| """Best-match transition (lowest edit dist under threshold). | |
| Compares against all TRANSITION_PHONEMES entries. For entries with variant | |
| suffixes (e.g. Takbir_double), the returned name is the base name (Takbir) | |
| — variants only affect internal matching, not display. Best match = lowest | |
| normalized edit distance. | |
| Args: | |
| asr_phonemes: ASR output phoneme sequence for one segment | |
| allowed: Optional set of base names to restrict detection to | |
| (e.g. {"Amin"} to only check Amin) | |
| Returns: | |
| (name, confidence) where name is the base transition name or None, | |
| and confidence = 1 - normalized_edit_distance. | |
| """ | |
| if not asr_phonemes: | |
| return None, 0.0 | |
| best_name = None | |
| best_dist = float("inf") | |
| for key, ref_phonemes in TRANSITION_PHONEMES.items(): | |
| base_name = _TRANSITION_BASE_NAMES.get(key, key) | |
| if allowed is not None and base_name not in allowed: | |
| continue | |
| dist = phoneme_edit_distance(asr_phonemes, ref_phonemes) | |
| if dist < best_dist: | |
| best_dist = dist | |
| best_name = base_name | |
| if best_dist <= MAX_TRANSITION_EDIT_DISTANCE and best_name is not None: | |
| return best_name, 1.0 - best_dist | |
| return None, 0.0 | |