diff --git "a/app.py" "b/app.py" new file mode 100644--- /dev/null +++ "b/app.py" @@ -0,0 +1,5191 @@ +""" +Quran Aligner — Automatic Quran recitation segmentation and alignment. +""" +import os +import sys +import json +import time +import unicodedata +from pathlib import Path + +# Add paths for imports BEFORE importing anything else +_app_path = Path(__file__).parent.resolve() +sys.path.insert(0, str(_app_path)) + +# Build Cython extensions in-place (falls back to pure Python if it fails) +import subprocess +subprocess.run( + [sys.executable, str(_app_path / "setup.py"), "build_ext", "--inplace"], + cwd=str(_app_path), + capture_output=True, +) + +import gradio as gr +import numpy as np +import librosa + +from config import ( + PORT, IS_HF_SPACE, + get_vad_duration, get_asr_duration, + MIN_SILENCE_MIN, MIN_SILENCE_MAX, MIN_SILENCE_STEP, + MIN_SPEECH_MIN, MIN_SPEECH_MAX, MIN_SPEECH_STEP, + PAD_MIN, PAD_MAX, PAD_STEP, + PRESET_MUJAWWAD, PRESET_MURATTAL, PRESET_FAST, + CONFIDENCE_HIGH, CONFIDENCE_MED, REVIEW_SUMMARY_MAX_SEGMENTS, + UNDERSEG_MIN_WORDS, UNDERSEG_MIN_AYAH_SPAN, UNDERSEG_MIN_DURATION, + QURAN_TEXT_SIZE_PX, ARABIC_WORD_SPACING, + SURAH_INFO_PATH, + PHONEME_ALIGNMENT_PROFILING, + RESAMPLE_TYPE, + SEGMENT_AUDIO_DIR, DELETE_CACHE_FREQUENCY, DELETE_CACHE_AGE, AUDIO_PRELOAD_COUNT, + ANIM_WORD_COLOR, ANIM_STYLE_ROW_SCALES, + ANIM_DISPLAY_MODES, ANIM_DISPLAY_MODE_DEFAULT, ANIM_OPACITY_PREV_DEFAULT, ANIM_OPACITY_AFTER_DEFAULT, ANIM_OPACITY_STEP, ANIM_PRESETS, + ANIM_GRANULARITIES, ANIM_GRANULARITY_DEFAULT, + ANIM_WINDOW_PREV_DEFAULT, ANIM_WINDOW_AFTER_DEFAULT, + ANIM_WINDOW_PREV_MIN, ANIM_WINDOW_PREV_MAX, + ANIM_WINDOW_AFTER_MIN, ANIM_WINDOW_AFTER_MAX, + MEGA_WORD_SPACING_MIN, MEGA_WORD_SPACING_MAX, MEGA_WORD_SPACING_STEP, MEGA_WORD_SPACING_DEFAULT, + MEGA_TEXT_SIZE_MIN, MEGA_TEXT_SIZE_MAX, MEGA_TEXT_SIZE_STEP, MEGA_TEXT_SIZE_DEFAULT, + MEGA_LINE_SPACING_MIN, MEGA_LINE_SPACING_MAX, MEGA_LINE_SPACING_STEP, MEGA_LINE_SPACING_DEFAULT, + MEGA_SURAH_LIGATURE_SIZE, + 
PROGRESS_PROCESS_AUDIO, PROGRESS_RESEGMENT, PROGRESS_RETRANSCRIBE, + MFA_SPACE_URL, MFA_TIMEOUT, MFA_PROGRESS_SEGMENT_RATE, + LEFT_COLUMN_SCALE, RIGHT_COLUMN_SCALE, +) +from src.zero_gpu import gpu_with_fallback, ZERO_GPU_AVAILABLE, is_quota_exhausted, is_user_forced_cpu, get_quota_reset_time +from src.segment_processor import ( + load_segmenter, + ensure_models_on_gpu, + detect_speech_segments, + run_phoneme_matching, + test_vad_aoti_export, + apply_aoti_compiled, + VadSegment, SegmentInfo, ProfilingData +) +from config import ANCHOR_SEGMENTS +from data.font_data import DIGITAL_KHATT_FONT_B64, SURAH_NAME_FONT_B64 + +# Load surah name ligature map +with open(Path(__file__).parent / "data" / "ligatures.json") as _f: + _SURAH_LIGATURES = json.load(_f) + + +# ============================================================================= +# GPU-decorated processing functions +# ============================================================================= + +def _combined_duration(audio, sample_rate, *_args, **_kwargs): + """Lease duration for VAD+ASR: sum of independent estimates.""" + minutes = len(audio) / sample_rate / 60 + model_name = _args[3] if len(_args) > 3 else _kwargs.get("model_name", "Base") + return get_vad_duration(minutes) + get_asr_duration(minutes, model_name) + +def _asr_only_duration(segment_audios, sample_rate, *_args, **_kwargs): + """Lease duration for standalone ASR.""" + minutes = sum(len(s) for s in segment_audios) / sample_rate / 60 + model_name = _args[0] if _args else _kwargs.get("model_name", "Base") + return get_asr_duration(minutes, model_name) + + +def _run_asr_core(segment_audios, sample_rate, model_name="Base"): + """Core ASR logic: load, move to GPU, transcribe. 
No GPU decorator.""" + from src.alignment.phoneme_asr import load_phoneme_asr, transcribe_batch + + t_gpu_start = time.time() + load_phoneme_asr(model_name) + t_move = time.time() + ensure_models_on_gpu(asr_model_name=model_name) + gpu_move_time = time.time() - t_move + print(f"[PHONEME ASR] GPU move: {gpu_move_time:.3f}s") + results, batch_profiling, sorting_time, batch_build_time = transcribe_batch(segment_audios, sample_rate, model_name) + gpu_time = time.time() - t_gpu_start + return results, batch_profiling, sorting_time, batch_build_time, gpu_move_time, gpu_time + + +@gpu_with_fallback(duration=_combined_duration) +def run_vad_and_asr_gpu(audio, sample_rate, min_silence_ms, min_speech_ms, pad_ms, model_name="Base"): + """Single GPU lease: VAD segmentation + Phoneme ASR.""" + t_gpu_start = time.time() + + # --- VAD phase --- + load_segmenter() + vad_move_time = ensure_models_on_gpu() + intervals, vad_profiling, raw_speech_intervals, raw_is_complete = detect_speech_segments(audio, sample_rate, min_silence_ms, min_speech_ms, pad_ms) + vad_profiling["model_move_time"] = vad_move_time + vad_gpu_time = time.time() - t_gpu_start + + if not intervals: + return (intervals, vad_profiling, vad_gpu_time, raw_speech_intervals, raw_is_complete, + None, None, None, None, 0.0, 0.0) + + # --- ASR phase --- + segment_audios = [audio[int(s * sample_rate):int(e * sample_rate)] for s, e in intervals] + asr_results = _run_asr_core(segment_audios, sample_rate, model_name) + + return (intervals, vad_profiling, vad_gpu_time, raw_speech_intervals, raw_is_complete, *asr_results) + + +@gpu_with_fallback(duration=_asr_only_duration) +def run_phoneme_asr_gpu(segment_audios, sample_rate, model_name="Base"): + """Standalone ASR GPU lease (used by resegment/retranscribe paths).""" + return _run_asr_core(segment_audios, sample_rate, model_name) + + +@gpu_with_fallback(duration=lambda: 300) # 5 min lease for compilation test +def test_aoti_compilation_gpu(): + """ + Test AoT compilation for 
VAD model on GPU. + Called at startup to verify torch.export works. + """ + load_segmenter() + ensure_models_on_gpu() + return test_vad_aoti_export() + + +# ============================================================================= +# Segment rendering +# ============================================================================= + +def format_timestamp(seconds: float) -> str: + """Format seconds as MM:SS.ms""" + minutes = int(seconds // 60) + secs = seconds % 60 + return f"{minutes}:{secs:04.1f}" + + +def get_confidence_class(score: float) -> str: + """Get CSS class based on confidence score.""" + if score >= CONFIDENCE_HIGH: + return "segment-high" + elif score >= CONFIDENCE_MED: + return "segment-med" + else: + return "segment-low" + + +def get_segment_word_stats(matched_ref: str) -> tuple[int, int]: + """Return (word_count, ayah_span) for a matched ref. (0, 1) if unparseable.""" + if not matched_ref or "-" not in matched_ref: + return 0, 1 + try: + start_ref, end_ref = matched_ref.split("-", 1) + start_parts = start_ref.split(":") + end_parts = end_ref.split(":") + if len(start_parts) < 3 or len(end_parts) < 3: + return 0, 1 + + # Ayah span + start_ayah = (int(start_parts[0]), int(start_parts[1])) + end_ayah = (int(end_parts[0]), int(end_parts[1])) + ayah_span = 1 + if start_ayah != end_ayah: + ayah_span = abs(end_ayah[1] - start_ayah[1]) + 1 if start_ayah[0] == end_ayah[0] else 2 + + # Word count via index + word_count = 0 + from src.quran_index import get_quran_index + index = get_quran_index() + indices = index.ref_to_indices(matched_ref) + if indices: + word_count = indices[1] - indices[0] + 1 + + return word_count, ayah_span + except Exception: + return 0, 1 + + +def check_undersegmented(matched_ref: str, duration: float) -> bool: + """Check if a segment is potentially undersegmented. + + Criteria: (word_count >= threshold OR ayah_span >= threshold) AND duration >= threshold. 
+ """ + if duration < UNDERSEG_MIN_DURATION: + return False + word_count, ayah_span = get_segment_word_stats(matched_ref) + return word_count >= UNDERSEG_MIN_WORDS or ayah_span >= UNDERSEG_MIN_AYAH_SPAN + + +# Arabic-Indic digits for verse markers +ARABIC_DIGITS = { + '0': '٠', '1': '١', '2': '٢', '3': '٣', '4': '٤', + '5': '٥', '6': '٦', '7': '٧', '8': '٨', '9': '٩', +} + +def to_arabic_numeral(number: int) -> str: + """Convert an integer to Arabic-Indic numerals.""" + return ''.join(ARABIC_DIGITS[d] for d in str(number)) + + +def format_verse_marker(verse_num: int) -> str: + """ + Format a verse number as an Arabic verse marker. + Uses U+06DD (Arabic End of Ayah) which renders as a decorated marker + in DigitalKhatt (combines U+06DD + digit into a single glyph). + """ + numeral = to_arabic_numeral(verse_num) + end_of_ayah = '\u06DD' + return f'{end_of_ayah}{numeral}' + + +# Cached verse word counts from surah_info.json +_verse_word_counts_cache: dict[int, dict[int, int]] | None = None + + +def _load_verse_word_counts() -> dict[int, dict[int, int]]: + """Load and cache verse word counts from surah_info.json.""" + global _verse_word_counts_cache + if _verse_word_counts_cache is not None: + return _verse_word_counts_cache + + with open(SURAH_INFO_PATH, 'r', encoding='utf-8') as f: + surah_info = json.load(f) + + _verse_word_counts_cache = {} + for surah_num, data in surah_info.items(): + surah_int = int(surah_num) + _verse_word_counts_cache[surah_int] = {} + for verse_data in data.get('verses', []): + verse_num = verse_data.get('verse') + num_words = verse_data.get('num_words', 0) + if verse_num: + _verse_word_counts_cache[surah_int][verse_num] = num_words + + return _verse_word_counts_cache + + +def split_into_char_groups(text): + """Split text into groups of base character + following combining marks. + + Each group is one visible "letter" — a base character followed by any + diacritics (tashkeel) or other combining marks attached to it. 
+ """ + groups = [] + current = "" + for ch in text: + if unicodedata.category(ch).startswith('M') and ch != '\u0670': + current += ch + else: + if current: + groups.append(current) + current = ch + if current: + groups.append(current) + return groups + + +ZWSP = '\u200b' +DAGGER_ALEF = '\u0670' + +def _wrap_word_with_chars(word_text, pos=None): + """Wrap a word in with nested per letter group.""" + # Strip tatweel (U+0640) — MFA doesn't output it, so keeping it causes + # index misalignment during timestamp injection + word_text = word_text.replace('\u0640', '') + # Insert ZWSP before dagger alef so it can be highlighted independently + spans = [] + for g in split_into_char_groups(word_text): + if g.startswith(DAGGER_ALEF): + spans.append(f'{ZWSP}{g}') + else: + spans.append(f'{g}') + char_spans = "".join(spans) + pos_attr = f' data-pos="{pos}"' if pos else '' + return f'{char_spans}' + + +def get_text_with_markers(matched_ref: str) -> str | None: + """ + Generate matched text with verse markers inserted at verse boundaries. + + Uses position-based detection: iterates words and inserts an HTML marker + after the last word of each verse (matching recitation_app approach). 
+ + Args: + matched_ref: Reference like "2:255:1-2:255:5" + + Returns: + Text with verse markers, or None if ref is invalid + """ + if not matched_ref: + return None + + from src.quran_index import get_quran_index + index = get_quran_index() + + indices = index.ref_to_indices(matched_ref) + if not indices: + return None + + start_idx, end_idx = indices + verse_word_counts = _load_verse_word_counts() + + parts = [] + for w in index.words[start_idx:end_idx + 1]: + parts.append(_wrap_word_with_chars(w.display_text, pos=f"{w.surah}:{w.ayah}:{w.word}")) + # Check if this is the last word of its verse + num_words = verse_word_counts.get(w.surah, {}).get(w.ayah, 0) + if num_words > 0 and w.word == num_words: + parts.append(format_verse_marker(w.ayah)) + + return " ".join(parts) + + +def simplify_ref(ref: str) -> str: + """Simplify a matched_ref like '84:9:1-84:9:4' to '84:9:1-4' when same verse.""" + if not ref or "-" not in ref: + return ref + parts = ref.split("-") + if len(parts) != 2: + return ref + start, end = parts + start_parts = start.split(":") + end_parts = end.split(":") + if len(start_parts) == 3 and len(end_parts) == 3: + if start_parts[0] == end_parts[0] and start_parts[1] == end_parts[1]: + return f"{start}-{end_parts[2]}" + return ref + + +def render_segment_card(seg: SegmentInfo, idx: int, audio_int16: np.ndarray = None, sample_rate: int = 0, render_key: str = "", segment_dir: Path = None, audio_preload: str = "metadata", audio_inline: bool = False) -> str: + """Render a single segment as an HTML card with optional audio player. 
+ + Args: + seg: Segment info + idx: Segment index + audio_int16: Full audio as int16 array for writing per-segment WAV files + sample_rate: Audio sample rate in Hz + render_key: Unique key to prevent browser caching between renders + segment_dir: Directory to write segment WAV files into + """ + confidence_class = get_confidence_class(seg.match_score) + confidence_badge_class = confidence_class # preserve original for badge color + if seg.has_missing_words: + confidence_class = "segment-low" + if seg.potentially_undersegmented and confidence_class != "segment-low": + confidence_class = "segment-underseg" + + timestamp = f"{format_timestamp(seg.start_time)} - {format_timestamp(seg.end_time)}" + duration = seg.end_time - seg.start_time + + # Format reference (simplify same-verse refs) + ref_display = simplify_ref(seg.matched_ref) if seg.matched_ref else "" + + # Confidence percentage with label + confidence_pct = f"Confidence: {seg.match_score:.0%}" + + # Undersegmented badge + underseg_badge = "" + if seg.potentially_undersegmented: + underseg_badge = '
Potentially Undersegmented
' + + # Missing words badge + missing_badge = "" + if seg.has_missing_words: + missing_badge = '
Missing Words
' + + # Error display + error_html = "" + if seg.error: + error_html = f'
{seg.error}
' + + # Audio player HTML — each segment gets its own WAV file served by Gradio. + audio_html = "" + if audio_int16 is not None and sample_rate > 0 and segment_dir is not None: + audio_src = encode_segment_audio(audio_int16, sample_rate, seg.start_time, seg.end_time, segment_dir, idx, inline=audio_inline) + # Add animate button only if segment has matched_ref (Quran text with word spans) + animate_btn = "" + if seg.matched_ref: + animate_btn = f'' + audio_html = f''' +
+ + + {animate_btn} +
+ ''' + + # Build matched text with verse markers at all verse boundaries + BASMALA_TEXT = "بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيم" + ISTIATHA_TEXT = "أَعُوذُ بِٱللَّهِ مِنَ الشَّيْطَانِ الرَّجِيم" + COMBINED_PREFIX = ISTIATHA_TEXT + " ۝ " + BASMALA_TEXT + _SPECIAL_PREFIXES = [COMBINED_PREFIX, ISTIATHA_TEXT, BASMALA_TEXT] + + # Helper to wrap words in spans + def wrap_words_in_spans(text): + return " ".join(_wrap_word_with_chars(w) for w in text.split()) + + if seg.matched_ref: + # Generate text with markers from the index + text_html = get_text_with_markers(seg.matched_ref) + if text_html and seg.matched_text: + # Check for any special prefix (fused or forward-merged) + for _sp_name, _sp in [("Isti'adha+Basmala", COMBINED_PREFIX), + ("Isti'adha", ISTIATHA_TEXT), + ("Basmala", BASMALA_TEXT)]: + if seg.matched_text.startswith(_sp): + mfa_prefix = f"{_sp_name}+{seg.matched_ref}" + words = _sp.replace(" ۝ ", " ").split() + prefix_html = " ".join( + _wrap_word_with_chars(w, pos=f"{mfa_prefix}:0:0:{i+1}") + for i, w in enumerate(words) + ) + text_html = prefix_html + " " + text_html + break + elif not text_html: + # Special ref (Basmala/Isti'adha): wrap words with indexed data-pos + # so MFA timestamps can be injected later + if seg.matched_ref and seg.matched_text: + words = seg.matched_text.split() + text_html = " ".join( + _wrap_word_with_chars(w, pos=f"{seg.matched_ref}:0:0:{i+1}") + for i, w in enumerate(words) + ) + else: + text_html = seg.matched_text or "" + elif seg.matched_text: + # Special segments (Basmala/Isti'adha) have text but no ref + text_html = wrap_words_in_spans(seg.matched_text) + else: + text_html = "" + + confidence_badge = "" if seg.has_missing_words else f'
{confidence_pct}
' + + # Build inline header: Segment N | ref | duration | time range + header_parts = [f"Segment {idx + 1}"] + if ref_display: + header_parts.append(ref_display) + header_parts.append(f"{duration:.1f}s") + header_parts.append(timestamp) + header_text = " | ".join(header_parts) + + html = f''' +
+
+
{header_text}
+
+ {underseg_badge} + {confidence_badge} + {missing_badge} +
+
+ + {audio_html} + +
+ {text_html} +
+ + {error_html} +
+ ''' + return html + + +def render_segments(segments: list, audio_int16: np.ndarray = None, sample_rate: int = 0, cpu_fallback: bool = False, segment_dir: Path = None) -> str: + """Render all segments as HTML with optional audio players. + + Args: + segments: List of SegmentInfo objects + audio_int16: Full audio as int16 array for writing per-segment WAV files + sample_rate: Audio sample rate in Hz + cpu_fallback: If True, show warning that GPU quota was exhausted + segment_dir: Directory containing per-segment WAV files + """ + import time + import wave + + if not segments: + return '
No segments detected
' + + # Generate unique key for this render to prevent audio caching + render_key = str(int(time.time() * 1000)) + + # Write full audio file for unified megacard playback + full_audio_url = "" + if audio_int16 is not None and sample_rate > 0 and segment_dir: + full_path = segment_dir / "full.wav" + with wave.open(str(full_path), 'wb') as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(sample_rate) + wf.writeframes(audio_int16.tobytes()) + full_audio_url = f"/gradio_api/file={full_path}" + + # Categorize segments by confidence level (1-indexed for display) + med_segments = [i + 1 for i, s in enumerate(segments) if CONFIDENCE_MED <= s.match_score < CONFIDENCE_HIGH] + low_segments = [i + 1 for i, s in enumerate(segments) if s.match_score < CONFIDENCE_MED] + + # Build header with confidence summary + header_parts = [] + + # GPU quota warning banner + if cpu_fallback: + reset_time = get_quota_reset_time() + reset_msg = f' Resets in {reset_time}.' if reset_time else '' + header_parts.append( + '
' + f'Daily GPU quota reached. Processing on CPU (slower performance).{reset_msg}' + '
' + ) + + header_parts.append(f'
Found {len(segments)} segments
') + + # Combined review summary: merge medium and low confidence segments into one color-coded list + low_set = set(low_segments) + all_review = sorted(set(med_segments) | low_set) + if all_review: + def _span(n: int) -> str: + css = "segment-low-text" if n in low_set else "segment-med-text" + return f'{n}' + + if len(all_review) <= REVIEW_SUMMARY_MAX_SEGMENTS: + seg_html = ", ".join(_span(n) for n in all_review) + else: + seg_html = ", ".join(_span(n) for n in all_review[:REVIEW_SUMMARY_MAX_SEGMENTS]) + remaining = len(all_review) - REVIEW_SUMMARY_MAX_SEGMENTS + seg_html += f" ... and {remaining} more" + + header_parts.append( + f'
' + f'Needs review: {len(all_review)} (segments {seg_html})' + f'
' + ) + + missing_segments = [i + 1 for i, s in enumerate(segments) if s.has_missing_words] + if missing_segments: + # Group consecutive segment numbers into pairs (gaps always flag both neighbors) + missing_pairs = [] + i = 0 + while i < len(missing_segments): + if i + 1 < len(missing_segments) and missing_segments[i + 1] == missing_segments[i] + 1: + missing_pairs.append(f"{missing_segments[i]}/{missing_segments[i + 1]}") + i += 2 + else: + missing_pairs.append(str(missing_segments[i])) + i += 1 + + if len(missing_pairs) <= REVIEW_SUMMARY_MAX_SEGMENTS: + pairs_display = ", ".join(missing_pairs) + else: + pairs_display = ", ".join(missing_pairs[:REVIEW_SUMMARY_MAX_SEGMENTS]) + remaining = len(missing_pairs) - REVIEW_SUMMARY_MAX_SEGMENTS + pairs_display += f" ... and {remaining} more" + + header_parts.append( + f'
' + f'Segments with missing words: {len(missing_pairs)} (segments {pairs_display})' + f'
' + ) + + underseg_segments = [i + 1 for i, s in enumerate(segments) if s.potentially_undersegmented] + if underseg_segments: + if len(underseg_segments) <= REVIEW_SUMMARY_MAX_SEGMENTS: + underseg_display = ", ".join(str(n) for n in underseg_segments) + else: + underseg_display = ", ".join(str(n) for n in underseg_segments[:REVIEW_SUMMARY_MAX_SEGMENTS]) + remaining = len(underseg_segments) - REVIEW_SUMMARY_MAX_SEGMENTS + underseg_display += f" ... and {remaining} more" + + header_parts.append( + f'
' + f'Potentially undersegmented: {len(underseg_segments)} (segments {underseg_display})' + f'
' + ) + + html_parts = [ + f'
', + "\n".join(header_parts), + ] + + for idx, seg in enumerate(segments): + inline = idx < AUDIO_PRELOAD_COUNT + preload = "auto" if inline else "metadata" + html_parts.append(render_segment_card(seg, idx, audio_int16, sample_rate, render_key, segment_dir, audio_preload=preload, audio_inline=inline)) + + html_parts.append('
# NOTE(review): the closing statements of render_segments() appeared at this
# flattening boundary (an append of a corrupted literal plus the final
# "\n".join(html_parts) return); they are not reproduced here.


# =============================================================================
# Main processing
# =============================================================================

def encode_segment_audio(
    audio_int16: np.ndarray, sample_rate: int,
    start_time: float, end_time: float,
    segment_dir: Path, segment_idx: int,
    inline: bool = False,
) -> str:
    """Write a segment's audio slice as a WAV file and return a src URL.

    Args:
        audio_int16: Full audio already converted to int16 (avoids a
            per-segment conversion).
        sample_rate: Sample rate in Hz.
        start_time: Segment start in seconds.
        end_time: Segment end in seconds.
        segment_dir: Directory to write the WAV file into.
        segment_idx: Segment index (used for filename).
        inline: If True, return a base64 data URI instead of a file URL.

    Returns:
        A ``data:`` URI (inline) or ``/gradio_api/file=`` URL.
    """
    import wave
    import io

    clip = audio_int16[int(start_time * sample_rate):int(end_time * sample_rate)]

    # The WAV is always written to disk — MFA timestamp computation needs the
    # file even when the player gets an inline data URI.
    path = segment_dir / f"seg_{segment_idx}.wav"
    with wave.open(str(path), 'wb') as wf:
        wf.setnchannels(1)      # mono
        wf.setsampwidth(2)      # 16-bit samples
        wf.setframerate(sample_rate)
        wf.writeframes(clip.tobytes())

    if not inline:
        return f"/gradio_api/file={path}"

    import base64
    encoded = base64.b64encode(path.read_bytes()).decode('ascii')
    return f"data:audio/wav;base64,{encoded}"


# NOTE(review): is_end_of_verse() began here in the flattened source but was
# cut mid-docstring at this boundary; its head is not reproduced here.
+ """ + if not matched_ref or ":" not in matched_ref: + return False + + try: + # Take the end part of the range (or the single ref) + end_ref = matched_ref.split("-")[-1] + parts = end_ref.split(":") + if len(parts) < 3: + return False + + surah = int(parts[0]) + ayah = int(parts[1]) + word = int(parts[2]) + + verse_word_counts = _load_verse_word_counts() + if surah not in verse_word_counts: + return False + + num_words = verse_word_counts[surah].get(ayah, 0) + return word >= num_words + except Exception as e: + print(f"Error checking end of verse: {e}") + + return False + + +def _run_post_vad_pipeline( + audio, sample_rate, intervals, + model_name, device, profiling, pipeline_start, progress_steps, + progress=gr.Progress(), + precomputed_asr=None, + min_silence_ms=0, min_speech_ms=0, pad_ms=0, + request=None, log_row=None, +): + """Shared pipeline after VAD: ASR → specials → anchor → matching → results. + + Args: + audio: Preprocessed float32 mono 16kHz audio array + sample_rate: Sample rate (16000) + intervals: List of (start, end) tuples from VAD cleaning + model_name: ASR model name ("Base" or "Large") + device: Device string ("gpu" or "cpu") + profiling: ProfilingData instance to populate + pipeline_start: time.time() when pipeline started + precomputed_asr: Optional tuple of (results, batch_profiling, sorting_time, + batch_build_time, gpu_move_time, gpu_time) from a combined GPU lease. + If provided, skips the standalone ASR GPU call. + + Returns: + (html, json_output, segment_dir) tuple + """ + import time + + if not intervals: + return "
No speech segments detected in audio
", {"segments": []}, None + + # Build VAD segments and extract audio arrays + vad_segments = [] + segment_audios = [] + for idx, (start, end) in enumerate(intervals): + vad_segments.append(VadSegment(start_time=start, end_time=end, segment_idx=idx)) + start_sample = int(start * sample_rate) + end_sample = int(end * sample_rate) + segment_audios.append(audio[start_sample:end_sample]) + + print(f"[VAD] {len(vad_segments)} segments") + + if precomputed_asr is not None: + # ASR already ran within the combined GPU lease + phoneme_texts, asr_batch_profiling, asr_sorting_time, asr_batch_build_time, asr_gpu_move_time, asr_gpu_time = precomputed_asr + print(f"[PHONEME ASR] {len(phoneme_texts)} results (precomputed, gpu {asr_gpu_time:.2f}s)") + else: + # Standalone ASR GPU lease (resegment/retranscribe paths) + progress(*progress_steps["asr"]) + print(f"[STAGE] Running ASR...") + + phoneme_asr_start = time.time() + phoneme_texts, asr_batch_profiling, asr_sorting_time, asr_batch_build_time, asr_gpu_move_time, asr_gpu_time = run_phoneme_asr_gpu(segment_audios, sample_rate, model_name) + phoneme_asr_time = time.time() - phoneme_asr_start + profiling.asr_time = phoneme_asr_time + profiling.asr_gpu_time = asr_gpu_time + profiling.asr_model_move_time = asr_gpu_move_time + profiling.asr_sorting_time = asr_sorting_time + profiling.asr_batch_build_time = asr_batch_build_time + profiling.asr_batch_profiling = asr_batch_profiling + print(f"[PHONEME ASR] {len(phoneme_texts)} results in {phoneme_asr_time:.2f}s (gpu {asr_gpu_time:.2f}s)") + + if asr_batch_profiling: + for b in asr_batch_profiling: + print(f" Batch {b['batch_num']:>2}: {b['size']:>3} segs | " + f"{b['time']:.3f}s | " + f"{b['min_dur']:.2f}-{b['max_dur']:.2f}s " + f"(A {b['avg_dur']:.2f}s, T {b['total_seconds']:.1f}s, W {b['pad_waste']:.0%})") + + # Phoneme-based special segment detection + progress(*progress_steps["special_segments"]) + print(f"[STAGE] Detecting special segments...") + from src.alignment.special_segments 
import detect_special_segments + vad_segments, segment_audios, special_results, first_quran_idx = detect_special_segments( + phoneme_texts, vad_segments, segment_audios + ) + + # If segments were split (combined Isti'adha+Basmala), pad phoneme_texts + # with empty placeholders so indices stay aligned. + if len(vad_segments) != len(phoneme_texts): + phoneme_texts = [[], []] + phoneme_texts[1:] + + # Anchor detection via phoneme n-gram voting + progress(*progress_steps["anchor"]) + print(f"[STAGE] Anchor detection...") + anchor_start = time.time() + from src.alignment.phoneme_anchor import find_anchor_by_voting, verse_to_word_index + from src.alignment.ngram_index import get_ngram_index + from src.alignment.phoneme_matcher_cache import get_chapter_reference + + surah, ayah = find_anchor_by_voting( + phoneme_texts[first_quran_idx:], + get_ngram_index(), + ANCHOR_SEGMENTS, + ) + + if surah == 0: + raise ValueError("Could not anchor to any chapter - no n-gram matches found") + + profiling.anchor_time = time.time() - anchor_start + print(f"[ANCHOR] Anchored to Surah {surah}, Ayah {ayah}") + + # Build chapter reference and set pointer + chapter_ref = get_chapter_reference(surah) + pointer = verse_to_word_index(chapter_ref, ayah) + + progress(*progress_steps["matching"]) + print(f"[STAGE] Text Matching...") + + # Phoneme-based DP alignment + match_start = time.time() + match_results, match_profiling, gap_segments = run_phoneme_matching( + phoneme_texts, + surah, + first_quran_idx, + special_results, + start_pointer=pointer, + ) + match_time = time.time() - match_start + profiling.match_wall_time = match_time + print(f"[MATCH] {len(match_results)} phoneme alignments in {match_time:.2f}s") + + # Populate phoneme alignment profiling (if enabled) + if PHONEME_ALIGNMENT_PROFILING: + profiling.phoneme_total_time = match_profiling.get("total_time", 0.0) + profiling.phoneme_ref_build_time = match_profiling.get("ref_build_time", 0.0) + profiling.phoneme_dp_total_time = 
match_profiling.get("dp_total_time", 0.0) + profiling.phoneme_dp_min_time = match_profiling.get("dp_min_time", 0.0) + profiling.phoneme_dp_max_time = match_profiling.get("dp_max_time", 0.0) + profiling.phoneme_window_setup_time = match_profiling.get("window_setup_time", 0.0) + profiling.phoneme_result_build_time = match_profiling.get("result_build_time", 0.0) + profiling.phoneme_num_segments = match_profiling.get("num_segments", 0) + + # Retry / reanchor counters (always available) + profiling.tier1_attempts = match_profiling.get("tier1_attempts", 0) + profiling.tier1_passed = match_profiling.get("tier1_passed", 0) + profiling.tier1_segments = match_profiling.get("tier1_segments", []) + profiling.tier2_attempts = match_profiling.get("tier2_attempts", 0) + profiling.tier2_passed = match_profiling.get("tier2_passed", 0) + profiling.tier2_segments = match_profiling.get("tier2_segments", []) + profiling.consec_reanchors = match_profiling.get("consec_reanchors", 0) + profiling.special_merges = match_profiling.get("special_merges", 0) + profiling.segments_attempted = match_profiling.get("segments_attempted", 0) + profiling.segments_passed = match_profiling.get("segments_passed", 0) + + progress(*progress_steps["building"]) + print(f"[STAGE] Building results...") + + # Build SegmentInfo list + segments = [] + result_build_start = time.time() + + # Convert full audio to int16 once + t_wav = time.time() + audio_int16 = (audio * 32767).astype(np.int16) + audio_encode_time = time.time() - t_wav + + # Create a per-request directory for segment WAV files + import uuid + segment_dir = SEGMENT_AUDIO_DIR / uuid.uuid4().hex + segment_dir.mkdir(parents=True, exist_ok=True) + + last_display_idx = len(vad_segments) - 1 + + # Tracking lists for segment stats logging + _seg_word_counts: list[int] = [] + _seg_durations: list[float] = [] + _seg_phoneme_counts: list[int] = [] + _seg_ayah_spans: list[int] = [] + _underseg_indices: list[int] = [] + _underseg_by_words: list[int] = [] + 
_underseg_by_ayah: list[int] = [] + + for idx, (seg, (matched_text, score, matched_ref)) in enumerate( + zip(vad_segments, match_results) + ): + if idx == last_display_idx and matched_ref: + if not is_end_of_verse(matched_ref): + score = max(0.0, score - 0.25) + + error = None + phoneme_text = " ".join(phoneme_texts[idx]) if idx < len(phoneme_texts) else "" + + if score <= 0.0: + matched_text = "" + matched_ref = "" + error = f"Low confidence ({score:.0%})" + + duration = seg.end_time - seg.start_time + word_count, ayah_span = get_segment_word_stats(matched_ref) + underseg = check_undersegmented(matched_ref, duration) + + segments.append(SegmentInfo( + start_time=seg.start_time, + end_time=seg.end_time, + transcribed_text=phoneme_text, + matched_text=matched_text, + matched_ref=matched_ref, + match_score=score, + error=error, + has_missing_words=idx in gap_segments, + potentially_undersegmented=underseg, + )) + + # Track per-segment stats for logging + _seg_word_counts.append(word_count) + _seg_durations.append(duration) + _seg_phoneme_counts.append(len(phoneme_texts[idx]) if idx < len(phoneme_texts) else 0) + _seg_ayah_spans.append(ayah_span) + if underseg: + _underseg_indices.append(idx + 1) + if word_count >= UNDERSEG_MIN_WORDS: + _underseg_by_words.append(idx + 1) + if ayah_span >= UNDERSEG_MIN_AYAH_SPAN: + _underseg_by_ayah.append(idx + 1) + + # Recompute from actual output + profiling.segments_attempted = len(segments) + profiling.segments_passed = sum(1 for s in segments if s.match_score > 0.0) + + result_build_total_time = time.time() - result_build_start + profiling.result_build_time = result_build_total_time + profiling.result_audio_encode_time = audio_encode_time + + progress(*progress_steps["done"]) + print("[STAGE] Done!") + + # Print profiling summary + profiling.total_time = time.time() - pipeline_start + print(profiling.summary()) + + # Segment distribution stats + matched_words = [w for w in _seg_word_counts if w > 0] + matched_durs = [d for i, d 
in enumerate(_seg_durations) if _seg_word_counts[i] > 0] + matched_phonemes = [p for i, p in enumerate(_seg_phoneme_counts) if _seg_word_counts[i] > 0] + pauses = [vad_segments[i + 1].start_time - vad_segments[i].end_time + for i in range(len(vad_segments) - 1)] + pauses = [p for p in pauses if p > 0] + if matched_words: + def _std(vals): + n = len(vals) + if n < 2: + return 0.0 + mean = sum(vals) / n + return (sum((v - mean) ** 2 for v in vals) / n) ** 0.5 + + avg_w = sum(matched_words) / len(matched_words) + std_w = _std(matched_words) + min_w, max_w = min(matched_words), max(matched_words) + avg_d = sum(matched_durs) / len(matched_durs) + std_d = _std(matched_durs) + min_d, max_d = min(matched_durs), max(matched_durs) + total_speech_sec = sum(matched_durs) + total_words = sum(matched_words) + total_phonemes = sum(matched_phonemes) + wpm = total_words / (total_speech_sec / 60) if total_speech_sec > 0 else 0 + pps = total_phonemes / total_speech_sec if total_speech_sec > 0 else 0 + print(f"\n[SEGMENT STATS] {len(segments)} total segments, {len(matched_words)} matched") + print(f" Words/segment : min={min_w}, max={max_w}, avg={avg_w:.1f}\u00b1{std_w:.1f}") + print(f" Duration (s) : min={min_d:.1f}, max={max_d:.1f}, avg={avg_d:.1f}\u00b1{std_d:.1f}") + if pauses: + avg_p = sum(pauses) / len(pauses) + std_p = _std(pauses) + print(f" Pause (s) : min={min(pauses):.1f}, max={max(pauses):.1f}, avg={avg_p:.1f}\u00b1{std_p:.1f}") + print(f" Speech pace : {wpm:.1f} words/min, {pps:.1f} phonemes/sec (speech time only)") + if _underseg_indices: + print(f" Undersegmented: {len(_underseg_indices)} (segments {', '.join(str(n) for n in _underseg_indices)})") + if _underseg_by_words: + print(f" by word count (>={UNDERSEG_MIN_WORDS}): {', '.join(str(n) for n in _underseg_by_words)}") + if _underseg_by_ayah: + print(f" by ayah span (>={UNDERSEG_MIN_AYAH_SPAN}): {', '.join(str(n) for n in _underseg_by_ayah)}") + else: + print(f" Undersegmented: 0") + + # --- Usage logging --- + try: 
+ from utils.usage_logger import log_alignment, update_alignment_row + + # Reciter stats (default 0.0 when no matched segments) + _log_wpm = wpm if matched_words else 0.0 + _log_pps = pps if matched_words else 0.0 + _log_avg_d = avg_d if matched_words else 0.0 + _log_std_d = std_d if matched_words else 0.0 + _log_avg_p = avg_p if (matched_words and pauses) else 0.0 + _log_std_p = std_p if (matched_words and pauses) else 0.0 + + # Mean confidence across all segments + all_scores = [seg.match_score for seg in segments] + _log_mean_conf = sum(all_scores) / len(all_scores) if all_scores else 0.0 + + # Build per-segment objects for logging + _log_segments = [] + for i, seg in enumerate(segments): + sp_type = None + if i < len(special_results) and special_results[i]: + sp_type = special_results[i] + _log_segments.append({ + "idx": i + 1, + "start": round(seg.start_time, 2), + "end": round(seg.end_time, 2), + "duration": round(seg.end_time - seg.start_time, 2), + "ref": seg.matched_ref or "", + "confidence": round(seg.match_score, 2), + "word_count": _seg_word_counts[i] if i < len(_seg_word_counts) else 0, + "ayah_span": _seg_ayah_spans[i] if i < len(_seg_ayah_spans) else 0, + "phoneme_count": _seg_phoneme_counts[i] if i < len(_seg_phoneme_counts) else 0, + "undersegmented": seg.potentially_undersegmented, + "missing_words": seg.has_missing_words, + "special_type": sp_type, + "error": seg.error, + }) + + _r = lambda v: round(v, 2) + _log_kwargs = dict( + audio_duration_s=_r(len(audio) / sample_rate), + num_segments=len(segments), + surah=surah, + min_silence_ms=min_silence_ms, + min_speech_ms=min_speech_ms, + pad_ms=pad_ms, + asr_model=model_name, + device=device, + total_time=_r(profiling.total_time), + vad_queue_time=_r(getattr(profiling, "vad_wall_time", 0.0) - getattr(profiling, "vad_gpu_time", 0.0)), + vad_gpu_time=_r(getattr(profiling, "vad_gpu_time", 0.0)), + asr_gpu_time=_r(getattr(profiling, "asr_gpu_time", 0.0)), + dp_total_time=_r(getattr(profiling, 
"phoneme_dp_total_time", 0.0)), + segments_passed=getattr(profiling, "segments_passed", 0), + segments_failed=getattr(profiling, "segments_attempted", 0) - getattr(profiling, "segments_passed", 0), + mean_confidence=_r(_log_mean_conf), + tier1_retries=getattr(profiling, "tier1_attempts", 0), + tier1_passed=getattr(profiling, "tier1_passed", 0), + tier2_retries=getattr(profiling, "tier2_attempts", 0), + tier2_passed=getattr(profiling, "tier2_passed", 0), + reanchors=getattr(profiling, "consec_reanchors", 0), + special_merges=getattr(profiling, "special_merges", 0), + words_per_minute=_r(_log_wpm), + phonemes_per_second=_r(_log_pps), + avg_segment_duration=_r(_log_avg_d), + std_segment_duration=_r(_log_std_d), + avg_pause_duration=_r(_log_avg_p), + std_pause_duration=_r(_log_std_p), + log_segments=_log_segments, + ) + + if log_row is not None: + # Resegment / retranscribe: mutate existing row in-place + _action = "retranscribe" if log_row.get("asr_model") != model_name else "resegment" + update_alignment_row(log_row, action=_action, **_log_kwargs) + else: + # Initial run: create new row + log_row = log_alignment( + audio=audio, + sample_rate=sample_rate, + request=request, + **_log_kwargs, + ) + except Exception as e: + print(f"[USAGE_LOG] Failed: {e}") + + # Build JSON output for API consumers + def parse_ref(matched_ref): + if not matched_ref: + return "", "" + if "-" in matched_ref: + parts = matched_ref.split("-") + return parts[0], parts[1] if len(parts) > 1 else parts[0] + return matched_ref, matched_ref + + segments_list = [] + for i, seg in enumerate(segments): + segment_data = { + "segment": i + 1, + "time_from": round(seg.start_time, 3), + "time_to": round(seg.end_time, 3), + "ref_from": parse_ref(seg.matched_ref)[0], + "ref_to": parse_ref(seg.matched_ref)[1], + "matched_text": seg.matched_text or "", + "confidence": round(seg.match_score, 3), + "potentially_undersegmented": seg.potentially_undersegmented, + "error": seg.error + } + 
def process_audio(
    audio_data,
    min_silence_ms,
    min_speech_ms,
    pad_ms,
    model_name="Base",
    device="GPU",
    request: gr.Request = None,
    progress=gr.Progress(),
):
    """Process uploaded audio and extract segments with automatic verse detection.

    Runs the full pipeline: normalize audio -> single GPU lease for VAD + ASR
    -> post-VAD alignment pipeline (anchoring, matching, rendering).

    Args:
        audio_data: (sample_rate, np.ndarray) tuple from gr.Audio, or None.
        min_silence_ms / min_speech_ms / pad_ms: VAD cleaning parameters.
        model_name: ASR model label ("Base" or "Large").
        device: "GPU" or "CPU" (case-insensitive; normalized below).
        request: Gradio request object, forwarded for usage logging.
        progress: Gradio progress reporter.

    Returns:
        (html, json_output, raw_speech_intervals, raw_is_complete,
         preprocessed_audio, sample_rate, intervals, segment_dir, log_row)
    """
    # NOTE(review): the early-return status message was originally wrapped in
    # HTML markup that was stripped during extraction — restore the wrapper.
    if audio_data is None:
        return "Please upload an audio file", None, None, None, None, None, None, None, None

    # Normalize device label to lowercase for downstream checks
    device = device.lower()

    # Reset per-request so each request retries GPU fresh
    from src.zero_gpu import reset_quota_flag, force_cpu_mode
    reset_quota_flag()

    if device == "cpu":
        force_cpu_mode()

    print(f"\n{'='*60}")
    print("Processing audio with automatic verse detection")
    print(f"Settings: silence={min_silence_ms}ms, speech={min_speech_ms}ms, pad={pad_ms}ms, device={device}")
    print(f"{'='*60}")

    # Initialize profiling data (uses module-level `time`; the previous
    # function-local `import time` shadowed the top-of-file import)
    profiling = ProfilingData()
    pipeline_start = time.time()

    sample_rate, audio = audio_data

    # Convert integer PCM to float32 in [-1, 1)
    if audio.dtype == np.int16:
        audio = audio.astype(np.float32) / 32768.0
    elif audio.dtype == np.int32:
        audio = audio.astype(np.float32) / 2147483648.0

    # Convert stereo to mono by averaging channels
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # Resample to 16kHz once (both VAD and ASR models require 16kHz)
    if sample_rate != 16000:
        resample_start = time.time()
        audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000, res_type=RESAMPLE_TYPE)
        profiling.resample_time = time.time() - resample_start
        print(f"[PROFILE] Resampling {sample_rate}Hz -> 16000Hz took {profiling.resample_time:.3f}s (audio length: {len(audio)/16000:.1f}s, res_type={RESAMPLE_TYPE})")
        sample_rate = 16000

    progress(*PROGRESS_PROCESS_AUDIO["vad_asr"])
    print("[STAGE] Running VAD + ASR...")

    # Single GPU lease: VAD + ASR
    gpu_start = time.time()
    (intervals, vad_profiling, vad_gpu_time, raw_speech_intervals, raw_is_complete,
     asr_results, asr_batch_profiling, asr_sorting_time, asr_batch_build_time,
     asr_gpu_move_time, asr_gpu_time) = run_vad_and_asr_gpu(
        audio, sample_rate, int(min_silence_ms), int(min_speech_ms), int(pad_ms), model_name
    )
    wall_time = time.time() - gpu_start

    # VAD profiling: queue wait is attributed to VAD (it happens before VAD runs)
    profiling.vad_model_load_time = vad_profiling.get("model_load_time", 0.0)
    profiling.vad_model_move_time = vad_profiling.get("model_move_time", 0.0)
    profiling.vad_inference_time = vad_profiling.get("inference_time", 0.0)
    profiling.vad_gpu_time = vad_gpu_time
    profiling.vad_wall_time = wall_time - asr_gpu_time
    print(f"[GPU] VAD completed in {profiling.vad_wall_time:.2f}s (gpu {vad_gpu_time:.2f}s)")

    # NOTE(review): message originally carried HTML markup lost in extraction.
    if not intervals:
        return "No speech segments detected in audio", None, None, None, None, None, None, None, None

    # ASR profiling: no separate queue (ran within same lease)
    profiling.asr_time = asr_gpu_time
    profiling.asr_gpu_time = asr_gpu_time
    profiling.asr_model_move_time = asr_gpu_move_time
    profiling.asr_sorting_time = asr_sorting_time
    profiling.asr_batch_build_time = asr_batch_build_time
    profiling.asr_batch_profiling = asr_batch_profiling
    print(f"[GPU] ASR completed in {asr_gpu_time:.2f}s")

    # Run post-VAD pipeline (ASR already done, pass results)
    html, json_output, seg_dir, log_row = _run_post_vad_pipeline(
        audio, sample_rate, intervals,
        model_name, device, profiling, pipeline_start, PROGRESS_PROCESS_AUDIO,
        progress=progress,
        precomputed_asr=(asr_results, asr_batch_profiling, asr_sorting_time, asr_batch_build_time, asr_gpu_move_time, asr_gpu_time),
        min_silence_ms=min_silence_ms, min_speech_ms=min_speech_ms, pad_ms=pad_ms,
        request=request,
    )

    return html, json_output, raw_speech_intervals, raw_is_complete, audio, sample_rate, intervals, seg_dir, log_row
", None, None, None, None, None, None, None, None + + # ASR profiling: no separate queue (ran within same lease) + profiling.asr_time = asr_gpu_time + profiling.asr_gpu_time = asr_gpu_time + profiling.asr_model_move_time = asr_gpu_move_time + profiling.asr_sorting_time = asr_sorting_time + profiling.asr_batch_build_time = asr_batch_build_time + profiling.asr_batch_profiling = asr_batch_profiling + print(f"[GPU] ASR completed in {asr_gpu_time:.2f}s") + + # Run post-VAD pipeline (ASR already done, pass results) + html, json_output, seg_dir, log_row = _run_post_vad_pipeline( + audio, sample_rate, intervals, + model_name, device, profiling, pipeline_start, PROGRESS_PROCESS_AUDIO, + progress=progress, + precomputed_asr=(asr_results, asr_batch_profiling, asr_sorting_time, asr_batch_build_time, asr_gpu_move_time, asr_gpu_time), + min_silence_ms=min_silence_ms, min_speech_ms=min_speech_ms, pad_ms=pad_ms, + request=request, + ) + + return html, json_output, raw_speech_intervals, raw_is_complete, audio, sample_rate, intervals, seg_dir, log_row + + +def resegment_audio( + cached_speech_intervals, cached_is_complete, + cached_audio, cached_sample_rate, + min_silence_ms, min_speech_ms, pad_ms, + model_name="Base", device="GPU", + cached_log_row=None, + request: gr.Request = None, + progress=gr.Progress(), +): + """Re-run segmentation with different settings using cached VAD data. + + Skips the heavy VAD model inference — only re-cleans speech intervals + and re-runs ASR + downstream pipeline. + + Returns: + (html, json_output, cached_speech_intervals, cached_is_complete, cached_audio, cached_sample_rate, intervals, segment_dir, log_row) + """ + import time + + if cached_speech_intervals is None or cached_audio is None: + return "
No cached data. Please run Extract Segments first.
", None, None, None, None, None, None, None, None + + # Normalize device label + device = device.lower() + + from src.zero_gpu import reset_quota_flag, force_cpu_mode + reset_quota_flag() + if device == "cpu": + force_cpu_mode() + + print(f"\n{'='*60}") + print(f"RESEGMENTING with different settings") + print(f"Settings: silence={min_silence_ms}ms, speech={min_speech_ms}ms, pad={pad_ms}ms") + print(f"{'='*60}") + + profiling = ProfilingData() + pipeline_start = time.time() + + progress(*PROGRESS_RESEGMENT["resegment"]) + print("[STAGE] Resegmenting...") + + # Re-clean speech intervals with new parameters (CPU, no GPU needed) + from recitations_segmenter import clean_speech_intervals + clean_out = clean_speech_intervals( + cached_speech_intervals, + cached_is_complete, + min_silence_duration_ms=int(min_silence_ms), + min_speech_duration_ms=int(min_speech_ms), + pad_duration_ms=int(pad_ms), + return_seconds=True, + ) + + intervals = clean_out.clean_speech_intervals.tolist() + intervals = [(start, end) for start, end in intervals] + + raw_count = len(cached_speech_intervals) + final_count = len(intervals) + removed = raw_count - final_count + print(f"[RESEGMENT] Raw intervals: {raw_count}, after cleaning: {final_count} " + f"({removed} removed by silence merge + min_speech={min_speech_ms}ms filter)") + + if not intervals: + return "
No speech segments detected with these settings
", None, cached_speech_intervals, cached_is_complete, cached_audio, cached_sample_rate, None, None, cached_log_row + + # Run post-VAD pipeline + html, json_output, seg_dir, log_row = _run_post_vad_pipeline( + cached_audio, cached_sample_rate, intervals, + model_name, device, profiling, pipeline_start, PROGRESS_RESEGMENT, + progress=progress, + min_silence_ms=min_silence_ms, min_speech_ms=min_speech_ms, pad_ms=pad_ms, + request=request, log_row=cached_log_row, + ) + + # Pass through cached state unchanged, but update intervals + return html, json_output, cached_speech_intervals, cached_is_complete, cached_audio, cached_sample_rate, intervals, seg_dir, log_row + + +def retranscribe_audio( + cached_intervals, + cached_audio, cached_sample_rate, + cached_speech_intervals, cached_is_complete, + model_name, + device="GPU", + cached_log_row=None, + min_silence_ms=0, min_speech_ms=0, pad_ms=0, + request: gr.Request = None, + progress=gr.Progress(), +): + """Re-run ASR + downstream with a different model using cached intervals. + + Uses the same segment boundaries but a different ASR model. + + Returns: + (html, json_output, cached_speech_intervals, cached_is_complete, + cached_audio, cached_sample_rate, cached_intervals, segment_dir, log_row) + """ + import time + + if cached_intervals is None or cached_audio is None: + return "
No cached data. Please run Extract Segments first.
", None, None, None, None, None, None, None, None + + device = device.lower() + + from src.zero_gpu import reset_quota_flag, force_cpu_mode + reset_quota_flag() + if device == "cpu": + force_cpu_mode() + + print(f"\n{'='*60}") + print(f"RETRANSCRIBING with {model_name} model") + print(f"{'='*60}") + + profiling = ProfilingData() + pipeline_start = time.time() + + pct, desc = PROGRESS_RETRANSCRIBE["retranscribe"] + progress(pct, desc=desc.format(model=model_name)) + print(f"[STAGE] Retranscribing with {model_name} model...") + + html, json_output, seg_dir, log_row = _run_post_vad_pipeline( + cached_audio, cached_sample_rate, cached_intervals, + model_name, device, profiling, pipeline_start, PROGRESS_RETRANSCRIBE, + progress=progress, + min_silence_ms=min_silence_ms, min_speech_ms=min_speech_ms, pad_ms=pad_ms, + request=request, log_row=cached_log_row, + ) + + # Pass through all cached state unchanged + return html, json_output, cached_speech_intervals, cached_is_complete, cached_audio, cached_sample_rate, cached_intervals, seg_dir, log_row + + +def _retranscribe_wrapper( + cached_intervals, cached_audio, cached_sample_rate, + cached_speech_intervals, cached_is_complete, + cached_model_name, device, + cached_log_row=None, + min_silence_ms=0, min_speech_ms=0, pad_ms=0, + request: gr.Request = None, + progress=gr.Progress(), +): + """Compute opposite model from cached_model_name and run retranscribe.""" + opposite = "Large" if cached_model_name == "Base" else "Base" + return retranscribe_audio( + cached_intervals, cached_audio, cached_sample_rate, + cached_speech_intervals, cached_is_complete, + opposite, device, + cached_log_row=cached_log_row, + min_silence_ms=min_silence_ms, min_speech_ms=min_speech_ms, pad_ms=pad_ms, + request=request, + progress=progress, + ) + + +def process_audio_json(audio_data, min_silence_ms, min_speech_ms, pad_ms, model_name="Base", device="GPU"): + """API-only endpoint that returns just JSON (no HTML).""" + result = 
process_audio(audio_data, min_silence_ms, min_speech_ms, pad_ms, model_name, device) + return result[1] # json_output is at index 1 + + +def save_json_export(json_data): + """Save JSON results to a temp file for download.""" + import tempfile + import json + + if not json_data or not json_data.get("segments"): + return None + + # Create temp file with JSON + with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False, encoding='utf-8') as f: + json.dump(json_data, f, indent=2, ensure_ascii=False) + return f.name + + +def _mfa_upload_and_submit(refs, audio_paths): + """Upload audio files and submit alignment batch to the MFA Space. + + Returns (event_id, headers, base_url) so the caller can yield a progress + update before blocking on the SSE result stream. + """ + import requests + + hf_token = os.environ.get("HF_TOKEN", "") + headers = {} + if hf_token: + headers["Authorization"] = f"Bearer {hf_token}" + print(f"[MFA_TS] HF_TOKEN={'set' if hf_token else 'NOT SET'}") + + base = MFA_SPACE_URL + print(f"[MFA_TS] MFA base URL: {base}") + + # Upload all audio files in a single batched request + files_payload = [] + open_handles = [] + for path in audio_paths: + fh = open(path, "rb") + open_handles.append(fh) + files_payload.append(("files", (os.path.basename(path), fh, "audio/wav"))) + try: + resp = requests.post( + f"{base}/gradio_api/upload", + headers=headers, + files=files_payload, + timeout=MFA_TIMEOUT, + ) + resp.raise_for_status() + uploaded_paths = resp.json() + finally: + for fh in open_handles: + fh.close() + + # Build FileData objects + file_data_list = [ + {"path": p, "meta": {"_type": "gradio.FileData"}} + for p in uploaded_paths + ] + + # Submit batch alignment + submit_resp = requests.post( + f"{base}/gradio_api/call/align_batch", + headers={**headers, "Content-Type": "application/json"}, + json={"data": [refs, file_data_list]}, + timeout=MFA_TIMEOUT, + ) + submit_resp.raise_for_status() + event_id = submit_resp.json()["event_id"] + 
print(f"[MFA_TS] Submitted batch, event_id={event_id}") + + return event_id, headers, base + + +def _mfa_wait_result(event_id, headers, base): + """Wait for the MFA SSE stream and return parsed results list.""" + import requests + import json + + sse_resp = requests.get( + f"{base}/gradio_api/call/align_batch/{event_id}", + headers=headers, + stream=True, + timeout=MFA_TIMEOUT, + ) + sse_resp.raise_for_status() + + result_data = None + for line in sse_resp.iter_lines(decode_unicode=True): + if line and line.startswith("data: "): + result_data = line[6:] # strip "data: " prefix + + if result_data is None: + raise RuntimeError("No data received from MFA align_batch SSE stream") + + parsed = json.loads(result_data) + # Gradio wraps the return value in a list + if isinstance(parsed, list) and len(parsed) == 1: + parsed = parsed[0] + + if parsed.get("status") != "ok": + raise RuntimeError(f"MFA align_batch failed: {parsed.get('error', parsed)}") + + return parsed["results"] + + +def _ts_progress_bar_html(total_segments, rate, animated=True): + """Return HTML for a progress bar showing Segment x/N. + + When *animated* is False the bar is static at 0 %. 
When True the CSS fill + animation runs and an img-onerror trick drives the text counter (since + Gradio innerHTML doesn't execute + """ + + js = js.replace('__SURAH_LIGATURES_JSON__', json.dumps(_SURAH_LIGATURES)) + + with gr.Blocks(title="Quran Multi-Aligner", css=css, head=js, delete_cache=(DELETE_CACHE_FREQUENCY, DELETE_CACHE_AGE)) as app: + gr.Markdown("# 🎙️ Quran Multi-Aligner") + gr.Markdown(""" +- Transcribe and split any recitation by pauses within 1-2 minutes +- Get precise pause-, verse-, word- and character-level timestamps, exportable as JSON +- GPU-powered API usage with daily quotas +- Reliable confidence system to flag uncertain segments and missed words — no silent errors +- Robust tolerance to noise, speaker variation and suboptimal audio quality, particularly with the large model +- Not intended for incorrect or fragmented recitations; most suited for correct, continuous recitations (repetitions handled) +- [Feedback and contributions are welcome](https://huggingface.co/spaces/hetchyy/Quran-aligner/discussions) +""") + + # API Documentation accordion + with gr.Accordion("📡 API Usage", open=False): + gr.Markdown("In progress") + + with gr.Row(elem_id="main-row"): + # Left column: Input + with gr.Column(scale=LEFT_COLUMN_SCALE, elem_id="left-col"): + audio_input = gr.Audio( + label="Upload Recitation", + sources=["upload", "microphone"], + type="numpy" + ) + + # Example audio files — short surahs use CPU, long ones use GPU + with gr.Row(): + btn_ex_112 = gr.Button("112", size="sm", min_width=0) + btn_ex_84 = gr.Button("84", size="sm", min_width=0) + btn_ex_7 = gr.Button("7", size="sm", min_width=0) + btn_ex_juz30 = gr.Button("Juz' 30", size="sm", min_width=0) + + with gr.Accordion("Animation Settings", open=False, elem_id="anim-settings-accordion"): + with gr.Row(elem_id="anim-style-row"): + anim_granularity_radio = gr.Radio( + choices=ANIM_GRANULARITIES, + value=ANIM_GRANULARITY_DEFAULT, + label="Granularity", + scale=ANIM_STYLE_ROW_SCALES[0], + ) 
+ anim_mode_radio = gr.Radio( + choices=ANIM_DISPLAY_MODES, + value=ANIM_DISPLAY_MODE_DEFAULT, + label="Animation Style", + scale=ANIM_STYLE_ROW_SCALES[1], + ) + anim_verse_checkbox = gr.Checkbox( + value=False, + label="Verse Only", + elem_id="anim-verse-mode", + scale=ANIM_STYLE_ROW_SCALES[2], min_width=90, + ) + anim_color_picker = gr.ColorPicker( + value=ANIM_WORD_COLOR, + label="Color", + scale=ANIM_STYLE_ROW_SCALES[3], + ) + _is_custom = (ANIM_DISPLAY_MODE_DEFAULT == "Custom") + _preset = ANIM_PRESETS.get(ANIM_DISPLAY_MODE_DEFAULT, {}) + with gr.Row(): + anim_opacity_prev_slider = gr.Slider( + minimum=0, maximum=1, step=ANIM_OPACITY_STEP, + value=_preset.get("prev_opacity", ANIM_OPACITY_PREV_DEFAULT), + label="Before Opacity", + interactive=_is_custom, + elem_id="anim-opacity-prev", + ) + anim_opacity_after_slider = gr.Slider( + minimum=0, maximum=1, step=ANIM_OPACITY_STEP, + value=_preset.get("after_opacity", ANIM_OPACITY_AFTER_DEFAULT), + label="After Opacity", + interactive=_is_custom, + elem_id="anim-opacity-after", + ) + with gr.Row(): + anim_window_prev_slider = gr.Slider( + minimum=ANIM_WINDOW_PREV_MIN, maximum=ANIM_WINDOW_PREV_MAX, step=1, + value=_preset.get("prev_words", ANIM_WINDOW_PREV_DEFAULT), + label="Before Words", elem_id="anim-window-prev", + interactive=_is_custom, + ) + anim_window_after_slider = gr.Slider( + minimum=ANIM_WINDOW_AFTER_MIN, maximum=ANIM_WINDOW_AFTER_MAX, step=1, + value=_preset.get("after_words", ANIM_WINDOW_AFTER_DEFAULT), + label="After Words", elem_id="anim-window-after", + interactive=_is_custom, + ) + with gr.Row(elem_id="mega-styling-row"): + anim_word_spacing_slider = gr.Slider( + minimum=MEGA_WORD_SPACING_MIN, maximum=MEGA_WORD_SPACING_MAX, + step=MEGA_WORD_SPACING_STEP, value=MEGA_WORD_SPACING_DEFAULT, + label="Word Spacing", elem_id="anim-word-spacing", + ) + anim_text_size_slider = gr.Slider( + minimum=MEGA_TEXT_SIZE_MIN, maximum=MEGA_TEXT_SIZE_MAX, + step=MEGA_TEXT_SIZE_STEP, value=MEGA_TEXT_SIZE_DEFAULT, + 
label="Text Size", elem_id="anim-text-size", + ) + anim_line_spacing_slider = gr.Slider( + minimum=MEGA_LINE_SPACING_MIN, maximum=MEGA_LINE_SPACING_MAX, + step=MEGA_LINE_SPACING_STEP, value=MEGA_LINE_SPACING_DEFAULT, + label="Line Spacing", elem_id="anim-line-spacing", + ) + anim_cached_settings = gr.JSON(value=None, visible=False) + with gr.Accordion("Model Settings", open=True): + with gr.Row(): + model_radio = gr.Radio( + choices=["Base", "Large"], + value="Base", + label="ASR Model", + info="Large: more robust to noisy/non-studio recitations but much slower (10x bigger)" + ) + device_radio = gr.Radio( + choices=["GPU", "CPU"], + value="GPU", + label="Device", + info="Daily GPU usage limits. Unlimitted CPU usage but slower" + ) + + # Helper to create segmentation settings (preset buttons + sliders) + def _create_segmentation_settings(id_suffix=""): + """Create preset buttons and sliders. Returns (silence, speech, pad, btn_muj, btn_mur, btn_fast).""" + _default_silence, _default_speech, _default_pad = PRESET_MURATTAL + with gr.Row(): + with gr.Column(scale=1, min_width=0): + btn_muj = gr.Button("Mujawwad (Slow)", size="sm", variant="secondary", + elem_id=f"preset-mujawwad{id_suffix}") + with gr.Column(scale=1, min_width=0): + btn_mur = gr.Button("Murattal (Normal)", size="sm", variant="primary", + elem_id=f"preset-murattal{id_suffix}") + with gr.Column(scale=1, min_width=0): + btn_fast = gr.Button("Hadr (Fast)", size="sm", variant="secondary", + elem_id=f"preset-fast{id_suffix}") + + silence = gr.Slider( + minimum=MIN_SILENCE_MIN, maximum=MIN_SILENCE_MAX, + value=_default_silence, step=MIN_SILENCE_STEP, + label="Min Silence Duration (ms)", + info="Shorter = more segments. Decrease for reciters who have short pauses" + ) + speech = gr.Slider( + minimum=MIN_SPEECH_MIN, maximum=MIN_SPEECH_MAX, + value=_default_speech, step=MIN_SPEECH_STEP, + label="Min Speech Duration (ms)", + info="Speech segments shorter than this are discarded. 
Increase to filter out false detections" + ) + pad = gr.Slider( + minimum=PAD_MIN, maximum=PAD_MAX, + value=_default_pad, step=PAD_STEP, + label="Padding (ms)", + info="Extra audio kept before/after each segment to avoid clipping speech edges" + ) + return silence, speech, pad, btn_muj, btn_mur, btn_fast + + def _wire_presets(btn_muj, btn_mur, btn_fast, silence, speech, pad): + """Wire preset button click handlers to sliders.""" + presets = { + "mujawwad": PRESET_MUJAWWAD, + "murattal": PRESET_MURATTAL, + "fast": PRESET_FAST, + } + + def apply_preset(name): + s, sp, p = presets[name] + return ( + s, sp, p, + gr.update(variant="primary" if name == "mujawwad" else "secondary"), + gr.update(variant="primary" if name == "murattal" else "secondary"), + gr.update(variant="primary" if name == "fast" else "secondary"), + ) + + outputs = [silence, speech, pad, btn_muj, btn_mur, btn_fast] + btn_muj.click(fn=lambda: apply_preset("mujawwad"), inputs=[], outputs=outputs, api_name=False) + btn_mur.click(fn=lambda: apply_preset("murattal"), inputs=[], outputs=outputs, api_name=False) + btn_fast.click(fn=lambda: apply_preset("fast"), inputs=[], outputs=outputs, api_name=False) + + with gr.Accordion("Segmentation Settings", open=True): + min_silence_slider, min_speech_slider, pad_slider, \ + preset_mujawwad, preset_murattal, preset_fast = _create_segmentation_settings() + + _wire_presets(preset_mujawwad, preset_murattal, preset_fast, + min_silence_slider, min_speech_slider, pad_slider) + + # JSON download appears here after extraction + export_file = gr.File(label="📥 Download JSON", visible=True, interactive=False) + + # Right column: Output + with gr.Column(scale=RIGHT_COLUMN_SCALE): + extract_btn = gr.Button("Extract Segments", variant="primary", size="lg") + with gr.Row(elem_id="action-btns-row"): + resegment_toggle_btn = gr.Button( + "Resegment with New Settings", variant="primary", size="lg", visible=False + ) + retranscribe_btn = gr.Button( + "Retranscribe with Large Model", 
variant="primary", size="lg", visible=False
                    )
                with gr.Row(elem_id="ts-row"):
                    compute_ts_btn = gr.Button(
                        "Compute Timestamps", variant="secondary", size="lg", interactive=False, visible=False
                    )
                    compute_ts_progress = gr.HTML(value="", visible=False)
                animate_all_html = gr.HTML(value="", visible=False)

                # Collapsible panel carrying a second copy of the segmentation sliders,
                # used to re-run segmentation from cached VAD data (no heavy recompute).
                with gr.Column(visible=False) as resegment_panel:
                    gr.Markdown(
                        "Uses cached data, skipping the heavy computation, "
                        "so it's much faster. Useful if results are over-segmented "
                        "or under-segmented"
                    )
                    rs_silence, rs_speech, rs_pad, \
                    rs_btn_muj, rs_btn_mur, rs_btn_fast = _create_segmentation_settings(id_suffix="-rs")
                    resegment_btn = gr.Button("Resegment", variant="primary", size="lg")

                    _wire_presets(rs_btn_muj, rs_btn_mur, rs_btn_fast,
                                  rs_silence, rs_speech, rs_pad)

                # NOTE(review): this placeholder literal appears to have contained HTML
                # markup that was stripped during extraction — confirm against the
                # original source before shipping.
                output_html = gr.HTML(
                    value='Upload audio and click "Extract Segments" to begin',
                    elem_classes=["output-html"]
                )
                # Hidden JSON output for API consumers
                output_json = gr.JSON(visible=False, label="JSON Output")

        # State components for caching VAD data between runs
        cached_speech_intervals = gr.State(value=None)
        cached_is_complete = gr.State(value=None)
        cached_audio = gr.State(value=None)
        cached_sample_rate = gr.State(value=None)
        cached_intervals = gr.State(value=None)  # cleaned (start,end) list from last run
        cached_model_name = gr.State(value=None)  # model used in last run (for retranscribe label)
        cached_segment_dir = gr.State(value=None)  # segment audio dir from last run (for MFA timestamps)
        cached_log_row = gr.State(value=None)  # usage log row dict (mutated in-place before push)

        # Event handlers
        # D. Clear everything when new audio is uploaded/recorded
        # NOTE(review): this literal likely contained stripped HTML markup too — confirm.
        _empty_placeholder = 'Upload audio and click "Extract Segments" to begin'
        # The lambda returns 11 data values (placeholder + 10 Nones) followed by
        # 7 visibility updates — 18 values matching the 18 outputs below.
        audio_input.change(
            fn=lambda: (
                _empty_placeholder, None, None,
                None, None, None, None, None, None, None, None,
                gr.update(visible=True),  # show extract_btn
                gr.update(visible=False, interactive=False, variant="secondary"),  # hide+reset compute_ts_btn
                gr.update(visible=False),  # hide compute_ts_progress
                gr.update(visible=False),  # hide animate_all_html
                gr.update(visible=False),  # hide resegment_toggle_btn
                gr.update(visible=False),  # hide retranscribe_btn
                gr.update(visible=False),  # hide resegment_panel
            ),
            inputs=[],
            outputs=[
                output_html, output_json, export_file,
                cached_speech_intervals, cached_is_complete, cached_audio, cached_sample_rate,
                cached_intervals, cached_model_name, cached_segment_dir, cached_log_row,
                extract_btn, compute_ts_btn, compute_ts_progress, animate_all_html,
                resegment_toggle_btn, retranscribe_btn, resegment_panel,
            ],
            api_name=False, show_progress="hidden"
        )

        # Example recitation buttons — load a bundled recording and force GPU mode.
        btn_ex_112.click(fn=lambda: ("data/112.mp3", "GPU"), inputs=[], outputs=[audio_input, device_radio], api_name=False)
        btn_ex_84.click(fn=lambda: ("data/84.mp3", "GPU"), inputs=[], outputs=[audio_input, device_radio], api_name=False)
        btn_ex_7.click(fn=lambda: ("data/7.mp3", "GPU"), inputs=[], outputs=[audio_input, device_radio], api_name=False)
        btn_ex_juz30.click(fn=lambda: ("data/Juz' 30.mp3", "GPU"), inputs=[], outputs=[audio_input, device_radio], api_name=False)

        # A. 
Extract button click chain
        # Three-step chain: heavy processing → refresh JSON download → swap
        # button visibility/labels and sync slider values into the resegment panel.
        extract_btn.click(
            fn=process_audio,
            inputs=[
                audio_input,
                min_silence_slider,
                min_speech_slider,
                pad_slider,
                model_radio,
                device_radio
            ],
            outputs=[
                output_html, output_json,
                cached_speech_intervals, cached_is_complete,
                cached_audio, cached_sample_rate,
                cached_intervals, cached_segment_dir,
                cached_log_row,
            ],
            api_name=False, show_progress="minimal"
        ).then(
            fn=save_json_export,
            inputs=[output_json],
            outputs=[export_file],
            show_progress="hidden"
        ).then(
            fn=lambda silence, speech, pad, model: (
                gr.update(visible=False),  # hide extract_btn
                gr.update(visible=True, interactive=True, variant="primary"),  # show+enable compute_ts_btn
                gr.update(visible=True),  # show resegment_toggle_btn
                gr.update(  # show retranscribe_btn with opposite model label
                    visible=True,
                    value=f"Retranscribe with {'Large' if model == 'Base' else 'Base'} Model"
                ),
                silence, speech, pad,  # sync slider values to resegment panel
                model,  # store in cached_model_name
            ),
            inputs=[min_silence_slider, min_speech_slider, pad_slider, model_radio],
            outputs=[extract_btn, compute_ts_btn, resegment_toggle_btn, retranscribe_btn,
                     rs_silence, rs_speech, rs_pad, cached_model_name],
            api_name=False, show_progress="hidden"
        )

        # A2. Compute Timestamps — MFA forced alignment adds data-start/data-end to word spans
        compute_ts_btn.click(
            fn=compute_mfa_timestamps,
            inputs=[output_html, output_json, cached_segment_dir, cached_log_row],
            outputs=[output_html, compute_ts_btn, animate_all_html, compute_ts_progress, output_json],
            api_name=False, show_progress="hidden"
        ).then(
            fn=save_json_export,
            inputs=[output_json],
            outputs=[export_file],
            show_progress="hidden"
        )

        # B. 
Toggle resegment panel visibility
        # Server-side state mirrors the panel's visibility so the toggle button
        # can flip it without inspecting the DOM.
        _resegment_panel_visible = gr.State(value=False)

        def _toggle_resegment_panel(currently_visible):
            """Flip the resegment panel; returns (visibility update, new state)."""
            new_visible = not currently_visible
            return gr.update(visible=new_visible), new_visible

        resegment_toggle_btn.click(
            fn=_toggle_resegment_panel,
            inputs=[_resegment_panel_visible],
            outputs=[resegment_panel, _resegment_panel_visible],
            api_name=False, show_progress="hidden"
        )

        # C. Resegment button click chain
        # Re-runs segmentation from cached VAD data, then collapses the panel,
        # refreshes the JSON export, and syncs sliders/buttons back to the main panel.
        resegment_btn.click(
            fn=resegment_audio,
            inputs=[
                cached_speech_intervals, cached_is_complete,
                cached_audio, cached_sample_rate,
                rs_silence, rs_speech, rs_pad,
                model_radio, device_radio,
                cached_log_row,
            ],
            outputs=[
                output_html, output_json,
                cached_speech_intervals, cached_is_complete,
                cached_audio, cached_sample_rate,
                cached_intervals, cached_segment_dir,
                cached_log_row,
            ],
            api_name=False, show_progress="minimal"
        ).then(
            fn=lambda: (gr.update(visible=False), False),
            inputs=[],
            outputs=[resegment_panel, _resegment_panel_visible],
            api_name=False, show_progress="hidden"
        ).then(
            fn=save_json_export,
            inputs=[output_json],
            outputs=[export_file],
            show_progress="hidden"
        ).then(
            fn=lambda silence, speech, pad, model: (
                silence, speech, pad,  # sync sliders back to main panel
                model,  # update cached_model_name to model_radio
                gr.update(visible=True, interactive=True, variant="primary"),  # show+re-enable compute_ts_btn
                gr.update(visible=False),  # hide animate_all_html (new segments, no timestamps)
                gr.update(  # re-show retranscribe with opposite label
                    visible=True,
                    value=f"Retranscribe with {'Large' if model == 'Base' else 'Base'} Model"
                ),
            ),
            inputs=[rs_silence, rs_speech, rs_pad, model_radio],
            outputs=[min_silence_slider, min_speech_slider, pad_slider,
                     cached_model_name, compute_ts_btn, animate_all_html, retranscribe_btn],
            api_name=False, show_progress="hidden"
        )

        # D. 
Retranscribe button click chain
        # Re-runs ASR on the cached segment intervals with the other model size,
        # then refreshes the JSON export and flips cached_model_name.
        retranscribe_btn.click(
            fn=_retranscribe_wrapper,
            inputs=[
                cached_intervals, cached_audio, cached_sample_rate,
                cached_speech_intervals, cached_is_complete,
                cached_model_name, device_radio,
                cached_log_row,
                min_silence_slider, min_speech_slider, pad_slider,
            ],
            outputs=[
                output_html, output_json,
                cached_speech_intervals, cached_is_complete,
                cached_audio, cached_sample_rate,
                cached_intervals, cached_segment_dir,
                cached_log_row,
            ],
            api_name=False, show_progress="minimal"
        ).then(
            fn=save_json_export,
            inputs=[output_json],
            outputs=[export_file],
            show_progress="hidden"
        ).then(
            fn=lambda model_name: (
                gr.update(visible=False),  # hide retranscribe_btn
                gr.update(visible=True, interactive=True, variant="primary"),  # show+re-enable compute_ts_btn
                gr.update(visible=False),  # hide animate_all_html (new segments, no timestamps)
                "Large" if model_name == "Base" else "Base",  # update cached_model_name to opposite
            ),
            inputs=[cached_model_name],
            outputs=[retranscribe_btn, compute_ts_btn, animate_all_html, cached_model_name],
            api_name=False, show_progress="hidden"
        )

        # E. Animation granularity change handler — update JS global (client-side only)
        # fn=None: no server round-trip; the js hook mutates window globals and
        # toggles the `anim-chars` class on active cards.
        anim_granularity_radio.change(
            fn=None,
            inputs=[anim_granularity_radio],
            outputs=[],
            api_name=False, show_progress="hidden",
            js="""(g) => {
                window.ANIM_GRANULARITY = g;
                document.querySelectorAll('.segment-card').forEach(card => {
                    if (card.querySelector('.animate-btn.active')) {
                        if (g === 'Characters') {
                            card.classList.add('anim-chars');
                        } else {
                            card.classList.remove('anim-chars');
                        }
                    }
                });
                // Also update mega card if Animate All is active
                var mega = document.querySelector('.mega-card');
                if (mega) {
                    if (g === 'Characters') {
                        mega.classList.add('anim-chars');
                    } else {
                        mega.classList.remove('anim-chars');
                    }
                }
                // Update slider labels based on granularity
                var unit = g === 'Characters' ? 
'Characters' : 'Words';
                var prevEl = document.getElementById('anim-window-prev');
                if (prevEl) {
                    var lbl = prevEl.querySelector('label span, label');
                    if (lbl) lbl.textContent = 'Previous ' + unit;
                }
                var afterEl = document.getElementById('anim-window-after');
                if (afterEl) {
                    var lbl = afterEl.querySelector('label span, label');
                    if (lbl) lbl.textContent = 'After ' + unit;
                }
                saveAnimSettings();
            }"""
        )

        # F. Animation display mode change handler — apply preset values + toggle slider interactivity
        def _on_mode_change(mode, verse_on, op_prev, op_after, w_prev, w_after):
            """Sync slider values/interactivity when the display mode changes.

            The js= hook below runs in the browser first and returns (possibly
            preset-substituted) values that Gradio then passes to this function.
            Sliders stay editable only in Custom mode; the two window sliders are
            additionally locked while verse mode is on.
            """
            preset = ANIM_PRESETS.get(mode)
            is_custom = not preset
            return (
                gr.update(value=op_prev, interactive=is_custom),
                gr.update(value=op_after, interactive=is_custom),
                gr.update(value=w_prev, interactive=is_custom and not verse_on),
                gr.update(value=w_after, interactive=is_custom and not verse_on),
            )

        anim_mode_radio.change(
            fn=_on_mode_change,
            inputs=[anim_mode_radio, anim_verse_checkbox,
                    anim_opacity_prev_slider, anim_opacity_after_slider,
                    anim_window_prev_slider, anim_window_after_slider],
            outputs=[anim_opacity_prev_slider, anim_opacity_after_slider, anim_window_prev_slider, anim_window_after_slider],
            api_name=False, show_progress="hidden",
            js="""(mode, verseOn, opPrev, opAfter, wPrev, wAfter) => {
                // Save current Custom values before switching away
                var prevMode = window.ANIM_DISPLAY_MODE;
                if (prevMode === 'Custom') {
                    saveAnimSettings();
                }
                window.ANIM_DISPLAY_MODE = mode;
                var preset = window.ANIM_PRESETS[mode];
                if (preset) {
                    window.ANIM_OPACITY_PREV = preset.prev_opacity;
                    window.ANIM_OPACITY_AFTER = preset.after_opacity;
                    window.ANIM_WINDOW_PREV = preset.prev_words;
                    window.ANIM_WINDOW_AFTER = preset.after_words;
                    opPrev = preset.prev_opacity;
                    opAfter = preset.after_opacity;
                    wPrev = preset.prev_words;
                    wAfter = preset.after_words;
                } else {
                    // Entering Custom: restore saved Custom values from localStorage
                    var s = 
loadAnimSettings();
                    if (s && s.custom) {
                        window.ANIM_OPACITY_PREV = s.custom.prevOpacity;
                        window.ANIM_OPACITY_AFTER = s.custom.afterOpacity;
                        window.ANIM_WINDOW_PREV = s.custom.prevWords;
                        window.ANIM_WINDOW_AFTER = s.custom.afterWords;
                        opPrev = s.custom.prevOpacity;
                        opAfter = s.custom.afterOpacity;
                        wPrev = s.custom.prevWords;
                        wAfter = s.custom.afterWords;
                    }
                }
                rebuildWindowGradient();
                reapplyWindowNow();
                updateWindowMaxLabel('anim-window-prev', window.ANIM_WINDOW_PREV, window.ANIM_WINDOW_PREV_MAX);
                updateWindowMaxLabel('anim-window-after', window.ANIM_WINDOW_AFTER, window.ANIM_WINDOW_AFTER_MAX);
                saveAnimSettings();
                return [mode, verseOn, opPrev, opAfter, wPrev, wAfter];
            }"""
        )

        # G. Before/After opacity slider change handlers
        # Client-side only (fn=None): mutate window globals and re-render the gradient.
        anim_opacity_prev_slider.change(
            fn=None,
            inputs=[anim_opacity_prev_slider],
            outputs=[],
            api_name=False, show_progress="hidden",
            js="(val) => { window.ANIM_OPACITY_PREV = val; rebuildWindowGradient(); reapplyWindowNow(); window._windowSettingsVersion++; saveAnimSettings(); }"
        )
        anim_opacity_after_slider.change(
            fn=None,
            inputs=[anim_opacity_after_slider],
            outputs=[],
            api_name=False, show_progress="hidden",
            js="(val) => { window.ANIM_OPACITY_AFTER = val; rebuildWindowGradient(); reapplyWindowNow(); window._windowSettingsVersion++; saveAnimSettings(); }"
        )

        # G2. 
Prev/After word count slider change handlers
        # Client-side only: update the window size globals and refresh the max labels.
        anim_window_prev_slider.change(
            fn=None,
            inputs=[anim_window_prev_slider],
            outputs=[],
            api_name=False, show_progress="hidden",
            js="""(val) => {
                window.ANIM_WINDOW_PREV = val;
                rebuildWindowGradient(); reapplyWindowNow();
                updateWindowMaxLabel('anim-window-prev', val, window.ANIM_WINDOW_PREV_MAX);
                window._windowSettingsVersion++;
                saveAnimSettings();
            }"""
        )
        anim_window_after_slider.change(
            fn=None,
            inputs=[anim_window_after_slider],
            outputs=[],
            api_name=False, show_progress="hidden",
            js="""(val) => {
                window.ANIM_WINDOW_AFTER = val;
                rebuildWindowGradient(); reapplyWindowNow();
                updateWindowMaxLabel('anim-window-after', val, window.ANIM_WINDOW_AFTER_MAX);
                window._windowSettingsVersion++;
                saveAnimSettings();
            }"""
        )

        # G3. Verse checkbox change handler
        def _on_verse_toggle(verse_on, mode):
            """Lock the window sliders while verse mode is on (Custom mode only)."""
            if mode != "Custom":
                # Presets already control interactivity; leave sliders untouched.
                return gr.update(), gr.update()
            return (
                gr.update(interactive=not verse_on),
                gr.update(interactive=not verse_on),
            )

        anim_verse_checkbox.change(
            fn=_on_verse_toggle,
            inputs=[anim_verse_checkbox, anim_mode_radio],
            outputs=[anim_window_prev_slider, anim_window_after_slider],
            api_name=False, show_progress="hidden",
            js="""(val, mode) => {
                window.ANIM_VERSE_MODE = val;
                reapplyWindowNow();
                window._windowSettingsVersion++;
                saveAnimSettings();
                return [val, mode];
            }"""
        )

        # H. Word spacing slider change handler
        anim_word_spacing_slider.change(
            fn=None, inputs=[anim_word_spacing_slider], outputs=[],
            api_name=False, show_progress="hidden",
            js="(val) => { var m=document.querySelector('.mega-card'); if(m) m.style.wordSpacing=val+'em'; saveAnimSettings(); }"
        )

        # J. 
Text size slider change handler
        anim_text_size_slider.change(
            fn=None, inputs=[anim_text_size_slider], outputs=[],
            api_name=False, show_progress="hidden",
            js="(val) => { var m=document.querySelector('.mega-card'); if(m) m.style.fontSize=val+'px'; saveAnimSettings(); }"
        )

        # K. Line spacing slider change handler
        anim_line_spacing_slider.change(
            fn=None, inputs=[anim_line_spacing_slider], outputs=[],
            api_name=False, show_progress="hidden",
            js="(val) => { var m=document.querySelector('.mega-card'); if(m) m.style.lineHeight=val; saveAnimSettings(); }"
        )

        # L. Active color picker change handler — update CSS variable (client-side only)
        anim_color_picker.change(
            fn=None,
            inputs=[anim_color_picker],
            outputs=[],
            api_name=False, show_progress="hidden",
            js="(val) => { document.documentElement.style.setProperty('--anim-word-color', val); saveAnimSettings(); }"
        )

        # M. Restore animation settings from localStorage on page load
        def _restore_anim_settings(cached):
            """Restore animation settings from localStorage via hidden JSON bridge.

            `cached` is the settings object returned by the page-load js hook
            (None/empty on a first visit). Returns exactly 11 gr.update values,
            one per component in app.load's outputs list below, resolving
            opacity/window values from the active preset, the saved Custom
            values, or the config defaults — in that order.
            """
            if not cached:
                return (gr.update(),) * 11  # No saved settings — keep defaults
            mode = cached.get("mode", ANIM_DISPLAY_MODE_DEFAULT)
            preset = ANIM_PRESETS.get(mode)
            is_custom = not preset
            verse_on = bool(cached.get("verseOnly", False))
            if preset:
                op_prev = preset["prev_opacity"]
                op_after = preset["after_opacity"]
                w_prev = preset["prev_words"]
                w_after = preset["after_words"]
            elif cached.get("custom"):
                c = cached["custom"]
                op_prev = c.get("prevOpacity", ANIM_OPACITY_PREV_DEFAULT)
                op_after = c.get("afterOpacity", ANIM_OPACITY_AFTER_DEFAULT)
                w_prev = c.get("prevWords", ANIM_WINDOW_PREV_DEFAULT)
                w_after = c.get("afterWords", ANIM_WINDOW_AFTER_DEFAULT)
            else:
                op_prev = ANIM_OPACITY_PREV_DEFAULT
                op_after = ANIM_OPACITY_AFTER_DEFAULT
                w_prev = ANIM_WINDOW_PREV_DEFAULT
                w_after = ANIM_WINDOW_AFTER_DEFAULT
            return (
                gr.update(value=cached.get("granularity", ANIM_GRANULARITY_DEFAULT)),
                gr.update(value=mode),
                gr.update(value=verse_on),
                gr.update(value=cached.get("color", ANIM_WORD_COLOR)),
                gr.update(value=op_prev, interactive=is_custom),
                gr.update(value=op_after, interactive=is_custom),
                gr.update(value=w_prev, interactive=is_custom and not verse_on),
                gr.update(value=w_after, interactive=is_custom and not verse_on),
                gr.update(value=cached.get("wordSpacing", MEGA_WORD_SPACING_DEFAULT)),
                gr.update(value=cached.get("textSize", MEGA_TEXT_SIZE_DEFAULT)),
                gr.update(value=cached.get("lineSpacing", MEGA_LINE_SPACING_DEFAULT)),
            )

        app.load(
            fn=_restore_anim_settings,
            inputs=[anim_cached_settings],
            outputs=[
                anim_granularity_radio, anim_mode_radio, anim_verse_checkbox,
                anim_color_picker,
                anim_opacity_prev_slider, anim_opacity_after_slider,
                anim_window_prev_slider, anim_window_after_slider,
                anim_word_spacing_slider, anim_text_size_slider, anim_line_spacing_slider,
            ],
            show_progress="hidden",
            js="""(ignored) => {
                var s = loadAnimSettings();
                if (s && s.color) document.documentElement.style.setProperty('--anim-word-color', s.color);
                // Update window max labels and slider labels after Gradio renders
                if (s) setTimeout(function() {
                    updateWindowMaxLabel('anim-window-prev', window.ANIM_WINDOW_PREV, window.ANIM_WINDOW_PREV_MAX);
                    updateWindowMaxLabel('anim-window-after', window.ANIM_WINDOW_AFTER, window.ANIM_WINDOW_AFTER_MAX);
                    if (s.granularity === 'Characters') {
                        var prevEl = document.getElementById('anim-window-prev');
                        if (prevEl) { var lbl = prevEl.querySelector('label span, label'); if (lbl) lbl.textContent = 'Previous Characters'; }
                        var afterEl = document.getElementById('anim-window-after');
                        if (afterEl) { var lbl = afterEl.querySelector('label span, label'); if (lbl) lbl.textContent = 'After Characters'; }
                    }
                }, 200);
                return s;
            }"""
        )

        # Hidden API-only endpoint for JSON output
        gr.Button(visible=False).click(
            fn=process_audio_json,
inputs=[audio_input, min_silence_slider, min_speech_slider, pad_slider, model_radio, device_radio], + outputs=[output_json], + api_name="process_audio_json" + ) + + + return app + + +# ============================================================================= +# Module-level demo for Gradio hot-reload (`gradio app.py`) +# ============================================================================= +demo = build_interface() + +# ============================================================================= +# Main +# ============================================================================= + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--share", action="store_true", help="Create public link") + parser.add_argument("--port", type=int, default=PORT, help="Port to run on") + args = parser.parse_args() + + port = 7860 + + print(f"ZeroGPU available: {ZERO_GPU_AVAILABLE}") + print(f"Launching Gradio on port {port}") + + # Preload models and caches at startup so first request is fast + from src.segment_processor import load_segmenter + from src.alignment.phoneme_asr import load_phoneme_asr + from src.alignment.ngram_index import get_ngram_index + from src.alignment.phoneme_matcher_cache import preload_all_chapters + print("Preloading models...") + load_segmenter() + load_phoneme_asr("Base") + load_phoneme_asr("Large") + print("Models preloaded.") + print("Preloading caches...") + get_ngram_index() + preload_all_chapters() + print("Caches preloaded.") + + # Warm up soxr resampler so first request doesn't pay initialization cost + import librosa + _dummy = librosa.resample(np.zeros(1600, dtype=np.float32), orig_sr=44100, target_sr=16000, res_type=RESAMPLE_TYPE) + del _dummy + print("Resampler warmed up.") + + # AoT compilation for VAD model (requires GPU lease) + if IS_HF_SPACE and ZERO_GPU_AVAILABLE: + print("Running AoT compilation for VAD model...") + try: + aoti_result = test_aoti_compilation_gpu() + 
print(f"AoT compile result: {aoti_result}") + # Apply compiled model OUTSIDE GPU lease (critical for persistence) + if aoti_result.get("compiled"): + apply_aoti_compiled(aoti_result["compiled"]) + except Exception as e: + print(f"AoT compilation failed (non-fatal): {e}") + + demo.launch( + server_name="0.0.0.0", + server_port=port, + share=args.share, + allowed_paths=["/tmp"], + )