Spaces:
Running on Zero
Running on Zero
| """Segment rendering and text formatting helpers.""" | |
| import json | |
| import time | |
| import unicodedata | |
| from config import ( | |
| CONFIDENCE_HIGH, CONFIDENCE_MED, | |
| REVIEW_SUMMARY_MAX_SEGMENTS, | |
| SURAH_INFO_PATH, | |
| ) | |
| from src.core.segment_types import SegmentInfo | |
| from src.alignment.special_segments import ALL_SPECIAL_REFS | |
def format_timestamp(seconds: float) -> str:
    """Format a time offset in seconds as ``M:SS.s``.

    Minutes are not zero-padded; seconds are zero-padded to two integer
    digits and shown with one decimal place (e.g. ``65.3`` -> ``"1:05.3"``).

    The seconds component is rounded to one decimal *before* formatting so
    that a value such as ``59.96`` carries into the minutes (``"1:00.0"``)
    instead of being rendered as the invalid ``"0:60.0"``.
    """
    minutes = int(seconds // 60)
    secs = round(seconds % 60, 1)
    if secs >= 60:
        # Rounding pushed the seconds up to a whole minute: carry it over.
        minutes += 1
        secs = 0.0
    return f"{minutes}:{secs:04.1f}"
def get_confidence_class(score: float) -> str:
    """Return the CSS class for the confidence tier that *score* falls into.

    Tiers are checked from highest to lowest: scores at or above
    ``CONFIDENCE_HIGH`` map to ``"segment-high"``, scores at or above
    ``CONFIDENCE_MED`` map to ``"segment-med"``, and everything else maps
    to ``"segment-low"``.
    """
    tiers = (
        (CONFIDENCE_HIGH, "segment-high"),
        (CONFIDENCE_MED, "segment-med"),
    )
    for threshold, css_class in tiers:
        if score >= threshold:
            return css_class
    return "segment-low"
def get_segment_word_stats(matched_ref: str) -> tuple[int, int]:
    """Return ``(word_count, ayah_span)`` for a ranged ref.

    The ref is expected to look like ``"surah:ayah:word-surah:ayah:word"``.
    ``ayah_span`` is 1 for a same-verse ref, the inclusive ayah distance when
    both endpoints are in the same surah, and 2 when the surahs differ.
    ``word_count`` comes from the Quran index and is 0 when the index cannot
    resolve the ref.

    Any ref that is empty, has no ``-``, has too few ``:`` fields, or triggers
    an error along the way yields the fallback ``(0, 1)``.
    """
    fallback = (0, 1)
    if not matched_ref or "-" not in matched_ref:
        return fallback

    try:
        first, last = (endpoint.split(":") for endpoint in matched_ref.split("-", 1))
        if min(len(first), len(last)) < 3:
            return fallback

        first_surah, first_ayah = int(first[0]), int(first[1])
        last_surah, last_ayah = int(last[0]), int(last[1])
        if (first_surah, first_ayah) == (last_surah, last_ayah):
            ayah_span = 1
        elif first_surah == last_surah:
            ayah_span = abs(last_ayah - first_ayah) + 1
        else:
            ayah_span = 2

        # Imported lazily, inside the try, so an import problem also falls
        # back to (0, 1) like every other failure here.
        from src.core.quran_index import get_quran_index

        bounds = get_quran_index().ref_to_indices(matched_ref)
        word_count = bounds[1] - bounds[0] + 1 if bounds else 0
        return word_count, ayah_span
    except Exception:
        # Best-effort helper: never propagate parsing/index errors.
        return fallback
# ASCII digit -> Arabic-Indic digit, used for verse markers
ARABIC_DIGITS = {
    '0': '٠', '1': '١', '2': '٢', '3': '٣', '4': '٤',
    '5': '٥', '6': '٦', '7': '٧', '8': '٨', '9': '٩',
}

# Translation table built once from ARABIC_DIGITS for use with str.translate
_ARABIC_DIGIT_TABLE = str.maketrans(ARABIC_DIGITS)


def to_arabic_numeral(number: int) -> str:
    """Convert an integer to its Arabic-Indic numeral representation.

    Each ASCII digit of ``str(number)`` is replaced by the matching
    Arabic-Indic digit. Any other character (such as the ``-`` of a negative
    number) is passed through unchanged instead of raising ``KeyError``.
    """
    return str(number).translate(_ARABIC_DIGIT_TABLE)
def format_verse_marker(verse_num: int) -> str:
    """Build the end-of-ayah marker string for a verse number.

    The result is U+06DD (Arabic End of Ayah) immediately followed by the
    verse number in Arabic-Indic digits. Per the original note, DigitalKhatt
    renders this sequence as a single decorated glyph.
    """
    return '\u06DD' + to_arabic_numeral(verse_num)
# Cached verse word counts from surah_info.json: {surah: {verse: num_words}}
_verse_word_counts_cache: dict[int, dict[int, int]] | None = None


def _load_verse_word_counts() -> dict[int, dict[int, int]]:
    """Load and cache per-verse word counts from ``SURAH_INFO_PATH``.

    The JSON file is read as a mapping of surah number (string key) to an
    object whose ``verses`` list holds entries with ``verse`` and
    ``num_words``. Entries with a missing/zero ``verse`` are skipped and a
    missing ``num_words`` is stored as 0.

    Returns:
        ``{surah: {verse: num_words}}``. The same dict object is returned on
        every later call.

    Raises:
        Whatever ``open`` / ``json.load`` / ``int`` raise on a missing or
        malformed file. The cache is only published after the whole file has
        been parsed, so a failed load never leaves a partial cache behind.
    """
    global _verse_word_counts_cache
    if _verse_word_counts_cache is not None:
        return _verse_word_counts_cache

    with open(SURAH_INFO_PATH, 'r', encoding='utf-8') as f:
        surah_info = json.load(f)

    # Build into a local dict first; assign to the global only on success.
    counts: dict[int, dict[int, int]] = {}
    for surah_num, data in surah_info.items():
        per_verse: dict[int, int] = {}
        counts[int(surah_num)] = per_verse
        for verse_data in data.get('verses', []):
            verse_num = verse_data.get('verse')
            if verse_num:
                per_verse[verse_num] = verse_data.get('num_words', 0)

    _verse_word_counts_cache = counts
    return counts
| def _parse_ref_endpoints(matched_ref: str): | |
| """Parse ref like '2:255:1-2:255:5' into (surah, ayah, word_from, word_to). | |
| Returns None for cross-verse refs or unparseable strings. | |
| """ | |
| if not matched_ref or "-" not in matched_ref: | |
| return None | |
| try: | |
| start_ref, end_ref = matched_ref.split("-", 1) | |
| sp = start_ref.split(":") | |
| ep = end_ref.split(":") | |
| if len(sp) < 3 or len(ep) < 3: | |
| return None | |
| s_surah, s_ayah, s_word = int(sp[0]), int(sp[1]), int(sp[2]) | |
| e_surah, e_ayah, e_word = int(ep[0]), int(ep[1]), int(ep[2]) | |
| # Only handle same-verse refs | |
| if s_surah != e_surah or s_ayah != e_ayah: | |
| return None | |
| return (s_surah, s_ayah, s_word, e_word) | |
| except (ValueError, IndexError): | |
| return None | |
| def _parse_ref_verse_ranges(matched_ref: str) -> list[tuple[int, int, int, int]]: | |
| """Decompose a ref into per-verse (surah, ayah, word_from, word_to) ranges. | |
| Handles same-verse refs like '2:255:1-2:255:5' and cross-verse refs | |
| like '76:1:11-76:2:7'. Returns empty list for special/unparseable refs. | |
| """ | |
| if not matched_ref or "-" not in matched_ref: | |
| return [] | |
| try: | |
| start_ref, end_ref = matched_ref.split("-", 1) | |
| sp = start_ref.split(":") | |
| ep = end_ref.split(":") | |
| if len(sp) < 3 or len(ep) < 3: | |
| return [] | |
| s_surah, s_ayah, s_word = int(sp[0]), int(sp[1]), int(sp[2]) | |
| e_surah, e_ayah, e_word = int(ep[0]), int(ep[1]), int(ep[2]) | |
| except (ValueError, IndexError): | |
| return [] | |
| if s_surah != e_surah: | |
| return [] # cross-surah not expected | |
| surah = s_surah | |
| if s_ayah == e_ayah: | |
| return [(surah, s_ayah, s_word, e_word)] | |
| # Cross-verse: decompose into per-verse ranges | |
| verse_wc = _load_verse_word_counts() | |
| ranges = [] | |
| for ayah in range(s_ayah, e_ayah + 1): | |
| expected = verse_wc.get(surah, {}).get(ayah, 0) | |
| if expected == 0: | |
| continue | |
| if ayah == s_ayah: | |
| ranges.append((surah, ayah, s_word, expected)) | |
| elif ayah == e_ayah: | |
| ranges.append((surah, ayah, 1, e_word)) | |
| else: | |
| ranges.append((surah, ayah, 1, expected)) | |
| return ranges | |
def recompute_missing_words(segments: list) -> None:
    """Recompute ``has_missing_words`` on every segment from word-coverage gaps.

    Every segment's ``matched_ref`` is decomposed (via
    ``_parse_ref_verse_ranges``) into per-verse word ranges. For each covered
    verse with a known word count, a segment is flagged when there is
    uncovered text next to its range:

    * words before the first covered word of the verse,
    * words between two covered ranges,
    * words after the last covered word of the verse.

    In addition, when whole verses are missing between two covered verses of
    the same surah, the segment ending the earlier verse and the segment
    starting the later verse are both flagged.

    Coverage is tracked as the furthest word reached so far, so a range that
    lies entirely inside an earlier, wider range (e.g. a repeated phrase) is
    not mistaken for the edge of a gap.

    Args:
        segments: Objects exposing ``matched_ref`` and a writable
            ``has_missing_words`` attribute. Modified in place.
    """
    verse_wc = _load_verse_word_counts()

    # Reset all flags
    for seg in segments:
        seg.has_missing_words = False

    # Build per-verse coverage: {(surah, ayah): [(word_from, word_to, seg_idx), ...]}
    coverage: dict[tuple[int, int], list[tuple[int, int, int]]] = {}
    for i, seg in enumerate(segments):
        for surah, ayah, wf, wt in _parse_ref_verse_ranges(seg.matched_ref):
            coverage.setdefault((surah, ayah), []).append((wf, wt, i))

    # Check each verse for gaps
    for (surah, ayah), entries in coverage.items():
        expected = verse_wc.get(surah, {}).get(ayah, 0)
        if expected == 0:
            continue
        entries.sort()  # sort by word_from

        first_from, covered_to, covered_idx = entries[0]

        # Gap at start of verse
        if first_from > 1:
            segments[covered_idx].has_missing_words = True

        # Gaps between covered ranges. `covered_to` is the furthest word
        # reached so far and `covered_idx` the segment that reached it.
        for wf, wt, idx in entries[1:]:
            if wf > covered_to + 1:
                segments[covered_idx].has_missing_words = True
                segments[idx].has_missing_words = True
            if wt > covered_to:
                covered_to, covered_idx = wt, idx

        # Gap at end of verse
        if covered_to < expected:
            segments[covered_idx].has_missing_words = True

    # Check for whole-verse gaps between consecutive covered verses
    by_surah: dict[int, list[int]] = {}
    for surah, ayah in coverage:
        by_surah.setdefault(surah, []).append(ayah)

    for surah, ayahs in by_surah.items():
        ayahs_sorted = sorted(set(ayahs))
        for prev_ayah, next_ayah in zip(ayahs_sorted, ayahs_sorted[1:]):
            if next_ayah > prev_ayah + 1:
                # Whole verse(s) missing between these two covered verses
                prev_entries = coverage[(surah, prev_ayah)]
                next_entries = coverage[(surah, next_ayah)]
                last_in_prev = max(prev_entries, key=lambda e: e[1])[2]
                first_in_next = min(next_entries, key=lambda e: e[0])[2]
                segments[last_in_prev].has_missing_words = True
                segments[first_in_next].has_missing_words = True
def resolve_ref_text(matched_ref: str) -> str:
    """Return the display text that belongs to *matched_ref*.

    Special refs resolve to fixed text: ``"Basmala"`` and ``"Isti'adha"`` use
    the literals below, any other special ref is looked up in
    ``TRANSITION_TEXT`` (falling back to the ref itself). Every other ref is
    resolved through the Quran index and returned as the space-joined
    ``display_text`` of its words, or ``""`` when the index cannot resolve it.
    """
    from src.alignment.special_segments import ALL_SPECIAL_REFS, TRANSITION_TEXT

    fixed_texts = {
        "Basmala": "بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيم",
        "Isti'adha": "أَعُوذُ بِٱللَّهِ مِنَ الشَّيْطَانِ الرَّجِيم",
    }

    if matched_ref in ALL_SPECIAL_REFS:
        if matched_ref in fixed_texts:
            return fixed_texts[matched_ref]
        return TRANSITION_TEXT.get(matched_ref, matched_ref)

    from src.core.quran_index import get_quran_index

    quran = get_quran_index()
    span = quran.ref_to_indices(matched_ref)
    if not span:
        return ""
    first, last = span[0], span[1]
    return " ".join(word.display_text for word in quran.words[first:last + 1])
def split_into_char_groups(text):
    """Split *text* into visible "letters": a base character plus its marks.

    Rules, applied character by character:

    * Tatweel (U+0640) and Word Joiner (U+2060) attach to the current group
      as zero-width visual extensions.
    * Hamza above/below (U+0654/U+0655) always open a new group so MFA can
      assign them separate timestamps.
    * Dagger alef (U+0670) also opens a new group even though it is a
      combining mark.
    * Any other combining mark (Unicode category ``M*``) attaches to the
      current group.
    * Everything else is a base character and opens a new group.

    A mark with nothing before it becomes the start of the first group.
    Returns the list of groups in order; an empty input gives ``[]``.
    """
    extenders = {'\u0640', '\u2060'}
    standalone_marks = {'\u0654', '\u0655', '\u0670'}

    clusters = []
    for char in text:
        attaches = char in extenders or (
            char not in standalone_marks
            and unicodedata.category(char).startswith('M')
        )
        if attaches and clusters:
            clusters[-1] += char
        else:
            clusters.append(char)
    return clusters
# NOTE: despite the name, ZWSP holds U+2060 (WORD JOINER), not U+200B
# (ZERO WIDTH SPACE). Neither constant is referenced in the visible part of
# this module; split_into_char_groups uses the same code points as literals.
ZWSP = '\u2060'  # Word Joiner: zero-width non-breaking (avoids mid-word line breaks)
DAGGER_ALEF = '\u0670'  # Arabic letter superscript alef ("dagger alef")
| def _wrap_word(word_text, pos=None): | |
| """Wrap a word in <span class="word">. Char spans are deferred to MFA timestamp injection.""" | |
| pos_attr = f' data-pos="{pos}"' if pos else '' | |
| return f'<span class="word"{pos_attr}>{word_text}</span>' | |
| def get_text_with_markers(matched_ref: str) -> str | None: | |
| """ | |
| Generate matched text with verse markers inserted at verse boundaries. | |
| Uses position-based detection: iterates words and inserts an HTML marker | |
| after the last word of each verse (matching recitation_app approach). | |
| Args: | |
| matched_ref: Reference like "2:255:1-2:255:5" | |
| Returns: | |
| Text with verse markers, or None if ref is invalid | |
| """ | |
| if not matched_ref: | |
| return None | |
| from src.core.quran_index import get_quran_index | |
| index = get_quran_index() | |
| indices = index.ref_to_indices(matched_ref) | |
| if not indices: | |
| return None | |
| start_idx, end_idx = indices | |
| verse_word_counts = _load_verse_word_counts() | |
| parts = [] | |
| for w in index.words[start_idx:end_idx + 1]: | |
| parts.append(_wrap_word(w.display_text, pos=f"{w.surah}:{w.ayah}:{w.word}")) | |
| # Check if this is the last word of its verse | |
| num_words = verse_word_counts.get(w.surah, {}).get(w.ayah, 0) | |
| if num_words > 0 and w.word == num_words: | |
| parts.append(format_verse_marker(w.ayah)) | |
| return " ".join(parts) | |
def simplify_ref(ref: str) -> str:
    """Shorten a same-verse ref, e.g. ``'84:9:1-84:9:4'`` -> ``'84:9:1-4'``.

    The ref is shortened only when it contains exactly one ``-``, both sides
    have exactly three ``:`` fields, and surah and ayah match on both sides.
    Anything else (including empty or ``None``) is returned unchanged.
    """
    if not ref or ref.count("-") != 1:
        return ref

    start, _, end = ref.partition("-")
    start_fields = start.split(":")
    end_fields = end.split(":")
    same_verse = (
        len(start_fields) == 3
        and len(end_fields) == 3
        and start_fields[:2] == end_fields[:2]
    )
    return f"{start}-{end_fields[2]}" if same_verse else ref
def render_segment_card(seg: SegmentInfo, idx: int, full_audio_url: str = "", render_key: str = "", segment_dir: str = "", in_missing_pair: bool = False) -> str:
    """Render a single segment as an HTML card with optional audio player.

    Args:
        seg: Segment to render. Reads ``matched_ref``, ``matched_text``,
            ``match_score``, ``start_time``, ``end_time``, ``error``,
            ``has_missing_words``, ``has_repeated_words`` and
            ``repeated_ranges``.
        idx: 0-based segment index (shown 1-based in the card title).
        full_audio_url: URL of the full recording; used with a
            ``#t=start,end`` media fragment when ``segment_dir`` is empty.
        render_key: Accepted for call compatibility; not used in the markup.
        segment_dir: Directory with per-segment WAVs (``seg_{idx}.wav``);
            preferred over ``full_audio_url`` when given.
        in_missing_pair: True when the card is rendered inside a
            "missing words" group wrapper, which suppresses the card's own
            missing-words styling and badge.

    Returns:
        The card's HTML. ``seg.error`` is HTML-escaped before being embedded
        so that error text containing ``<``, ``>`` or ``&`` cannot break or
        inject markup.
    """
    from html import escape  # stdlib; imported locally, only needed here

    is_special = seg.matched_ref in ALL_SPECIAL_REFS

    # The badge keeps the score-based colour even when the card itself is
    # restyled as special / repeated / missing below.
    confidence_badge_class = get_confidence_class(seg.match_score)
    confidence_class = confidence_badge_class
    if is_special:
        confidence_class = "segment-special"
    elif seg.has_repeated_words:
        confidence_class = "segment-med"
    elif seg.has_missing_words and not in_missing_pair:
        confidence_class = "segment-low"

    timestamp = f"{format_timestamp(seg.start_time)} - {format_timestamp(seg.end_time)}"
    duration = seg.end_time - seg.start_time

    # Format reference (simplify same-verse refs)
    ref_display = simplify_ref(seg.matched_ref) if seg.matched_ref else ""
    matched_ref_attr = seg.matched_ref or ""

    # Confidence percentage with label
    confidence_pct = f"Confidence: {seg.match_score:.0%}"

    # Missing words badge (only for single-segment cases; pairs use a group wrapper)
    missing_badge = ""
    if seg.has_missing_words and not in_missing_pair:
        missing_badge = '<div class="segment-badge segment-low-badge">Missing Words</div>'

    # Repeated words badge with feedback buttons
    repeated_badge = ""
    if seg.has_repeated_words:
        repeated_badge = (
            f'<div class="repeat-feedback-group" data-segment-idx="{idx}">'
            '<button class="repeat-fb-btn repeat-fb-up" title="Correct">✓</button>'
            '<button class="repeat-fb-btn repeat-fb-down" title="Incorrect">✗</button>'
            '<div class="segment-badge segment-repeated-badge">Repeated Words</div>'
            '</div>'
        )

    # Error display (escaped: error text is not trusted to be valid HTML)
    error_html = ""
    if seg.error:
        error_html = f'<div class="segment-error">{escape(str(seg.error))}</div>'

    # Audio player HTML — per-segment WAV (preferred) or media fragment fallback
    audio_html = ""
    if segment_dir or full_audio_url:
        if segment_dir:
            audio_src = f"/gradio_api/file={segment_dir}/seg_{idx}.wav"
        else:
            audio_src = f"{full_audio_url}#t={seg.start_time:.3f},{seg.end_time:.3f}"

        # Add animate button only if segment has a Quran verse ref (word spans for animation).
        # Basmala/Isti'adha get animate because they have indexed word spans for MFA.
        # Transition segments (Amin, Takbir, Tahmeed) don't.
        animate_btn = ""
        _ANIMATABLE_SPECIALS = {"Basmala", "Isti'adha"}
        if seg.matched_ref and (seg.matched_ref not in ALL_SPECIAL_REFS or seg.matched_ref in _ANIMATABLE_SPECIALS):
            animate_btn = f'<button class="animate-btn" data-segment="{idx}" disabled>Animate</button>'

        audio_html = f'''
        <div class="segment-audio">
            <audio data-src="{audio_src}" preload="none"
                   style="display:none; width: 100%; height: 32px;">
            </audio>
            <button class="play-btn">▶</button>
            {animate_btn}
        </div>
        '''

    # Build matched text with verse markers at all verse boundaries
    BASMALA_TEXT = "بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيم"
    ISTIATHA_TEXT = "أَعُوذُ بِٱللَّهِ مِنَ الشَّيْطَانِ الرَّجِيم"

    if seg.matched_ref:
        # Generate text with markers from the index
        text_html = get_text_with_markers(seg.matched_ref)
        if text_html and seg.matched_text:
            # Check for any special prefix (fused or forward-merged)
            for _sp_name, _sp in [("Isti'adha", ISTIATHA_TEXT),
                                  ("Basmala", BASMALA_TEXT)]:
                if seg.matched_text.startswith(_sp):
                    mfa_prefix = f"{_sp_name}+{seg.matched_ref}"
                    # NOTE(review): as displayed, this replace() maps a space
                    # to a space. The first argument may be a different
                    # whitespace character that is indistinguishable here;
                    # confirm against the source before simplifying.
                    words = _sp.replace(" ", " ").split()
                    prefix_html = " ".join(
                        _wrap_word(w, pos=f"{mfa_prefix}:0:0:{i+1}")
                        for i, w in enumerate(words)
                    )
                    text_html = prefix_html + " " + text_html
                    break
        elif not text_html:
            # Special ref (Basmala/Isti'adha): wrap words with indexed data-pos
            # so MFA timestamps can be injected later
            if seg.matched_text:
                words = seg.matched_text.replace(" \u06dd ", " ").split()
                text_html = " ".join(
                    _wrap_word(w, pos=f"{seg.matched_ref}:0:0:{i+1}")
                    for i, w in enumerate(words)
                )
            else:
                text_html = seg.matched_text or ""
    elif seg.matched_text:
        # Text but no ref: wrap each word in a plain span
        text_html = " ".join(_wrap_word(w) for w in seg.matched_text.split())
    else:
        text_html = ""

    # Rebuild text as reading-order sections when wraps detected
    if seg.repeated_ranges:
        sections = []
        for sec_from, sec_to in seg.repeated_ranges:
            sec = get_text_with_markers(f"{sec_from}-{sec_to}")
            if sec:
                sections.append(sec)
        if sections:
            text_html = '<div class="repeat-divider"></div>'.join(sections)

    if is_special:
        confidence_badge = f'<div class="segment-badge segment-special-badge">{seg.matched_ref}</div>'
    else:
        confidence_badge = f'<div class="segment-badge {confidence_badge_class}-badge">{confidence_pct}</div>'

    # Build inline header: Segment N | ref | duration | time range
    header_parts = [f"Segment {idx + 1}"]
    if ref_display:
        header_parts.append(
            f'<span class="ref-editable" data-segment-idx="{idx}" data-full-ref="{matched_ref_attr}">{ref_display}</span>'
        )
    header_parts.append(f"{duration:.1f}s")
    header_parts.append(timestamp)
    header_text = " | ".join(header_parts)

    card_html = f'''
    <div class="segment-card {confidence_class}" data-duration="{duration:.3f}" data-segment-idx="{idx}" data-matched-ref="{matched_ref_attr}" data-confidence-class="{confidence_badge_class}" data-start-time="{seg.start_time:.4f}" data-end-time="{seg.end_time:.4f}">
        <div class="segment-header">
            <div class="segment-title">{header_text}</div>
            <div class="segment-badges">
                {repeated_badge}
                {missing_badge}
                {confidence_badge}
            </div>
        </div>
        {audio_html}
        <div class="segment-text">
            {text_html}
        </div>
        {error_html}
    </div>
    '''
    return card_html
def render_segments(segments: list, full_audio_url: str = "", segment_dir: str = "") -> str:
    """Render all segments as HTML with optional audio players.

    The output is a ``segments-container`` div holding a summary header
    (total count, segments needing review, segments with missing words,
    segments with repeated words) followed by one card per segment. Two
    adjacent missing-word segments that lie in the same verse are rendered
    together inside a ``missing-words-group`` wrapper.

    Args:
        segments: List of SegmentInfo objects
        full_audio_url: URL to full audio WAV (used by mega card / Animate All)
        segment_dir: Path to segment directory containing per-segment WAV files

    Returns:
        The HTML string, or a "No segments detected" placeholder when
        ``segments`` is empty.
    """
    if not segments:
        return '<div class="no-segments">No segments detected</div>'

    # Generate unique key for this render to prevent audio caching
    render_key = str(int(time.time() * 1000))

    # Categorize segments by confidence level (1-indexed for display), excluding specials
    med_segments = [i + 1 for i, s in enumerate(segments)
                    if CONFIDENCE_MED <= s.match_score < CONFIDENCE_HIGH and s.matched_ref not in ALL_SPECIAL_REFS]
    low_set = {i + 1 for i, s in enumerate(segments)
               if s.match_score < CONFIDENCE_MED and s.matched_ref not in ALL_SPECIAL_REFS}

    # Build header with confidence summary
    header_parts = [f'<div class="segments-header">Found {len(segments)} segments</div>']

    # Combined review summary: merge medium and low confidence segments into one color-coded list
    all_review = sorted(set(med_segments) | low_set)
    if all_review:
        def _span(n: int) -> str:
            css = "segment-low-text" if n in low_set else "segment-med-text"
            return f'<span class="{css}">{n}</span>'

        seg_html = _join_truncated([_span(n) for n in all_review])
        header_parts.append(
            f'<div class="segments-review-summary">'
            f'Needs review: {len(all_review)} (segments {seg_html})'
            f'</div>'
        )

    # Missing-word segments: pairs are computed once and reused for both the
    # summary line and the grouped card rendering below.
    missing_indices = [i for i, s in enumerate(segments) if s.has_missing_words]
    pair_starts = _missing_word_pair_starts(segments, missing_indices)
    if missing_indices:
        pair_seconds = {i + 1 for i in pair_starts}
        missing_labels = []
        for i in missing_indices:
            if i in pair_starts:
                missing_labels.append(f"{i + 1}/{i + 2}")  # 1-based "a/b"
            elif i not in pair_seconds:
                missing_labels.append(str(i + 1))
        pairs_display = _join_truncated(missing_labels)
        header_parts.append(
            f'<div class="segments-review-summary">'
            f'Segments with missing words: <span class="segment-low-text">{len(missing_labels)} (segments {pairs_display})</span>'
            f'</div>'
        )

    repeated_segments = [i + 1 for i, s in enumerate(segments) if s.has_repeated_words]
    if repeated_segments:
        rep_display = _join_truncated([str(n) for n in repeated_segments])
        header_parts.append(
            f'<div class="segments-review-summary">'
            f'Segments with repeated words: <span class="segment-med-text">{len(repeated_segments)} (segments {rep_display})</span>'
            f'</div>'
        )

    html_parts = [
        f'<div class="segments-container" data-render-key="{render_key}" data-full-audio="{full_audio_url}">',
        "\n".join(header_parts),
    ]

    t_cards = time.time()
    skip_next = False
    for idx, seg in enumerate(segments):
        if skip_next:
            # Second card of a pair was already rendered with the first.
            skip_next = False
            continue

        if idx in pair_starts:
            seg_b = segments[idx + 1]
            html_parts.append('<div class="missing-words-group">')
            html_parts.append('<div class="missing-words-group-tag">Missing Words</div>')
            html_parts.append(render_segment_card(seg, idx, full_audio_url, render_key, segment_dir, in_missing_pair=True))
            html_parts.append(render_segment_card(seg_b, idx + 1, full_audio_url, render_key, segment_dir, in_missing_pair=True))
            html_parts.append('</div>')
            skip_next = True
        else:
            html_parts.append(render_segment_card(seg, idx, full_audio_url, render_key, segment_dir))

    html_parts.append('</div>')
    print(f"[PROFILE] Segment cards: {time.time() - t_cards:.3f}s ({len(segments)} cards, HTML only)")
    return "\n".join(html_parts)


def _join_truncated(items: list) -> str:
    """Join *items* with ", ", keeping at most REVIEW_SUMMARY_MAX_SEGMENTS of them and noting how many were left out."""
    if len(items) <= REVIEW_SUMMARY_MAX_SEGMENTS:
        return ", ".join(items)
    remaining = len(items) - REVIEW_SUMMARY_MAX_SEGMENTS
    return ", ".join(items[:REVIEW_SUMMARY_MAX_SEGMENTS]) + f" ... and {remaining} more"


def _missing_word_pair_starts(segments: list, missing_indices: list) -> set:
    """Return the 0-based index of the first segment of every missing-word pair.

    Walks *missing_indices* greedily: two consecutive segment indices form a
    pair only when both refs parse as same-verse refs in the same surah:ayah.
    """
    starts = set()
    k = 0
    while k < len(missing_indices):
        idx = missing_indices[k]
        if k + 1 < len(missing_indices) and missing_indices[k + 1] == idx + 1:
            ref_a = _parse_ref_endpoints(segments[idx].matched_ref)
            ref_b = _parse_ref_endpoints(segments[idx + 1].matched_ref)
            if ref_a and ref_b and ref_a[:2] == ref_b[:2]:
                starts.add(idx)
                k += 2
                continue
        k += 1
    return starts
def is_end_of_verse(matched_ref: str) -> bool:
    """Check if a reference ends at (or past) the last word of its verse.

    Expects formats like "2:255:1-2:255:5" or "2:255:5"; only the part after
    the last ``-`` is inspected.

    Returns False for empty or malformed refs, and also when the surah/ayah
    has no known (non-zero) word count. Without that guard an unknown verse
    would be reported as ended, because any word number is >= 0. Any error
    along the way is printed and reported as False.
    """
    if not matched_ref or ":" not in matched_ref:
        return False

    try:
        # Take the end part of the range (or the single ref)
        end_ref = matched_ref.split("-")[-1]
        parts = end_ref.split(":")
        if len(parts) < 3:
            return False

        surah, ayah, word = int(parts[0]), int(parts[1]), int(parts[2])

        num_words = _load_verse_word_counts().get(surah, {}).get(ayah, 0)
        if not num_words:
            # Unknown surah/ayah: the verse length is unknown, so the end
            # of the verse cannot be confirmed.
            return False
        return word >= num_words
    except Exception as e:
        print(f"Error checking end of verse: {e}")
        return False