Spaces:

hetchyy
/

Tajweed-AI

Running on Zero

App Files Files Community

Tajweed-AI / recitation_engine /alignment.py

hetchyy

Add i8n

fb39b28 27 days ago

raw

history blame contribute delete

40.5 kB

	"""
	Alignment and Visualization Generation

	This module provides functions for generating HTML visualizations of phoneme- and word-level
	alignment for Quranic recitation evaluation. It aligns predicted and canonical phoneme
	sequences, color-codes alignment differences (correct, substitution, insertion, deletion),
	renders tables (scrolling or chunked), aggregates results for segmented audio, and produces
	interactive UI elements for feedback (including per-segment audio playback and error reports).

	Key capabilities:
	- Per-segment alignment visualization for segmented audio submissions.
	- Word-level and phoneme-level feedback, including error marking and metrics.
	- Flexible rendering styles: inline vs. combined tables, collapsible sections, and error
	grouping.
	- UI elements for reference/user audio, interactive playback, and lazy loading of segments.
	- Compatibility with error-analysis rendering (for advanced UI sort/grouping).
	- Designed for integration into full-stack Gradio/app interfaces or downstream UI frameworks.

	Most functions return HTML or HTML + metrics, designed for direct embedding in frontend UIs.
	"""
	import sys
	from pathlib import Path

	# Add parent directory to path for imports
	sys.path.insert(0, str(Path(__file__).parent.parent))

	from i8n import t
	from config import (
	COLORS,
	PHONEME_TABLE_STYLE,
	PHONEME_TABLE_CHUNK_SIZE,
	SEGMENT_PHONEME_TABLE_LOCATION,
	SEGMENT_ERROR_TABLE_LOCATION,
	SEGMENT_PHONEME_TABLE_COLLAPSED,
	)

	from .metrics import align_sequences, PER_single
	from .word_recovery import recover_words, recover_words_for_segment
	from recitation_analysis.ui.tooltip import get_tooltip_css
	from recitation_analysis.text_display.error_highlighting import render_combined_error_table
	# Note: segment audio clips are provided directly by the pipeline.


	def _generate_segment_divider(segment_num, total_segments, time_range, word_range, is_processing=False, status=None):
	    """Return the HTML divider rendered above a segment's feedback box.

	    Args:
	        segment_num: Current segment number (1-based).
	        total_segments: Total number of segments.
	        time_range: Time range string (e.g., "1.2s - 3.5s"). Accepted for API
	            compatibility only; it is never rendered.
	        word_range: Word range string (e.g., "1-4").
	        is_processing: When True, the blinking orange "processing" divider is
	            returned and ``status`` is not shown.
	        status: 'success' (✅), 'error' (❌), 'warning' (⚠️); any other value
	            (including None) renders no indicator.

	    Returns:
	        HTML string for the divider.
	    """
	    if is_processing:
	        # Pending segment: orange, blinking divider with the "processing" label.
	        processing_label = t("segments.processing_template", n=segment_num, total=total_segments, range=word_range)
	        return f'''
	        <style>
	        @keyframes blink-orange {{
	        0%, 100% {{ opacity: 1; }}
	        50% {{ opacity: 0.3; }}
	        }}
	        </style>
	        <div style="display: flex; align-items: center; margin: 20px 0 10px 0; direction: ltr;">
	        <div style="flex-grow: 1; height: 3px; background: linear-gradient(to right, transparent, #fb8c00); animation: blink-orange 1s ease-in-out infinite;"></div>
	        <div style="padding: 0 15px; font-size: 13px; color: #fb8c00; font-weight: bold; animation: blink-orange 1s ease-in-out infinite;">
	        {processing_label}
	        </div>
	        <div style="flex-grow: 1; height: 3px; background: linear-gradient(to left, transparent, #fb8c00); animation: blink-orange 1s ease-in-out infinite;"></div>
	        </div>
	        '''

	    # Completed segment: pick the trailing status marker (empty when unknown).
	    icon_by_status = (("success", " ✅"), ("error", " ❌"), ("warning", " ⚠️"))
	    status_icon = next((icon for name, icon in icon_by_status if status == name), "")

	    header_label = t("segments.header_template", n=segment_num, total=total_segments, range=word_range)
	    return f'''
	    <div style="display: flex; align-items: center; margin: 20px 0 10px 0; direction: ltr;">
	    <div style="flex-grow: 1; height: 2px; background: linear-gradient(to right, transparent, var(--primary-500, #3b82f6));"></div>
	    <div style="padding: 0 15px; font-size: 13px; color: var(--primary-500, #3b82f6); font-weight: bold;">
	    {header_label}{status_icon}
	    </div>
	    <div style="flex-grow: 1; height: 2px; background: linear-gradient(to left, transparent, var(--primary-500, #3b82f6));"></div>
	    </div>
	    '''


	def _generate_scroll_table(expected_row, actual_row, diff_row):
	    """Render the three alignment rows as one horizontally scrolling table.

	    Args:
	        expected_row: Iterable of cell texts for the "expected" row.
	        actual_row: Iterable of cell texts for the "your recitation" row.
	        diff_row: Iterable of ``(symbol, color_css)`` pairs; ``color_css`` is
	            inserted verbatim into the cell's ``style`` attribute.

	    Returns:
	        HTML string containing the table and its scroll container.
	    """
	    # Style shared by every data cell (the diff row injects its colour CSS
	    # between this prefix and the text colour).
	    cell_prefix = 'padding: 8px; text-align: center; border-right: 1px solid var(--border-color-accent, #f3f4f6); min-width: 40px;'
	    text_color = 'color: var(--body-text-color, inherit);'

	    pieces = [f'''
	    <div style="font-family: monospace; font-size: 14px; margin: 10px 0;">
	    <div style="border: 1px solid var(--border-color-primary, #e5e7eb); border-radius: 5px; background: var(--background-fill-primary, white); overflow-x: auto;">
	    <table style="border-collapse: collapse; white-space: nowrap;">
	    <tr style="background-color: var(--table-even-background-fill, #f9fafb); border-bottom: 1px solid var(--border-color-primary, #e5e7eb);">
	    <td style="padding: 8px 12px; font-weight: bold; border-right: 1px solid var(--border-color-primary, #e5e7eb); background-color: var(--table-even-background-fill, #f9fafb); position: sticky; left: 0; z-index: 1; color: var(--body-text-color, inherit);">{t("phoneme_alignment.expected")}</td>
	    ''']
	    pieces.extend(f'<td style="{cell_prefix} {text_color}">{cell}</td>' for cell in expected_row)

	    pieces.append(f'''
	    </tr>
	    <tr style="border-bottom: 1px solid var(--border-color-primary, #e5e7eb);">
	    <td style="padding: 8px 12px; font-weight: bold; border-right: 1px solid var(--border-color-primary, #e5e7eb); background-color: var(--background-fill-primary, white); position: sticky; left: 0; z-index: 1; color: var(--body-text-color, inherit);">{t("phoneme_alignment.your_rec")}</td>
	    ''')
	    pieces.extend(f'<td style="{cell_prefix} {text_color}">{cell}</td>' for cell in actual_row)

	    pieces.append(f'''
	    </tr>
	    <tr>
	    <td style="padding: 8px 12px; font-weight: bold; border-right: 1px solid var(--border-color-primary, #e5e7eb); background-color: var(--background-fill-primary, white); position: sticky; left: 0; z-index: 1; color: var(--body-text-color, inherit);">{t("phoneme_alignment.diff")}</td>
	    ''')
	    pieces.extend(
	        f'<td style="{cell_prefix} {highlight} {text_color}">{mark}</td>'
	        for mark, highlight in diff_row
	    )

	    pieces.append('''
	    </tr>
	    </table>
	    </div>
	    </div>
	    ''')
	    return ''.join(pieces)


	def _generate_chunked_table(expected_row, actual_row, diff_row):
	    """Render the alignment rows as a stack of tables of bounded width.

	    The rows are cut into consecutive chunks of ``PHONEME_TABLE_CHUNK_SIZE``
	    columns; each chunk becomes its own three-row table. When more than one
	    chunk is produced, each table is preceded by a "Part i of n" caption.

	    Args:
	        expected_row: Sequence of cell texts for the "expected" row. Its length
	            determines the number of chunks.
	        actual_row: Sequence of cell texts for the "your recitation" row.
	        diff_row: Sequence of ``(symbol, color_css)`` pairs; ``color_css`` is
	            inserted verbatim into the cell's ``style`` attribute.

	    Returns:
	        HTML string containing all chunk tables inside one wrapper ``<div>``.
	        With empty rows only the (empty) wrapper is returned.
	    """
	    total_columns = len(expected_row)

	    chunk_size = PHONEME_TABLE_CHUNK_SIZE
	    if chunk_size < 1:
	        # A zero/negative configured size would divide by zero or silently
	        # render nothing; fall back to a single table holding every column.
	        chunk_size = max(total_columns, 1)

	    chunk_starts = range(0, total_columns, chunk_size)
	    num_chunks = len(chunk_starts)

	    # Collect fragments and join once instead of growing a string in a loop.
	    parts = ['<div style="font-family: monospace; font-size: 14px; margin: 10px 0;">']

	    for chunk_number, start_idx in enumerate(chunk_starts, start=1):
	        end_idx = start_idx + chunk_size  # slicing clamps at the row end

	        expected_chunk = expected_row[start_idx:end_idx]
	        actual_chunk = actual_row[start_idx:end_idx]
	        diff_chunk = diff_row[start_idx:end_idx]

	        # Caption is only useful when the table is actually split.
	        if num_chunks > 1:
	            parts.append(f'<div style="font-weight: bold; margin-top: 15px; margin-bottom: 5px;">Part {chunk_number} of {num_chunks}</div>')

	        parts.append('''
	        <div style="border: 1px solid var(--border-color-primary, #e5e7eb); border-radius: 5px; background: var(--background-fill-primary, white); margin-bottom: 10px; width: 100%; display: block;">
	        <table style="width: 100%; border-collapse: collapse; table-layout: auto; min-width: 100%;">
	        <colgroup>
	        <col style="width: 80px;">
	        ''')

	        # One auto-width column per phoneme in this chunk.
	        parts.append('<col style="min-width: 40px; width: auto;">' * len(expected_chunk))

	        parts.append(f'''
	        </colgroup>
	        <tr style="background-color: var(--table-even-background-fill, #f9fafb); border-bottom: 1px solid var(--border-color-primary, #e5e7eb);">
	        <td style="padding: 8px; font-weight: bold; border-right: 1px solid var(--border-color-primary, #e5e7eb); background-color: var(--table-even-background-fill, #f9fafb); white-space: nowrap; color: var(--body-text-color, inherit);">{t("phoneme_alignment.expected")}</td>
	        ''')
	        parts.extend(
	            f'<td style="padding: 8px; text-align: center; border-right: 1px solid #f1f3f4; white-space: nowrap; min-width: 40px;">{token}</td>'
	            for token in expected_chunk
	        )

	        parts.append(f'''</tr>
	        <tr style="border-bottom: 1px solid var(--border-color-primary, #e5e7eb);">
	        <td style="padding: 8px; font-weight: bold; border-right: 1px solid var(--border-color-primary, #e5e7eb); background-color: var(--background-fill-primary, white); white-space: nowrap; color: var(--body-text-color, inherit);">{t("phoneme_alignment.your_rec")}</td>''')
	        parts.extend(
	            f'<td style="padding: 8px; text-align: center; border-right: 1px solid #f1f3f4; white-space: nowrap; min-width: 40px;">{token}</td>'
	            for token in actual_chunk
	        )

	        parts.append(f'''</tr>
	        <tr>
	        <td style="padding: 8px; font-weight: bold; border-right: 1px solid var(--border-color-primary, #e5e7eb); background-color: var(--background-fill-primary, white); white-space: nowrap; color: var(--body-text-color, inherit);">{t("phoneme_alignment.diff")}</td>''')
	        parts.extend(
	            f'<td style="padding: 8px; text-align: center; border-right: 1px solid #f1f3f4; white-space: nowrap; min-width: 40px; {color}">{symbol}</td>'
	            for symbol, color in diff_chunk
	        )

	        parts.append('''</tr>
	        </table>
	        </div>
	        ''')

	    parts.append('</div>')
	    return ''.join(parts)


	def create_alignment_visualization(expected_phonemes, actual_phonemes, verse_ref=None, audio_data=None):
	    """
	    Create an HTML visualization of phoneme alignment, color-coding substitutions,
	    insertions, and deletions between expected and actual phoneme sequences.

	    - If a verse reference is provided, ``recover_words`` is called and its
	      word-level feedback HTML (or a notice carrying its error message) is placed
	      above the phoneme table.
	    - If that call returned a result with a non-empty ``phoneme_alignment``, the
	      alignment is reused; otherwise it is computed with ``align_sequences``.
	    - Table style (scroll vs. chunked) and whether the section starts collapsed
	      follow ``PHONEME_TABLE_STYLE`` and ``SEGMENT_PHONEME_TABLE_COLLAPSED``.

	    Args:
	        expected_phonemes: Space-separated expected phoneme string
	        actual_phonemes: Space-separated actual phoneme string (from user's recitation)
	        verse_ref: (Optional) Verse reference for word-level feedback
	        audio_data: (Optional) Passed through to ``recover_words``; documented
	            upstream as a (sample_rate, audio_array) tuple

	    Returns:
	        Tuple of (html_string, accuracy, expected_count, actual_count)
	        html_string: HTML visualization for embedding
	        accuracy: ``100 - PER_single(actual, expected)``
	        expected_count: Number of expected phoneme tokens
	        actual_count: Number of actual phoneme tokens

	        On any exception the tuple is ``(error_html, 0, 0, 0)``.
	    """
	    # Function-scope import, matching the file's existing local-import style.
	    from html import escape

	    try:
	        # Generate word-level feedback if verse_ref is provided
	        word_html = ""
	        recitation_result = None
	        if verse_ref:
	            word_feedback, word_error, recitation_result = recover_words(verse_ref, expected_phonemes, actual_phonemes, audio_data)
	            if word_feedback:
	                word_html = word_feedback
	            elif word_error:
	                # Show error but continue with phoneme alignment. The message is
	                # escaped so characters such as "<" cannot break the markup.
	                word_html = f'<div style="color: var(--body-text-color-subdued, #6b7280); padding: 8px; font-size: 13px;">ℹ️ Word-level feedback unavailable: {escape(str(word_error))}</div>'

	        # Tokenize phonemes for alignment and metrics
	        expected_tokens = expected_phonemes.split()
	        actual_tokens = actual_phonemes.split()

	        # Reuse alignment from error pipeline if available (avoids O(n*m) recomputation)
	        if recitation_result and recitation_result.phoneme_alignment:
	            alignment = list(recitation_result.phoneme_alignment)
	        else:
	            # Fallback: compute alignment (only when no verse_ref or error occurred)
	            alignment = align_sequences(expected_tokens, actual_tokens)

	        # Prepare data for 3-row table
	        expected_row = []
	        actual_row = []
	        diff_row = []

	        # Entries whose op is not one of C/S/D/I are skipped.
	        for ref_tok, hyp_tok, op in alignment:
	            if op == "C":  # Correct
	                expected_row.append(ref_tok)
	                actual_row.append(hyp_tok)
	                diff_row.append(("", ""))  # No symbol, no color
	            elif op == "S":  # Substitution
	                expected_row.append(ref_tok)
	                actual_row.append(hyp_tok)
	                diff_row.append(("✗", COLORS["substitution"]))
	            elif op == "D":  # Deletion
	                expected_row.append(ref_tok)
	                actual_row.append("—")
	                diff_row.append(("−", COLORS["deletion"]))
	            elif op == "I":  # Insertion
	                expected_row.append("—")
	                actual_row.append(hyp_tok)
	                diff_row.append(("+", COLORS["insertion"]))

	        # Calculate PER
	        # NOTE(review): accuracy is not clamped; presumably PER can exceed 100
	        # with many insertions, which would make this negative - confirm.
	        per_score = PER_single(actual_phonemes, expected_phonemes)
	        accuracy = 100 - per_score

	        # Generate table based on style setting
	        if PHONEME_TABLE_STYLE == "scroll":
	            table_html = _generate_scroll_table(expected_row, actual_row, diff_row)
	        else:
	            table_html = _generate_chunked_table(expected_row, actual_row, diff_row)

	        # Wrap in collapsible section (respects config setting)
	        # Use ltr-preserve class on table only - header should follow RTL direction for Arabic text
	        details_open = "" if SEGMENT_PHONEME_TABLE_COLLAPSED else "open"
	        phoneme_html = f'''
	        <div class="phoneme-alignment">
	        <details {details_open} style="margin: 10px 0;">
	        <summary style="cursor: pointer; font-weight: bold; font-size: 14px; padding: 8px; background-color: var(--background-fill-secondary, #f3f4f6); border-radius: 4px; user-select: none; color: var(--body-text-color, inherit);">
	        {t("phoneme_alignment.header")}
	        </summary>
	        <div class="ltr-preserve">
	        {table_html}
	        </div>
	        </details>
	        </div>
	        '''

	        # Prepend word-level feedback before phoneme alignment
	        full_html = word_html + phoneme_html

	        return full_html, accuracy, len(expected_tokens), len(actual_tokens)

	    except Exception as e:
	        # Deliberate best-effort boundary: the UI gets an error box instead of a
	        # traceback. Exception text is escaped before being embedded in HTML.
	        error_html = f'<div style="color: red; padding: 10px;">Error creating alignment: {escape(str(e))}</div>'
	        return error_html, 0, 0, 0


	def format_metrics_html(accuracy, expected_count, actual_count):
	    """
	    Build the three-tile metrics strip (Expected / Predicted / Accuracy) as HTML.

	    Args:
	        accuracy: Accuracy percentage; rendered with one decimal place
	        expected_count: Number of expected phonemes
	        actual_count: Number of actual phonemes

	    Returns:
	        HTML string with formatted metrics. The accuracy value is green at
	        90 and above, amber from 70 up to (but not including) 90, red below 70.
	    """
	    # Traffic-light colour for the accuracy figure.
	    if accuracy >= 90:
	        accuracy_color = '#10b981'
	    elif accuracy >= 70:
	        accuracy_color = '#f59e0b'
	    else:
	        accuracy_color = '#ef4444'

	    return f'''
	    <div style="display: flex; justify-content: space-around; padding: 12px; background-color: var(--background-fill-secondary, #f3f4f6); border-radius: 5px; margin: 10px 0; border: 1px solid var(--border-color-primary, #e5e7eb);">
	    <div style="text-align: center;">
	    <div style="font-size: 11px; color: var(--body-text-color-subdued, #6b7280);">Expected</div>
	    <div style="font-size: 20px; font-weight: bold; color: var(--primary-500, #3b82f6);">{expected_count}</div>
	    </div>
	    <div style="text-align: center;">
	    <div style="font-size: 11px; color: var(--body-text-color-subdued, #6b7280);">Predicted</div>
	    <div style="font-size: 20px; font-weight: bold; color: var(--primary-500, #3b82f6);">{actual_count}</div>
	    </div>
	    <div style="text-align: center;">
	    <div style="font-size: 11px; color: var(--body-text-color-subdued, #6b7280);">Accuracy</div>
	    <div style="font-size: 20px; font-weight: bold; color: {accuracy_color};">{accuracy:.1f}%</div>
	    </div>
	    </div>
	    '''


	def create_segmented_alignment_visualization(
	    segments,
	    predicted_phonemes_per_segment,
	    verse_ref,
	    canonical_text,
	    coverage_warning=None,
	    reference_audio_clips=None,
	    user_audio_clips=None,
	):
	    """
	    Create the HTML visualization for a segmented recitation: one divider plus
	    feedback box per segment, followed by optional combined tables.

	    For each (segment, predicted phonemes) pair:
	    - ``segment.error`` set        -> error divider and error box.
	    - prediction is ``None``       -> blinking "processing" divider and spinner.
	    - prediction is empty          -> warning divider and "transcription failed" box.
	    - otherwise                    -> audio players (embedded clip, or a lazy-load
	      placeholder carrying the segment info in a ``data-segment`` attribute),
	      word-level feedback from ``recover_words_for_segment``, an optional inline
	      error table, and the phoneme alignment table (inline, or collected for a
	      combined section at the end, per ``SEGMENT_PHONEME_TABLE_LOCATION``).

	    After the loop, a combined error table is appended when
	    ``SEGMENT_ERROR_TABLE_LOCATION == "end"``, every prediction is non-None and
	    there is more than one segment. When every segment produced full metadata,
	    that metadata is handed to ``shared_state.set_last_error_segment_data``.

	    Args:
	        segments: List of segment objects; the attributes read here are
	            ``start_time``, ``end_time``, ``word_start_idx``, ``word_end_idx``,
	            ``error``, ``canonical_phonemes`` and (optionally) ``matched_ref`` /
	            ``matched_text``.
	        predicted_phonemes_per_segment: List of predicted phoneme strings for each
	            segment (``None`` marks a pending segment).
	        verse_ref: Full verse reference string
	        canonical_text: Full canonical Arabic text for the verse (not used by this
	            function; kept in the signature for callers)
	        coverage_warning: (Optional) Warning shown at the top. It is inserted
	            verbatim, so it may contain markup.
	        reference_audio_clips: (Optional) List of audio sources for reference audio
	            per segment, used as the ``src`` of an ``<audio>`` source element
	        user_audio_clips: (Optional) List of audio sources for the user's recorded
	            audio per segment

	    Returns:
	        Tuple of:
	        html_string: HTML for embedding in app UI
	        overall_accuracy: Correct phonemes / expected phonemes * 100 over all
	            aligned segments (0 when nothing was aligned)
	        total_expected: Total expected phoneme count over aligned segments
	        total_actual: Total predicted phoneme count over aligned segments
	        all_results: One entry per processed segment (result object or None)
	    """
	    # Function-scope stdlib imports (previously ``import json`` ran on every
	    # loop iteration).
	    import json
	    from html import escape

	    if not segments:
	        return '<div style="color: var(--body-text-color-subdued, #6b7280); padding: 10px;">No segments detected</div>', 0, 0, 0, []

	    total_segments = len(segments)

	    segment_html_parts = []  # Dividers and feedback boxes, in display order
	    all_results = []  # Collect result objects for unified table
	    phoneme_tables = []  # Collect phoneme tables for all segments ("end" mode)
	    segment_metadata = []  # Per-segment data for re-rendering with sort modes
	    total_expected = 0
	    total_actual = 0
	    total_correct = 0

	    # Tooltip CSS is always emitted first (segment mode builds multiple blocks).
	    segment_html_parts.append(get_tooltip_css())

	    # Add coverage warning if present
	    if coverage_warning:
	        segment_html_parts.append(f'''
	        <div style="padding: 10px 15px; background-color: var(--warning-background-fill, #fef3c7); border: 1px solid var(--warning-border-color, #f59e0b);
	        border-radius: 5px; margin-bottom: 15px; color: var(--warning-text-color, #92400e);">
	        {coverage_warning}
	        </div>
	        ''')

	    # Process each segment.
	    # NOTE: zip() stops at the shorter input, so segments without a matching
	    # prediction entry are not rendered.
	    for seg_idx, (segment, predicted_phonemes) in enumerate(zip(segments, predicted_phonemes_per_segment)):
	        seg_num = seg_idx + 1

	        # Time range string
	        time_range = f"{segment.start_time:.1f}s - {segment.end_time:.1f}s"

	        # Word range string (1-based for display)
	        word_range = f"{segment.word_start_idx + 1}-{segment.word_end_idx + 1}"

	        # Optional attributes: read defensively in one place so every branch
	        # below treats a missing attribute the same way.
	        matched_ref = getattr(segment, 'matched_ref', None)
	        matched_text = getattr(segment, 'matched_text', None)

	        # Handle segment errors
	        if segment.error:
	            # Add divider with error status
	            divider = _generate_segment_divider(seg_num, total_segments, time_range, word_range, is_processing=False, status="error")
	            segment_html_parts.append(divider)
	            # Error text is escaped so it cannot break or inject markup.
	            segment_html_parts.append(f'''
	            <div style="padding: 10px; background-color: var(--error-background-fill, #fee2e2); border: 1px solid var(--error-border-color, #ef4444);
	            border-radius: 5px; color: var(--error-text-color, #991b1b); margin: 10px 0;">
	            ⚠️ Segment {seg_num}: {escape(str(segment.error))}
	            </div>
	            ''')
	            all_results.append(None)  # Maintain 1:1 correspondence with segments
	            continue

	        # Check for pending status (None)
	        if predicted_phonemes is None:
	            # Add blinking divider for processing
	            divider = _generate_segment_divider(seg_num, total_segments, time_range, word_range, is_processing=True)
	            segment_html_parts.append(divider)
	            segment_html_parts.append(f'''
	            <div style="padding: 20px; background-color: rgba(251, 140, 0, 0.1); border: 2px dashed #fb8c00;
	            border-radius: 8px; color: #fb8c00; margin: 10px 0; text-align: center;">
	            <div style="display: inline-block; width: 20px; height: 20px; border: 3px solid rgba(251, 140, 0, 0.3); border-top: 3px solid #fb8c00; border-radius: 50%; animation: spin 1s linear infinite; margin-right: 10px; vertical-align: middle;"></div>
	            <style>
	            @keyframes spin {{
	            0% {{ transform: rotate(0deg); }}
	            100% {{ transform: rotate(360deg); }}
	            }}
	            </style>
	            <span style="font-weight: 600;">Processing segment {seg_num}...</span>
	            </div>
	            ''')
	            all_results.append(None)  # Maintain 1:1 correspondence with segments
	            continue

	        # Empty (but not None) prediction: transcription produced nothing.
	        if not predicted_phonemes:
	            divider = _generate_segment_divider(seg_num, total_segments, time_range, word_range, is_processing=False, status="warning")
	            segment_html_parts.append(divider)
	            # Show matched verse info if available, even without phoneme alignment
	            verse_info = f" ({matched_ref})" if matched_ref else ""
	            text_info = f"<br><span style='font-size: 0.9em;'>{matched_text}</span>" if matched_text else ""
	            segment_html_parts.append(f'''
	            <div style="padding: 12px; background-color: rgba(251, 140, 0, 0.1); border: 1px solid #fb8c00;
	            border-radius: 6px; color: var(--body-text-color, #374151); margin: 10px 0;">
	            <div style="font-weight: 600; color: #fb8c00; margin-bottom: 4px;">
	            ⚠️ Segment {seg_num}{verse_info}: Phoneme transcription failed
	            </div>
	            <div style="font-size: 0.85em; color: var(--body-text-color-subdued, #6b7280);">
	            Audio segment processed but phoneme extraction returned empty. Check logs for details.
	            </div>{text_info}
	            </div>
	            ''')
	            all_results.append(None)  # Maintain 1:1 correspondence with segments
	            continue

	        # Collect segment content first to determine status
	        segment_content_parts = []
	        segment_has_errors = False

	        # Audio section: reference reciter (left) and user audio (right) in 2-column layout
	        ref_clip = reference_audio_clips[seg_idx] if reference_audio_clips and seg_idx < len(reference_audio_clips) else None
	        user_clip = user_audio_clips[seg_idx] if user_audio_clips and seg_idx < len(user_audio_clips) else None

	        # Always show audio section if we have a clip OR segment info for lazy loading
	        has_segment_info = segment.word_start_idx is not None and segment.word_end_idx is not None
	        if ref_clip or user_clip or has_segment_info:
	            segment_content_parts.append('<div style="display: flex; gap: 10px; margin: 10px 0;">')

	            # Reference audio (left column) - either embedded or lazy load
	            if ref_clip:
	                # Pre-loaded audio clip
	                segment_content_parts.append(f'''
	                <div style="flex: 1; padding: 10px 12px; border: 1px solid var(--border-color-accent, #e5e7eb); border-radius: 10px; background: var(--background-fill-secondary, #f3f4f6);">
	                <div style="font-weight: 700; font-size: 13px; margin-bottom: 6px; color: var(--body-text-color, inherit); text-align: center;">{t("segments.reference_audio")}</div>
	                <audio controls style="width: 100%;">
	                <source src="{ref_clip}" type="audio/wav">
	                Your browser does not support the audio element.
	                </audio>
	                </div>
	                ''')
	            elif has_segment_info:
	                # Segment info for lazy loading, serialised as JSON and then
	                # HTML-escaped: the value sits inside a double-quoted attribute,
	                # so raw '"' characters would terminate it early.
	                segment_info_json = escape(
	                    json.dumps({
	                        "verse_ref": verse_ref,
	                        "word_start": segment.word_start_idx,
	                        "word_end": segment.word_end_idx,
	                    }),
	                    quote=True,
	                )
	                # Lazy load placeholder - click to load
	                segment_content_parts.append(f'''
	                <div id="ref-clip-{seg_idx}" class="ref-clip-container" style="flex: 1; padding: 10px 12px; border: 1px solid var(--border-color-accent, #e5e7eb); border-radius: 10px; background: var(--background-fill-secondary, #f3f4f6);"
	                data-segment="{segment_info_json}">
	                <div style="font-weight: 700; font-size: 13px; margin-bottom: 6px; color: var(--body-text-color, inherit); text-align: center;">{t("segments.reference_audio")}</div>
	                <div class="ref-clip-placeholder" style="display: flex; align-items: center; gap: 8px;">
	                <button onclick="loadSegmentClip({seg_idx})" style="padding: 6px 12px; background: var(--button-primary-background-fill, #2563eb); color: white; border: none; border-radius: 6px; cursor: pointer; font-size: 13px;">
	                ▶ Load Audio
	                </button>
	                <span style="font-size: 12px; color: var(--body-text-color-subdued, #666);">Click to load reference</span>
	                </div>
	                <audio controls style="width: 100%; display: none;">
	                Your browser does not support the audio element.
	                </audio>
	                </div>
	                ''')

	            # User's recorded audio (right column)
	            if user_clip:
	                segment_content_parts.append(f'''
	                <div style="flex: 1; padding: 10px 12px; border: 1px solid var(--border-color-accent, #e5e7eb); border-radius: 10px; background: var(--background-fill-secondary, #f3f4f6);">
	                <div style="font-weight: 700; font-size: 13px; margin-bottom: 6px; color: var(--body-text-color, inherit); text-align: center;">{t("segments.user_audio")}</div>
	                <audio controls style="width: 100%;">
	                <source src="{user_clip}" type="audio/wav">
	                Your browser does not support the audio element.
	                </audio>
	                </div>
	                ''')

	            segment_content_parts.append('</div>')

	        # Word-level feedback for this segment (segment treated as stopping at end).
	        segment_ref = matched_ref or None
	        word_html, segment_errors, resolved_canonical_phonemes, segment_result = recover_words_for_segment(
	            verse_ref,
	            predicted_phonemes,
	            word_start_idx=segment.word_start_idx,
	            word_end_idx=segment.word_end_idx,
	            segment_ref=segment_ref,
	        )

	        if word_html:
	            segment_content_parts.append(word_html)

	        if segment_errors:
	            segment_has_errors = True

	        # Collect result for unified table rendering
	        # Always append to maintain 1:1 correspondence with segments (can be None)
	        all_results.append(segment_result)

	        # Render inline error table if configured (no title for inline)
	        if segment_result and SEGMENT_ERROR_TABLE_LOCATION == "inline":
	            inline_error_table = render_combined_error_table(
	                [segment_result], title=""
	            )
	            if inline_error_table:
	                segment_content_parts.append(inline_error_table)

	        # Prefer the separately-phonemized canonical phonemes (segment treated as stopping);
	        # fall back to the segment processor output if needed.
	        segment_canonical_phonemes = resolved_canonical_phonemes or segment.canonical_phonemes

	        if not segment_canonical_phonemes:
	            segment_content_parts.append(f'''
	            <div style="padding: 10px; color: var(--body-text-color-subdued, #6b7280); margin: 10px 0;">
	            Segment {seg_num}: No canonical phonemes available
	            </div>
	            ''')
	            # Still add divider and wrapped content box. No metadata entry is
	            # recorded for this segment, so the cache step below is skipped.
	            status = "error" if segment_has_errors else "success"
	            divider = _generate_segment_divider(seg_num, total_segments, time_range, word_range, is_processing=False, status=status)
	            segment_html_parts.append(divider)
	            segment_body = ''.join(segment_content_parts)
	            segment_box = f'''
	            <div style="border: 1px solid var(--border-color-primary, #e5e7eb); border-radius: 8px; padding: 16px; margin-bottom: 12px; background-color: var(--background-fill-secondary, #f9fafb);">
	            {segment_body}
	            </div>
	            '''
	            segment_html_parts.append(segment_box)
	            continue

	        # Create phoneme alignment for this segment
	        try:
	            expected_tokens = segment_canonical_phonemes.split()
	            actual_tokens = predicted_phonemes.split()

	            # Reuse alignment from segment_result if available (avoids O(n*m) recomputation)
	            if segment_result and segment_result.phoneme_alignment:
	                alignment = list(segment_result.phoneme_alignment)
	            else:
	                # Fallback: compute alignment
	                alignment = align_sequences(expected_tokens, actual_tokens)

	            expected_row = []
	            actual_row = []
	            diff_row = []
	            correct_count = 0

	            # Entries whose op is not one of C/S/D/I are skipped.
	            for ref_tok, hyp_tok, op in alignment:
	                if op == "C":
	                    expected_row.append(ref_tok)
	                    actual_row.append(hyp_tok)
	                    diff_row.append(("", ""))
	                    correct_count += 1
	                elif op == "S":
	                    expected_row.append(ref_tok)
	                    actual_row.append(hyp_tok)
	                    diff_row.append(("✗", COLORS["substitution"]))
	                elif op == "D":
	                    expected_row.append(ref_tok)
	                    actual_row.append("—")
	                    diff_row.append(("−", COLORS["deletion"]))
	                elif op == "I":
	                    expected_row.append("—")
	                    actual_row.append(hyp_tok)
	                    diff_row.append(("+", COLORS["insertion"]))

	            total_expected += len(expected_tokens)
	            total_actual += len(actual_tokens)
	            total_correct += correct_count

	            segment_accuracy = (correct_count / len(expected_tokens) * 100) if expected_tokens else 0

	            # Render inline or store for combined display at end
	            if SEGMENT_PHONEME_TABLE_LOCATION == "inline":
	                # Generate table
	                if PHONEME_TABLE_STYLE == "scroll":
	                    table_html = _generate_scroll_table(expected_row, actual_row, diff_row)
	                else:
	                    table_html = _generate_chunked_table(expected_row, actual_row, diff_row)

	                # If collapsed mode, wrap in collapsible details; otherwise show directly
	                if SEGMENT_PHONEME_TABLE_COLLAPSED:
	                    inline_phoneme_html = f'''
	                    <details style="margin: 10px 0;">
	                    <summary style="cursor: pointer; font-weight: bold; font-size: 13px; padding: 6px;
	                    background-color: var(--background-fill-secondary, #f3f4f6); border-radius: 4px; user-select: none; color: var(--body-text-color, inherit);">
	                    {t("phoneme_alignment.header")}
	                    </summary>
	                    <div style="margin-top: 8px;">
	                    {table_html}
	                    </div>
	                    </details>
	                    '''
	                else:
	                    # Show directly without collapsible wrapper
	                    inline_phoneme_html = f'''
	                    <div style="margin: 10px 0;">
	                    {table_html}
	                    </div>
	                    '''
	                # Add to segment content (inside the box)
	                segment_content_parts.append(inline_phoneme_html)
	            else:
	                # Store phoneme table data for combined display at end
	                phoneme_tables.append({
	                    'seg_num': seg_num,
	                    'time_range': time_range,
	                    'word_range': word_range,
	                    'expected_row': expected_row,
	                    'actual_row': actual_row,
	                    'diff_row': diff_row,
	                    'accuracy': segment_accuracy,
	                    'has_errors': segment_has_errors,
	                })

	        except Exception as e:
	            # Best-effort: one segment's alignment failure must not abort the
	            # whole page. Exception text is escaped before embedding in HTML.
	            segment_content_parts.append(f'''
	            <div style="color: var(--error-text-color, #ef4444); padding: 10px;">
	            Error creating alignment for segment {seg_num}: {escape(str(e))}
	            </div>
	            ''')

	        # Now add divider with status based on whether segment has errors
	        status = "error" if segment_has_errors else "success"
	        divider = _generate_segment_divider(seg_num, total_segments, time_range, word_range, is_processing=False, status=status)
	        segment_html_parts.append(divider)

	        # Wrap all segment content in a single box
	        segment_body = ''.join(segment_content_parts)
	        segment_box = f'''
	        <div style="border: 1px solid var(--border-color-primary, #e5e7eb); border-radius: 8px; padding: 16px; margin-bottom: 12px; background-color: var(--background-fill-secondary, #f9fafb);">
	        {segment_body}
	        </div>
	        '''
	        segment_html_parts.append(segment_box)

	        # Count errors by handler type for summary table
	        error_counts = {}
	        if segment_result and segment_result.errors:
	            for error in segment_result.errors:
	                handler = getattr(error, 'source_handler', '') or ''
	                # Handle multiple handlers (e.g., "Handler1+Handler2")
	                for part in handler.split('+'):
	                    # Keep only the handler name (text before the first ":")
	                    handler_name = part.split(':')[0].strip()
	                    if handler_name:
	                        error_counts[handler_name] = error_counts.get(handler_name, 0) + 1

	        # Track segment data for re-rendering with sort modes
	        segment_metadata.append({
	            'segment_idx': seg_idx,
	            'divider_html': divider,
	            'segment_html': segment_box,
	            'has_errors': segment_has_errors,
	            'segment_num': seg_num,
	            'error_counts': error_counts,
	        })

	    # Build final HTML: segments feedback -> combined errors (final) -> phoneme tables
	    all_html_parts = segment_html_parts.copy()

	    all_predicted = all(p is not None for p in predicted_phonemes_per_segment)

	    # Add combined error table once all segments are processed (skip if only 1 segment)
	    # Only render at end if configured for "end" mode (inline mode renders per-segment)
	    if SEGMENT_ERROR_TABLE_LOCATION == "end" and all_predicted and total_segments > 1:
	        combined_table = render_combined_error_table(all_results, title="All Errors")
	        if combined_table:
	            all_html_parts.append(combined_table)

	    # Add combined phoneme alignment tables at the end (only populated in "end" mode)
	    if phoneme_tables:
	        details_open = "" if SEGMENT_PHONEME_TABLE_COLLAPSED else "open"
	        phoneme_parts = [f'''
	        <div class="phoneme-alignment">
	        <details {details_open} style="margin: 15px 0;">
	        <summary style="cursor: pointer; font-weight: bold; font-size: 14px; padding: 8px;
	        background-color: var(--background-fill-secondary, #f3f4f6); border-radius: 4px; user-select: none; color: var(--body-text-color, inherit);">
	        {t("phoneme_alignment.header")}
	        </summary>
	        <div class="ltr-preserve" style="margin-top: 10px;">
	        ''']

	        for table_data in phoneme_tables:
	            # Add mini divider for each segment's phoneme table with status indicator
	            status_icon = " ❌" if table_data.get('has_errors') else " ✅"
	            phoneme_parts.append(f'''
	            <div style="display: flex; align-items: center; margin: 15px 0 5px 0; direction: ltr;">
	            <div style="flex-grow: 1; height: 1px; background: var(--border-color-primary, #e5e7eb);"></div>
	            <div style="padding: 0 10px; font-size: 12px; color: var(--body-text-color-subdued, #6b7280);">
	            Segment {table_data['seg_num']}{status_icon}
	            </div>
	            <div style="flex-grow: 1; height: 1px; background: var(--border-color-primary, #e5e7eb);"></div>
	            </div>
	            ''')

	            # Generate table
	            if PHONEME_TABLE_STYLE == "scroll":
	                table_html = _generate_scroll_table(
	                    table_data['expected_row'],
	                    table_data['actual_row'],
	                    table_data['diff_row']
	                )
	            else:
	                table_html = _generate_chunked_table(
	                    table_data['expected_row'],
	                    table_data['actual_row'],
	                    table_data['diff_row']
	                )

	            phoneme_parts.append(table_html)

	        phoneme_parts.append('''
	        </div>
	        </details>
	        </div>
	        ''')
	        all_html_parts.append(''.join(phoneme_parts))

	    # Calculate overall accuracy
	    overall_accuracy = (total_correct / total_expected * 100) if total_expected > 0 else 0

	    # Cache segment metadata for re-rendering (only when all segments are processed)
	    # Check that we have metadata for all segments (no pending/error segments skipped)
	    if len(segment_metadata) == total_segments and all_predicted:
	        from shared_state import set_last_error_segment_data
	        set_last_error_segment_data(segment_metadata)

	    full_html = ''.join(all_html_parts)
	    return full_html, overall_accuracy, total_expected, total_actual, all_results