Spaces:

hetchyy
/

quranic-universal-aligner

Running on Zero

App Files Files Community

quranic-universal-aligner / src /alignment /alignment_pipeline.py

hetchyy

Upload folder using huggingface_hub

602b5d3 verified 12 days ago

raw

history blame contribute delete

32.1 kB

	"""Orchestration for phoneme-based alignment and retries."""

	from typing import List, Tuple

	from config import (
	ANCHOR_SEGMENTS,
	MAX_CONSECUTIVE_FAILURES,
	RETRY_LOOKBACK_WORDS,
	RETRY_LOOKAHEAD_WORDS,
	MAX_EDIT_DISTANCE_RELAXED,
	PHONEME_ALIGNMENT_PROFILING,
	)
	from src.core.debug_collector import get_debug_collector


	def _debug_alignment_result(alignment, chapter_ref):
	"""Extract JSON-safe dict from an AlignmentResult for the debug collector."""
	if alignment is None:
	return None
	return {
	"matched_ref": alignment.matched_ref,
	"start_word_idx": alignment.start_word_idx,
	"end_word_idx": alignment.end_word_idx,
	"edit_cost": round(alignment.edit_cost, 4),
	"confidence": round(alignment.confidence, 4),
	"j_start": alignment.j_start,
	"best_j": alignment.best_j,
	"basmala_consumed": alignment.basmala_consumed,
	"n_wraps": alignment.n_wraps,
	"wrap_points": alignment.wrap_points,
	"wrap_word_ranges": alignment.wrap_word_ranges,
	}


	def run_phoneme_matching(
	phoneme_texts: List[List[str]],
	detected_surah: int,
	first_quran_idx: int = 0,
	special_results: List[tuple] = None,
	start_pointer: int = 0,
	) -> Tuple[List[tuple], dict, set, dict, set]:
	"""
	Phoneme-based segment matching using substring DP.

	Args:
	phoneme_texts: List of phoneme lists (each is a list of phoneme strings)
	detected_surah: Surah number from anchor search
	first_quran_idx: Index where Quran segments start (after specials)
	special_results: Results for special segments (Isti'adha/Basmala)
	start_pointer: Initial word pointer from anchor voting

	Returns:
	(results, profiling_dict, gap_segments, merged_into, repetition_segments)
	results: List[(matched_text, score, matched_ref, wrap_word_ranges_or_None), ...]
	merged_into: dict mapping consumed segment indices to their target segment index
	repetition_segments: set of segment indices where wraps were detected
	"""
	# NOTE(review): this copy of the function has lost its nested indentation
	# (every line sits at a single tab). Block nesting must be restored from the
	# upstream file before this code can run; statements are left untouched here.
	#
	# Additional return detail (from the code at the end of this function):
	# - profiling_dict always carries the counters num_segments, tier1_*,
	#   tier2_*, consec_reanchors, segments_attempted, segments_passed,
	#   special_merges, transition_skips and phoneme_wraps_detected; the timing
	#   keys (total_time, ref_build_time, dp_*, window_setup_time,
	#   result_build_time) are added only when PHONEME_ALIGNMENT_PROFILING is set.
	# - gap_segments is the set of result indices flagged next to missing words.
	# - special_results=None is treated the same as an empty list.
	#
	# Imports are function-local — presumably to avoid import cycles or to defer
	# loading; TODO confirm against the package layout.
	from .phoneme_matcher import align_segment, get_matched_text
	from .phoneme_matcher_cache import get_chapter_reference
	from .phoneme_anchor import verse_to_word_index, find_anchor_by_voting
	from .ngram_index import get_ngram_index

	# Only import time if profiling enabled
	if PHONEME_ALIGNMENT_PROFILING:
	import time
	total_start = time.perf_counter()
	ref_build_start = time.perf_counter()

	# Build/get cached chapter reference (includes phonemizer call if not cached)
	chapter_ref = get_chapter_reference(detected_surah)

	if PHONEME_ALIGNMENT_PROFILING:
	ref_build_time = time.perf_counter() - ref_build_start

	# Initialize results with special segments
	results = list(special_results) if special_results else []
	# Parallel list: None for specials/failures, (start_word_idx, end_word_idx) for matches
	word_indices = [None] * len(results)

	# Timing accumulators (only used if profiling enabled)
	if PHONEME_ALIGNMENT_PROFILING:
	dp_times = []
	window_setup_total = 0.0
	result_build_total = 0.0

	# Track whether the next segment might have Basmala fused with verse content
	# NOTE(review): SPECIAL_PHONEMES is imported here but not referenced in the
	# visible body.
	from .special_segments import (
	SPECIAL_PHONEMES, SPECIAL_TEXT, TRANSITION_TEXT,
	detect_transition_segment, detect_inter_chapter_specials,
	)
	# r[2] is the matched_ref slot of a result tuple; specials use their name there.
	basmala_already_detected = any(
	r[2] in ("Basmala", "Isti'adha+Basmala") for r in (special_results or [])
	)
	# When no Basmala was found among the specials, the first aligned segment
	# gets the extra Basmala-prefixed alignment attempt further down.
	is_first_after_transition = not basmala_already_detected

	special_merges = 0

	# Transition segment state
	transition_mode = False
	transition_skips = 0
	tahmeed_merge_skip = 0
	merged_into = {} # {consumed_idx: target_idx}

	# Gap and repetition tracking
	gap_segments = set()
	repetition_segments = set()
	transition_expected_pointer = -1 # -1 = no pending check

	def _check_transition_gap(start_word_idx):
	"""Flag missing words at start of new chapter after transition."""
	# Reads and resets the enclosing function's transition_expected_pointer.
	nonlocal transition_expected_pointer
	if transition_expected_pointer < 0:
	return
	if start_word_idx > transition_expected_pointer:
	# Index of the entry most recently appended to word_indices: callers
	# append to word_indices before calling this helper.
	seg_idx = len(word_indices) - 1
	gap_segments.add(seg_idx)
	gap = start_word_idx - transition_expected_pointer
	print(f" [GAP] {gap} word(s) missing at start of chapter after transition "
	f"(expected word {transition_expected_pointer}, got {start_word_idx})")
	transition_expected_pointer = -1

	# Process Quran segments with phoneme alignment
	pointer = start_pointer
	num_segments = 0
	consecutive_failures = 0
	skip_count = 0
	pending_specials = []
	tier1_attempts = 0
	tier1_passed = 0
	tier1_segments = []
	tier2_attempts = 0
	tier2_passed = 0
	tier2_segments = []
	consec_reanchors = 0
	segments_attempted = 0
	segments_passed = 0

	# i is relative to first_quran_idx; the absolute index into phoneme_texts is
	# first_quran_idx + i.
	for i, asr_phonemes in enumerate(phoneme_texts[first_quran_idx:]):
	# Handle segments consumed by inter-chapter special detection
	if skip_count > 0:
	# Emit the queued special results in order, one per consumed segment.
	results.append(pending_specials.pop(0))
	word_indices.append(None)
	skip_count -= 1
	continue

	# Handle segments consumed by Tahmeed merge (sami'a + rabbana in separate segments)
	if tahmeed_merge_skip > 0:
	# This segment's audio was merged into the previous Tahmeed segment
	results.append(("", 0.0, "", None))
	word_indices.append(None)
	tahmeed_merge_skip -= 1
	transition_skips += 1
	continue

	segment_idx = first_quran_idx + i + 1 # 1-indexed for display
	segments_attempted += 1

	# Transition mode: keep checking for transitions before trying alignment
	if transition_mode:
	trans_name, trans_conf = detect_transition_segment(asr_phonemes)
	if trans_name:
	print(f" [TRANSITION-MODE] Segment {segment_idx}: {trans_name} (conf={trans_conf:.2f})")
	_dc = get_debug_collector()
	if _dc is not None:
	_dc.add_event("transition_detected", segment_idx=segment_idx,
	transition_type=trans_name, confidence=round(trans_conf, 4),
	context="transition_mode")
	results.append((TRANSITION_TEXT[trans_name], trans_conf, trans_name, None))
	word_indices.append(None)
	transition_skips += 1

	# Tahmeed peek-ahead for merge
	if trans_name == "Tahmeed":
	# Absolute index (into phoneme_texts) of the segment after this one.
	next_abs = first_quran_idx + i + 1
	if next_abs < len(phoneme_texts) and phoneme_texts[next_abs]:
	resp_name, resp_conf = detect_transition_segment(
	phoneme_texts[next_abs], allowed={"Tahmeed"})
	if resp_name:
	merged_into[next_abs] = first_quran_idx + i
	tahmeed_merge_skip = 1
	print(f" [TAHMEED-MERGE] Next segment merged into Tahmeed")
	if _dc is not None:
	_dc.add_event("tahmeed_merge", segment_idx=segment_idx,
	merged_segment=next_abs)

	continue
	else:
	# Exit transition mode, global reanchor
	transition_mode = False
	print(f" [TRANSITION-MODE] Exiting at segment {segment_idx}, running global reanchor...")
	# The slice starts at the current segment (no +1): it still has to be
	# aligned below, so it takes part in the anchor vote.
	remaining_idx = first_quran_idx + i
	remaining_texts = phoneme_texts[remaining_idx:]
	if remaining_texts:
	reanchor_surah, reanchor_ayah = find_anchor_by_voting(
	remaining_texts, get_ngram_index(), ANCHOR_SEGMENTS,
	)
	if reanchor_surah > 0:
	if reanchor_surah != detected_surah:
	detected_surah = reanchor_surah
	chapter_ref = get_chapter_reference(detected_surah)
	pointer = verse_to_word_index(chapter_ref, reanchor_ayah)
	transition_expected_pointer = pointer
	print(f" [GLOBAL-REANCHOR] Jumped to Surah {detected_surah}, "
	f"Ayah {reanchor_ayah}, word {pointer}")
	_dc = get_debug_collector()
	if _dc is not None:
	_dc.add_event("reanchor", at_segment=segment_idx,
	reason="transition_mode_exit",
	new_surah=detected_surah,
	new_ayah=reanchor_ayah, new_pointer=pointer)
	consecutive_failures = 0
	# Fall through to normal alignment below

	# Primary alignment attempt at the current pointer.
	alignment, timing = align_segment(asr_phonemes, chapter_ref, pointer, segment_idx)
	num_segments += 1

	# Accumulate timing if profiling enabled
	if PHONEME_ALIGNMENT_PROFILING:
	dp_times.append(timing['dp_time'])
	window_setup_total += timing['window_setup_time']
	result_build_total += timing['result_build_time']

	# Debug collector: primary alignment attempt
	_dc = get_debug_collector()
	if _dc is not None:
	_dc.add_alignment_result(
	segment_idx, asr_phonemes,
	window={"pointer": pointer, "surah": detected_surah},
	expected_pointer=pointer,
	result=_debug_alignment_result(alignment, chapter_ref),
	timing=timing,
	)

	# Chapter transition: pointer past end of chapter
	if alignment is None and pointer >= chapter_ref.num_words:
	# Slice starting at the current segment; re-sliced below if Amin is consumed.
	remaining_phonemes = phoneme_texts[first_quran_idx + i:]
	amin_consumed = 0

	if chapter_ref.surah == 1:
	# Check for Amin after Al-Fatiha before inter-chapter specials
	amin_name, amin_conf = detect_transition_segment(
	asr_phonemes, allowed={"Amin"})
	if amin_name:
	print(f" [AMIN] Detected after Surah 1 (conf={amin_conf:.2f})")
	results.append((TRANSITION_TEXT["Amin"], amin_conf, "Amin", None))
	word_indices.append(None)
	transition_skips += 1
	amin_consumed = 1
	# Re-slice remaining phonemes to start after Amin
	remaining_phonemes = phoneme_texts[first_quran_idx + i + 1:]

	# inter_specials: result tuples for detected specials; num_consumed: how
	# many leading segments of remaining_phonemes they account for.
	inter_specials, num_consumed = detect_inter_chapter_specials(remaining_phonemes)

	if chapter_ref.surah == 1:
	# After Al-Fatiha, the next chapter could be anything — global reanchor
	print(f" [CHAPTER-END] Surah 1 complete at segment {segment_idx}, "
	f"running global reanchor...")
	_dc = get_debug_collector()
	if _dc is not None:
	_dc.add_event("chapter_end", at_segment=segment_idx,
	from_surah=1, next_action="global_reanchor")

	# Use segments after Amin + specials for anchor voting
	anchor_offset = first_quran_idx + i + amin_consumed + num_consumed
	anchor_remaining = phoneme_texts[anchor_offset:]

	reanchor_surah, reanchor_ayah = find_anchor_by_voting(
	anchor_remaining, get_ngram_index(), ANCHOR_SEGMENTS,
	)

	if reanchor_surah > 0:
	next_surah = reanchor_surah
	chapter_ref = get_chapter_reference(next_surah)
	pointer = verse_to_word_index(chapter_ref, reanchor_ayah)
	# Don't set transition_expected_pointer — after Surah 1 the next
	# chapter is arbitrary (global reanchor), so gaps are expected.
	print(f" [GLOBAL-REANCHOR] Anchored to Surah {next_surah}, "
	f"Ayah {reanchor_ayah}, word {pointer}")
	else:
	# Fallback: assume chapter 2
	next_surah = 2
	chapter_ref = get_chapter_reference(next_surah)
	pointer = 0
	print(f" [GLOBAL-REANCHOR] No anchor found, falling back to Surah 2")
	else:
	# Any chapter other than Surah 1: move on to the next surah in order.
	next_surah = chapter_ref.surah + 1
	if next_surah > 114:
	pass # No more chapters — fall through to failure handling
	else:
	# Check for transition before committing to next sequential surah
	if num_consumed == 0:
	trans_name, trans_conf = detect_transition_segment(asr_phonemes)
	if trans_name:
	print(f" [CHAPTER-END-TRANSITION] Segment {segment_idx}: {trans_name} "
	f"at end of Surah {chapter_ref.surah} (conf={trans_conf:.2f})")
	results.append((TRANSITION_TEXT[trans_name], trans_conf, trans_name, None))
	word_indices.append(None)
	transition_skips += 1
	transition_mode = True
	detected_surah = next_surah
	chapter_ref = get_chapter_reference(next_surah)
	pointer = 0
	transition_expected_pointer = 0
	consecutive_failures = 0
	continue

	print(f" [CHAPTER-END] Surah {chapter_ref.surah} complete at segment {segment_idx}, "
	f"transitioning to Surah {next_surah}")
	_dc = get_debug_collector()
	if _dc is not None:
	_dc.add_event("chapter_transition", at_segment=segment_idx,
	from_surah=chapter_ref.surah, to_surah=next_surah)
	chapter_ref = get_chapter_reference(next_surah)
	pointer = 0
	transition_expected_pointer = 0

	# False when the sequential branch above computed next_surah > 114.
	# NOTE(review): how many of the following statements this guard covers
	# cannot be recovered from this copy — confirm against upstream.
	if next_surah <= 114:
	detected_surah = next_surah
	consecutive_failures = 0

	if amin_consumed > 0:
	# Current segment was Amin (already appended above).
	# Queue inter-chapter specials for subsequent segments.
	has_basmala = any(s[2] in ("Basmala", "Isti'adha+Basmala") for s in inter_specials)
	is_first_after_transition = not has_basmala
	if num_consumed > 0:
	pending_specials = list(inter_specials)
	skip_count = num_consumed
	else:
	is_first_after_transition = True
	continue

	if num_consumed > 0:
	has_basmala = any(s[2] in ("Basmala", "Isti'adha+Basmala") for s in inter_specials)
	is_first_after_transition = not has_basmala
	# Current segment is a special — append its result
	results.append(inter_specials[0])
	word_indices.append(None)
	# Queue remaining specials for subsequent segments
	if num_consumed > 1:
	pending_specials = list(inter_specials[1:])
	skip_count = num_consumed - 1

	continue
	else:
	is_first_after_transition = True
	# No specials — re-try alignment on this segment against the new chapter
	alignment, timing = align_segment(asr_phonemes, chapter_ref, pointer, segment_idx)
	num_segments += 1
	if PHONEME_ALIGNMENT_PROFILING:
	dp_times.append(timing['dp_time'])
	window_setup_total += timing['window_setup_time']
	result_build_total += timing['result_build_time']
	# Fall through to existing if/else below

	# Basmala-fused retry: if this is the first segment after a transition
	# and Basmala wasn't detected, the reciter may have merged Basmala with
	# the first verse. Always try prepending Basmala phonemes to R and pick
	# the better result (even if the plain alignment already succeeded).
	if is_first_after_transition:
	is_first_after_transition = False

	basmala_alignment, basmala_timing = align_segment(
	asr_phonemes, chapter_ref, pointer, segment_idx,
	basmala_prefix=True)
	num_segments += 1
	if PHONEME_ALIGNMENT_PROFILING:
	dp_times.append(basmala_timing['dp_time'])
	window_setup_total += basmala_timing['window_setup_time']
	result_build_total += basmala_timing['result_build_time']

	if basmala_alignment and basmala_alignment.basmala_consumed:
	# A failed plain alignment counts as confidence 0.0 for the comparison.
	existing_conf = alignment.confidence if alignment else 0.0
	if basmala_alignment.confidence > existing_conf:
	matched_text = SPECIAL_TEXT["Basmala"] + " " + get_matched_text(chapter_ref, basmala_alignment)
	result = (matched_text, basmala_alignment.confidence, basmala_alignment.matched_ref,
	basmala_alignment.wrap_word_ranges)
	pointer = basmala_alignment.end_word_idx + 1
	consecutive_failures = 0
	word_indices.append((basmala_alignment.start_word_idx, basmala_alignment.end_word_idx))
	_check_transition_gap(basmala_alignment.start_word_idx)
	if basmala_alignment.n_wraps > 0:
	# len(results) is the index this result occupies once appended below.
	repetition_segments.add(len(results))
	results.append(result)
	special_merges += 1
	segments_passed += 1
	print(f" [BASMALA-FUSED] Segment {segment_idx}: Basmala merged with verse "
	f"(fused conf={basmala_alignment.confidence:.2f} > plain conf={existing_conf:.2f})")
	_dc = get_debug_collector()
	if _dc is not None:
	_dc.add_event("basmala_fused", segment_idx=segment_idx,
	fused_conf=round(basmala_alignment.confidence, 4),
	plain_conf=round(existing_conf, 4), chose="fused")
	continue
	# Basmala-fused didn't win — fall through with original alignment

	if alignment:
	is_first_after_transition = False
	matched_text = get_matched_text(chapter_ref, alignment)
	result = (matched_text, alignment.confidence, alignment.matched_ref,
	alignment.wrap_word_ranges)
	pointer = alignment.end_word_idx + 1 # Advance pointer
	consecutive_failures = 0
	word_indices.append((alignment.start_word_idx, alignment.end_word_idx))
	_check_transition_gap(alignment.start_word_idx)
	if alignment.n_wraps > 0:
	# len(results) is the index this result occupies once it is appended
	# by the results.append(result) that closes the loop body.
	repetition_segments.add(len(results))
	segments_passed += 1
	else:
	# === Check for transition segment before retry tiers ===
	trans_name, trans_conf = detect_transition_segment(asr_phonemes)
	if trans_name:
	print(f" [TRANSITION] Segment {segment_idx}: {trans_name} (conf={trans_conf:.2f})")
	_dc = get_debug_collector()
	if _dc is not None:
	_dc.add_event("transition_detected", segment_idx=segment_idx,
	transition_type=trans_name, confidence=round(trans_conf, 4),
	context="pre_retry")
	result = (TRANSITION_TEXT[trans_name], trans_conf, trans_name, None)
	word_indices.append(None)
	transition_skips += 1
	# Later segments are checked for further transitions before alignment.
	transition_mode = True

	# Tahmeed peek-ahead for merge
	if trans_name == "Tahmeed":
	next_abs = first_quran_idx + i + 1
	if next_abs < len(phoneme_texts) and phoneme_texts[next_abs]:
	resp_name, resp_conf = detect_transition_segment(
	phoneme_texts[next_abs], allowed={"Tahmeed"})
	if resp_name:
	merged_into[next_abs] = first_quran_idx + i
	tahmeed_merge_skip = 1
	print(f" [TAHMEED-MERGE] Next segment merged into Tahmeed")
	if _dc is not None:
	_dc.add_event("tahmeed_merge", segment_idx=segment_idx,
	merged_segment=next_abs)

	results.append(result)
	continue

	# === Graduated retry ===
	# Tier 1: expanded window, same threshold
	tier1_attempts += 1
	tier1_segments.append(segment_idx)
	alignment, timing = align_segment(
	asr_phonemes, chapter_ref, pointer, segment_idx,
	lookback_override=RETRY_LOOKBACK_WORDS,
	lookahead_override=RETRY_LOOKAHEAD_WORDS,
	)
	num_segments += 1
	if PHONEME_ALIGNMENT_PROFILING:
	dp_times.append(timing['dp_time'])
	window_setup_total += timing['window_setup_time']
	result_build_total += timing['result_build_time']

	# Tier 2: expanded window + relaxed threshold
	tier2_entered = False
	if alignment is None:
	tier2_entered = True
	tier2_attempts += 1
	tier2_segments.append(segment_idx)
	alignment, timing = align_segment(
	asr_phonemes, chapter_ref, pointer, segment_idx,
	lookback_override=RETRY_LOOKBACK_WORDS,
	lookahead_override=RETRY_LOOKAHEAD_WORDS,
	max_edit_distance_override=MAX_EDIT_DISTANCE_RELAXED,
	)
	num_segments += 1
	if PHONEME_ALIGNMENT_PROFILING:
	dp_times.append(timing['dp_time'])
	window_setup_total += timing['window_setup_time']
	result_build_total += timing['result_build_time']

	if alignment:
	# Retry succeeded
	is_first_after_transition = False
	matched_text = get_matched_text(chapter_ref, alignment)
	result = (matched_text, alignment.confidence, alignment.matched_ref,
	alignment.wrap_word_ranges)
	pointer = alignment.end_word_idx + 1
	consecutive_failures = 0
	word_indices.append((alignment.start_word_idx, alignment.end_word_idx))
	_check_transition_gap(alignment.start_word_idx)
	if alignment.n_wraps > 0:
	repetition_segments.add(len(results))
	segments_passed += 1
	tier_name = "tier2" if tier2_entered else "tier1"
	if tier2_entered:
	tier2_passed += 1
	else:
	tier1_passed += 1
	print(f" [RETRY-OK] Segment {segment_idx}: recovered via expanded window/relaxed threshold")
	_dc = get_debug_collector()
	if _dc is not None:
	# NOTE(review): pointer was already advanced above, so pointer - 1 equals
	# alignment.end_word_idx here rather than the pointer the retry started
	# from — confirm this is the value the debug record is meant to carry.
	_dc.add_alignment_result(
	segment_idx, asr_phonemes,
	window={"pointer": pointer - 1, "surah": detected_surah},
	expected_pointer=pointer - 1,
	result=_debug_alignment_result(alignment, chapter_ref),
	timing=timing, retry_tier=tier_name,
	)
	_dc.add_event(f"retry_{tier_name}", segment_idx=segment_idx,
	passed=True, confidence=round(alignment.confidence, 4))
	else:
	# Real failure after all retries
	result = ("", 0.0, "", None)
	consecutive_failures += 1
	word_indices.append(None)
	_dc = get_debug_collector()
	if _dc is not None:
	_dc.add_event("retry_failed", segment_idx=segment_idx,
	tier1=True, tier2=tier2_entered)

	if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
	consec_reanchors += 1
	# Global re-anchor (not constrained to current surah)
	# The slice starts after the current (failed) segment, hence the +1.
	remaining_idx = first_quran_idx + i + 1
	remaining_texts = phoneme_texts[remaining_idx:]
	if remaining_texts:
	reanchor_surah, reanchor_ayah = find_anchor_by_voting(
	remaining_texts, get_ngram_index(), ANCHOR_SEGMENTS,
	)
	if reanchor_surah > 0:
	if reanchor_surah != detected_surah:
	detected_surah = reanchor_surah
	chapter_ref = get_chapter_reference(detected_surah)
	pointer = verse_to_word_index(chapter_ref, reanchor_ayah)
	transition_expected_pointer = pointer
	print(f" [GLOBAL-REANCHOR] Jumped to Surah {detected_surah}, "
	f"Ayah {reanchor_ayah}, word {pointer}")
	_dc = get_debug_collector()
	if _dc is not None:
	_dc.add_event("reanchor", at_segment=segment_idx,
	reason="consecutive_failures",
	new_surah=detected_surah,
	new_ayah=reanchor_ayah, new_pointer=pointer)
	consecutive_failures = 0

	# Shared append for the paths above that did not `continue` after appending.
	results.append(result)

	# Post-processing: detect consecutive segments with reference gaps
	# (gap_segments may already have entries from chapter-transition checks above)
	prev_matched_idx = None
	for idx in range(len(results)):
	# Entries without word indices (specials, transitions, failures) are skipped.
	if word_indices[idx] is None:
	continue

	if prev_matched_idx is not None:
	# Skip gap check across chapter transitions — word indices are per-chapter
	# The surah is taken as the text before the first ":" of matched_ref
	# (assumes refs are colon-separated with the surah first — TODO confirm).
	prev_ref = results[prev_matched_idx][2]
	curr_ref = results[idx][2]
	prev_surah = prev_ref.split(":")[0] if prev_ref and ":" in prev_ref else None
	curr_surah = curr_ref.split(":")[0] if curr_ref and ":" in curr_ref else None

	if prev_surah is not None and prev_surah == curr_surah:
	prev_end = word_indices[prev_matched_idx][1]
	curr_start = word_indices[idx][0]
	# Number of reference words strictly between the two matched ranges.
	gap = curr_start - prev_end - 1

	if gap > 0:
	gap_segments.add(prev_matched_idx)
	gap_segments.add(idx)

	print(f" [GAP] {gap} word(s) missing between segments "
	f"{prev_matched_idx + 1} and {idx + 1}")
	_dc = get_debug_collector()
	if _dc is not None:
	_dc.add_event("gap", position="between",
	segment_before=prev_matched_idx + 1,
	segment_after=idx + 1, missing_words=gap)

	prev_matched_idx = idx

	# Edge case: missing words at start of expected range
	# NOTE(review): the comparison uses the start_pointer argument even if the
	# chapter changed before the first match — confirm that is intended.
	first_matched = next((i for i, w in enumerate(word_indices) if w is not None), None)
	if first_matched is not None:
	first_start = word_indices[first_matched][0]
	if first_start > start_pointer:
	gap_segments.add(first_matched)
	gap_count = first_start - start_pointer
	print(f" [GAP] {gap_count} word(s) missing before first segment {first_matched + 1}")
	_dc = get_debug_collector()
	if _dc is not None:
	_dc.add_event("gap", position="before_first",
	segment_idx=first_matched + 1, missing_words=gap_count)

	# Edge case: missing words at end of current verse
	# Only flag if the last matched segment is also the final segment overall.
	# If there are trailing no-match segments after it, those account for the
	# remaining audio — the words aren't missing, they just failed to align.
	# Compare against the verse boundary (not chapter end), since a recitation
	# doesn't necessarily cover the entire chapter.
	last_matched = next((i for i in range(len(word_indices) - 1, -1, -1) if word_indices[i] is not None), None)
	if last_matched is not None and last_matched == len(word_indices) - 1:
	last_end = word_indices[last_matched][1]
	# chapter_ref here is whatever reference the main loop last assigned.
	last_ayah = chapter_ref.words[last_end].ayah
	# Find the last word index that belongs to the same verse
	verse_end = last_end
	while verse_end + 1 < chapter_ref.num_words and chapter_ref.words[verse_end + 1].ayah == last_ayah:
	verse_end += 1
	if last_end < verse_end:
	gap_segments.add(last_matched)
	gap_count = verse_end - last_end
	print(f" [GAP] {gap_count} word(s) missing after last segment {last_matched + 1}")
	_dc = get_debug_collector()
	if _dc is not None:
	_dc.add_event("gap", position="after_last",
	segment_idx=last_matched + 1, missing_words=gap_count)

	# Build profiling dict
	# Both variants carry the same counters; the profiling variant adds timings.
	if PHONEME_ALIGNMENT_PROFILING:
	total_time = time.perf_counter() - total_start
	profiling = {
	"total_time": total_time,
	"ref_build_time": ref_build_time,
	"dp_total_time": sum(dp_times),
	"dp_min_time": min(dp_times) if dp_times else 0.0,
	"dp_max_time": max(dp_times) if dp_times else 0.0,
	"window_setup_time": window_setup_total,
	"result_build_time": result_build_total,
	"num_segments": num_segments,
	"tier1_attempts": tier1_attempts,
	"tier1_passed": tier1_passed,
	"tier1_segments": tier1_segments,
	"tier2_attempts": tier2_attempts,
	"tier2_passed": tier2_passed,
	"tier2_segments": tier2_segments,
	"consec_reanchors": consec_reanchors,
	"segments_attempted": segments_attempted,
	"segments_passed": segments_passed,
	"special_merges": special_merges,
	"transition_skips": transition_skips,
	"phoneme_wraps_detected": len(repetition_segments),
	}
	else:
	profiling = {
	"num_segments": num_segments,
	"tier1_attempts": tier1_attempts,
	"tier1_passed": tier1_passed,
	"tier1_segments": tier1_segments,
	"tier2_attempts": tier2_attempts,
	"tier2_passed": tier2_passed,
	"tier2_segments": tier2_segments,
	"consec_reanchors": consec_reanchors,
	"segments_attempted": segments_attempted,
	"segments_passed": segments_passed,
	"special_merges": special_merges,
	"transition_skips": transition_skips,
	"phoneme_wraps_detected": len(repetition_segments),
	}

	return results, profiling, gap_segments, merged_into, repetition_segments