Spaces:

hetchyy
/

quranic-universal-aligner

Running on Zero

App Files Files Community

quranic-universal-aligner / src /core /log_blocks.py

hetchyy

deploy

bfecf3d verified about 1 month ago

raw

history blame contribute delete

15.7 kB

	"""Pure data-assembly for v3 usage-log JSON columns.

	Each build_* function takes explicit inputs (no globals, no thread-locals) so
	it's unit-testable without the pipeline. The pipeline reads thread-locals via
	helper getters in `zero_gpu` / `pipeline.py` and passes the values down.

	Shape spec: `docs/usage-logging-v3.md`.
	"""

	from __future__ import annotations

	from typing import Any, Optional


	def _r(v: Optional[float], n: int = 3) -> Optional[float]:
	    """Round `v` to `n` decimal places, or return None when it cannot be rounded.

	    `None` is passed through unchanged so the value serializes as JSON null.
	    Values that `float()` rejects are mapped to None as well, including
	    integers too large to be represented as a float.

	    Args:
	        v: Value to round; anything accepted by `float()`, or None.
	        n: Number of decimal places handed to `round()`.

	    Returns:
	        The rounded float, or None when `v` is None or not convertible.
	    """
	    if v is None:
	        return None
	    try:
	        return round(float(v), n)
	    except (TypeError, ValueError, OverflowError):
	        # OverflowError: float() on an int beyond the float range.
	        return None


	# ---------------------------------------------------------------------------
	# settings
	# ---------------------------------------------------------------------------

	def build_settings(min_silence_ms: int, min_speech_ms: int, pad_ms: int,
	                   asr_model_id: str, asr_model_label: Optional[str],
	                   device: str, url_source: Optional[str] = None) -> dict:
	    """Assemble the per-row `settings` block.

	    Besides the caller-supplied values, the block snapshots the wrap /
	    edit-distance constants and the segmenter batch size from `config`.
	    Each snapshot is logged as None when `config` (or one of the names it
	    should provide) cannot be read or converted.

	    Returns:
	        Dict with the three VAD timing knobs coerced to int, the ASR model id
	        and label, `device`, `url_source`, `align_config` and
	        `vad_batch_size`.
	    """
	    # Wrap / DP-repetition constants live in config.py. Log them per row so
	    # historical analyses stay interpretable after tuning.
	    align_cfg = None
	    try:
	        from config import (
	            MAX_EDIT_DISTANCE, MAX_EDIT_DISTANCE_RELAXED,
	            WRAP_PENALTY, WRAP_SCORE_COST, WRAP_SPAN_WEIGHT,
	        )
	        # Acceptance thresholds (primary and retry) are logged next to the
	        # wrap constants so threshold-tuning analysis can correlate
	        # dp_debug.norm_dist against the values active for this row.
	        raw_constants = (
	            ("wrap_penalty", WRAP_PENALTY),
	            ("wrap_span_weight", WRAP_SPAN_WEIGHT),
	            ("wrap_score_cost", WRAP_SCORE_COST),
	            ("max_edit_distance", MAX_EDIT_DISTANCE),
	            ("max_edit_distance_relaxed", MAX_EDIT_DISTANCE_RELAXED),
	        )
	        align_cfg = {key: float(value) for key, value in raw_constants}
	    except Exception:
	        align_cfg = None

	    vad_bs = None
	    try:
	        from config import SEGMENTER_BATCH_SIZE
	        # CPU path always runs VAD at bs=1 (see src/segmenter/vad.py).
	        on_cpu = str(device).lower().startswith("cpu")
	        vad_bs = 1 if on_cpu else int(SEGMENTER_BATCH_SIZE)
	    except Exception:
	        vad_bs = None

	    settings = {
	        "min_silence_ms": int(min_silence_ms),
	        "min_speech_ms": int(min_speech_ms),
	        "pad_ms": int(pad_ms),
	        "asr_model": asr_model_id,
	        "asr_model_label": asr_model_label,
	        "device": device,
	        "url_source": url_source,
	        "align_config": align_cfg,
	        "vad_batch_size": vad_bs,
	    }
	    return settings


	# ---------------------------------------------------------------------------
	# timing (unified CPU + GPU + stage block)
	# ---------------------------------------------------------------------------

	def build_timing(profiling, cpu_stats: Optional[dict], worker_dispatch: Optional[dict],
	                 lease_stats: Optional[dict], estimate_given_s: Optional[float],
	                 device: str, estimate_formula_s: Optional[float] = None) -> dict:
	    """Assemble the v3 `timing` block.

	    Profiling attributes are read with `getattr`, so a missing attribute
	    falls back to 0. An attribute that is present but None is logged as null
	    for rounded fields (0 for `num_segments_aligned` and inside the VAD/ASR
	    queue computation) rather than raising.

	    Args:
	        profiling: ProfilingData instance (stage timings + aggregates).
	        cpu_stats: `_CPU_STATS_TLS.info` dict, or None (GPU path or no dispatch).
	        worker_dispatch: `worker_pool._DISPATCH_TLS.info` dict, or None (not
	            using remote pool).
	        lease_stats: `_LEASE_STATS_TLS.info` dict, or None (CPU path — no lease).
	        estimate_given_s: Ceil-to-5 value handed to the user — populates
	            `timing.estimate_given_s`.
	        device: "gpu" / "cpu".
	        estimate_formula_s: Raw formula output pre-ceil — populates
	            `timing.estimate_formula_s`. Lets estimator tuning separate
	            ceiling error from slope/intercept error.

	    Returns:
	        Dict with the lease fields, both estimates, `wall_total_s`, the
	        `stages` / `vad` / `asr` / `dp` sub-blocks, the `cpu` block (None
	        when `cpu_stats` is empty or None) and `worker_dispatch` passed
	        through unchanged.
	    """
	    def _secs(attr: str) -> Optional[float]:
	        # Rounded seconds; a missing attribute counts as 0.0.
	        return _r(getattr(profiling, attr, 0.0))

	    def _millis(attr: str) -> Optional[float]:
	        # Seconds -> rounded milliseconds. None stays None instead of
	        # blowing up on `1000 * None`.
	        secs = getattr(profiling, attr, 0.0)
	        return None if secs is None else _r(1000 * secs)

	    device_lc = device.lower()
	    # A lease is only meaningful on the GPU path and only when stats exist.
	    is_gpu = device_lc == "gpu" and lease_stats is not None
	    lease = (lease_stats or {}) if is_gpu else {}

	    # stages = non-lease, non-per-batch wall buckets (always present)
	    stages = {
	        "resample_s": _secs("resample_time"),
	        "anchor_s": _secs("anchor_time"),
	        "match_wall_s": _secs("match_wall_time"),
	        "result_build_s": _secs("result_build_time"),
	        "result_audio_encode_s": _secs("result_audio_encode_time"),
	    }

	    # VAD / ASR: inference dominates (~99.5%). Stage decomposition collapsed
	    # to wall_s + queue_s only. Per-batch ASR detail (incl QK^T) still lives
	    # in `asr_batches[]` for the L3-cache-cliff signal.
	    vad_wall = getattr(profiling, "vad_wall_time", 0.0) or 0.0
	    vad_gpu = getattr(profiling, "vad_gpu_time", 0.0) or 0.0
	    asr_wall = getattr(profiling, "asr_time", 0.0) or 0.0
	    asr_gpu = getattr(profiling, "asr_gpu_time", 0.0) or 0.0

	    # queue_s is wall minus GPU time, clamped at zero.
	    vad = {
	        "wall_s": _r(vad_wall),
	        "queue_s": _r(max(0.0, vad_wall - vad_gpu)),
	    }
	    asr = {
	        "wall_s": _r(asr_wall),
	        "queue_s": _r(max(0.0, asr_wall - asr_gpu)),
	    }

	    # DP aggregate block
	    dp = {
	        "total_s": _secs("phoneme_dp_total_time"),
	        "avg_ms_per_seg": _millis("phoneme_dp_avg_time"),
	        "min_ms_per_seg": _millis("phoneme_dp_min_time"),
	        "max_ms_per_seg": _millis("phoneme_dp_max_time"),
	        "window_setup_s_total": _secs("phoneme_window_setup_time"),
	        "ref_build_s": _secs("phoneme_ref_build_time"),
	        # `or 0` keeps a None counter from raising in int().
	        "num_segments_aligned": int(getattr(profiling, "phoneme_num_segments", 0) or 0),
	    }

	    if is_gpu:
	        lease_type = lease.get("lease_type")
	    else:
	        # CPU rows log the literal "none"; any other non-leased device logs null.
	        lease_type = "none" if device_lc == "cpu" else None

	    timing = {
	        "lease_type": lease_type,
	        "lease_requested_s": lease.get("requested_s") if is_gpu else None,
	        "lease_cap_hit": lease.get("cap_hit") if is_gpu else None,
	        "estimate_given_s": _r(estimate_given_s),
	        "estimate_formula_s": _r(estimate_formula_s),
	        "wall_total_s": _secs("total_time"),
	        "stages": stages,
	        "vad": vad,
	        "asr": asr,
	        "dp": dp,
	    }

	    # CPU block — local subprocess dispatch (the production CPU path)
	    if cpu_stats:
	        timing["cpu"] = {
	            "strategy": cpu_stats.get("strategy"),
	            "worker_mode": cpu_stats.get("worker_mode"),
	            "dtype": cpu_stats.get("dtype"),
	            "concurrency_cap": cpu_stats.get("concurrency_cap"),
	            "queue_wait_s": _r(cpu_stats.get("queue_wait_s")),
	            "compute_s": _r(cpu_stats.get("compute_s")),
	            "peers_at_acquire": cpu_stats.get("peers_at_acquire"),
	            "peers_at_release": cpu_stats.get("peers_at_release"),
	            "subprocess_spawn_s": _r(cpu_stats.get("subprocess_spawn_s")),
	        }
	    else:
	        timing["cpu"] = None

	    # Remote worker-pool dispatch (when CPU_STRATEGY=workers). Kept alongside
	    # the local `cpu` block; post-hoc analysis can pick whichever is populated.
	    timing["worker_dispatch"] = worker_dispatch
	    return timing


	# ---------------------------------------------------------------------------
	# asr_batches (pass-through of already-shaped profiling.asr_batch_profiling)
	# ---------------------------------------------------------------------------

	def build_asr_batches(profiling) -> list[dict]:
	    """Copy `profiling.asr_batch_profiling` into a fresh list.

	    The entries are passed through as-is (already in v3-compatible shape).
	    A missing, None or empty attribute yields an empty list.
	    """
	    batches = getattr(profiling, "asr_batch_profiling", None)
	    if not batches:
	        return []
	    return list(batches)


	# ---------------------------------------------------------------------------
	# segments (with dp_debug)
	# ---------------------------------------------------------------------------

	def build_segments(seg_infos, debug_collector,
	                   word_counts: list, ayah_spans: list,
	                   all_special_refs: set, include_dp_debug: bool = True,
	                   audio=None, sample_rate: int = 16000,
	                   noise_floor_rms: Optional[float] = None) -> list[dict]:
	    """Per-segment log entries, one row per seg in `seg_infos`.

	    DP-trace lookup uses `seg._original_alignment_idx` (preserved across
	    `_split_fused_segments`) — NOT the post-split enumerate index — so
	    segments that survived a Basmala/Isti'adha split still find their trace.
	    Segments without that attribute fall back to the enumerate index.

	    `debug_collector.to_dp_debug(idx)` is called lazily — only when
	    include_dp_debug is True — to avoid the `|`-separator build cost when
	    disabled via `USAGE_LOG_DISABLE_DP_DEBUG=1`.

	    Args:
	        seg_infos: Segment objects exposing `start_time`, `end_time`,
	            `matched_ref`, `match_score`, `has_repeated_words`,
	            `has_missing_words`, `repeated_ranges` and `repeated_text`.
	        debug_collector: Object providing `to_dp_debug(idx)`, or None.
	        word_counts: Per-segment word counts; rows past its end log 0.
	        ayah_spans: Per-segment ayah spans; rows past its end log 0.
	        all_special_refs: Refs counted as specials; a matching
	            `matched_ref` is echoed in `special_type`.
	        include_dp_debug: When False, `dp_debug` is never attached.
	        audio: Waveform handed to `segment_audio_stats`, or None to skip
	            per-segment audio stats.
	        sample_rate: Sample rate forwarded with `audio`.
	        noise_floor_rms: Clip-level noise floor forwarded for the SNR.

	    Returns:
	        List of row dicts in `seg_infos` order, with 1-based `idx`.
	    """
	    want_trace = include_dp_debug and debug_collector is not None
	    stats_fn = None  # resolved on first use so the import runs at most once
	    rows = []
	    for pos, seg in enumerate(seg_infos):
	        ref = seg.matched_ref
	        row = {
	            "idx": pos + 1,
	            "start": round(seg.start_time, 3),
	            "end": round(seg.end_time, 3),
	            "duration": round(seg.end_time - seg.start_time, 3),
	            "ref": ref or "",
	            "confidence": round(seg.match_score, 3),
	            "word_count": word_counts[pos] if pos < len(word_counts) else 0,
	            "ayah_span": ayah_spans[pos] if pos < len(ayah_spans) else 0,
	            "has_repeated_words": seg.has_repeated_words,
	            "has_missing_words": seg.has_missing_words,
	            "special_type": ref if ref in all_special_refs else None,
	        }
	        # Repetition details are only attached when non-empty.
	        if seg.repeated_ranges:
	            row["repeated_ranges"] = seg.repeated_ranges
	        if seg.repeated_text:
	            row["repeated_text"] = seg.repeated_text
	        if want_trace:
	            trace_idx = getattr(seg, "_original_alignment_idx", None)
	            if trace_idx is None:
	                trace_idx = pos  # fallback for unsplit segments
	            trace = debug_collector.to_dp_debug(trace_idx)
	            if trace is not None:
	                row["dp_debug"] = trace
	        # Per-segment audio stats (3.1.3+) — rms/peak/snr_db computed on
	        # the segment slice. SNR uses clip-level noise floor from the
	        # non-speech VAD concat. Unlocks segment-granular correlations in
	        # 07/08/10 (previously only clip-level `audio_rms` available).
	        if audio is not None:
	            if stats_fn is None:
	                from .audio_analytics import segment_audio_stats as stats_fn
	            row["audio_stats"] = stats_fn(
	                audio, sample_rate,
	                seg.start_time, seg.end_time,
	                noise_floor_rms,
	            )
	        rows.append(row)
	    return rows


	# ---------------------------------------------------------------------------
	# events (pass-through of DebugCollector.events)
	# ---------------------------------------------------------------------------

	def build_events(debug_collector) -> list[dict]:
	    """Return a fresh list copy of `debug_collector.events`.

	    A None collector yields an empty list.
	    """
	    return [] if debug_collector is None else list(debug_collector.events)


	# ---------------------------------------------------------------------------
	# anchor (pass-through of DebugCollector.anchor)
	# ---------------------------------------------------------------------------

	def build_anchor(debug_collector) -> dict:
	    """Return a shallow dict copy of `debug_collector.anchor`.

	    A None collector yields an empty dict.
	    """
	    return {} if debug_collector is None else dict(debug_collector.anchor)


	# ---------------------------------------------------------------------------
	# reciter_stats (adds audio_rms / audio_peak; drops pps)
	# ---------------------------------------------------------------------------

	def build_reciter_stats(wpm: float, avg_seg_dur: float, std_seg_dur: float,
	                        avg_pause_dur: float, std_pause_dur: float,
	                        audio,
	                        audio_analytics: Optional[dict] = None) -> dict:
	    """Assemble the `reciter_stats` block (pace, pauses, amplitude).

	    `audio` is the float32 mono 16kHz waveform already in scope at log time.

	    Amplitude fields come from the `whole` block of `audio_analytics` when
	    that block is present and non-empty (3.1.3+), which avoids a second
	    `audio_rms_peak` pass. Otherwise rms/peak are computed with
	    `audio_rms_peak` (0.0 for both when `audio` is None) and the additive
	    amplitude fields are logged as None.
	    """
	    extra_keys = ("dc_offset", "p99", "p01", "crest", "dyn_range_db")
	    whole = (audio_analytics or {}).get("whole") or {}
	    if not whole:
	        # Analytics were skipped: fall back to the plain rms/peak helper.
	        from .audio_stats import audio_rms_peak
	        if audio is not None:
	            rms, peak = audio_rms_peak(audio)
	        else:
	            rms, peak = (0.0, 0.0)
	        extras = dict.fromkeys(extra_keys)
	    else:
	        rms = whole.get("rms", 0.0)
	        peak = whole.get("peak", 0.0)
	        extras = {key: whole.get(key) for key in extra_keys}

	    stats = {
	        "wpm": _r(wpm, 2),
	        "avg_seg_dur": _r(avg_seg_dur, 2),
	        "std_seg_dur": _r(std_seg_dur, 2),
	        "avg_pause_dur": _r(avg_pause_dur, 2),
	        "std_pause_dur": _r(std_pause_dur, 2),
	        "audio_rms": _r(rms, 5),
	        "audio_peak": _r(peak, 5),
	        "audio_dc_offset": _r(extras["dc_offset"], 6),
	        "audio_p99": _r(extras["p99"], 5),
	        "audio_p01": _r(extras["p01"], 5),
	        "audio_crest": _r(extras["crest"], 3),
	        "audio_dyn_range_db": _r(extras["dyn_range_db"], 2),
	    }
	    return stats


	# ---------------------------------------------------------------------------
	# results_summary
	# ---------------------------------------------------------------------------

	def _parse_detected_surahs(segments) -> list[int]:
	    """Collect the distinct surah numbers of all aligned segments, ascending.

	    The surah is the integer before the first ":" of each segment's
	    `matched_ref`; for a range ref only the part before the first "-" is
	    inspected. Refs without a ":" (specials such as Basmala/Isti'adha/Amin,
	    empty or missing refs) and refs whose prefix is not an integer are
	    skipped.
	    """
	    surahs: set[int] = set()
	    for seg in segments:
	        ref = getattr(seg, "matched_ref", None) or ""
	        if ":" not in ref:
	            # special or no match (also covers the empty ref)
	            continue
	        start_ref = ref.partition("-")[0]  # handle "37:151:3-37:152:2"
	        surah_text = start_ref.partition(":")[0]
	        try:
	            surahs.add(int(surah_text))
	        except ValueError:
	            continue
	    return sorted(surahs)


	def build_results_summary(segments: list, profiling,
	                          total_speech_s: float,
	                          missing_word_count: int) -> dict:
	    """Assemble the `results_summary` block for one run.

	    Confidence aggregates are taken over every segment's `match_score` and
	    are 0.0 when there are no segments. Counters are read from `profiling`;
	    a missing or falsy attribute is logged as 0.
	    """
	    def _count(attr: str) -> int:
	        # Integer counter from profiling; absent / None / 0 all become 0.
	        return int(getattr(profiling, attr, 0) or 0)

	    scores = [seg.match_score for seg in segments]
	    if scores:
	        mean_conf = sum(scores) / len(scores)
	        min_conf = min(scores)
	    else:
	        mean_conf = 0.0
	        min_conf = 0.0
	    passed = _count("segments_passed")

	    summary = {
	        "detected_surahs": _parse_detected_surahs(segments),
	        "num_segments": len(segments),
	        "missing_word_count": int(missing_word_count),
	        "total_speech_s": round(float(total_speech_s), 3),
	        "mean_confidence": _r(mean_conf, 3),
	        "min_confidence": _r(min_conf, 3),
	        "segments_passed": passed,
	        "retry_attempts": _count("retry_attempts"),
	        "retry_passed": _count("retry_passed"),
	        "reanchors": _count("consec_reanchors"),
	        "special_merges": _count("special_merges"),
	        "transition_skips": _count("transition_skips"),
	        "wraps_detected": _count("phoneme_wraps_detected"),
	    }
	    return summary


	# ---------------------------------------------------------------------------
	# gpu_memory (small one-liner — inlined at call site but exposed for parity)
	# ---------------------------------------------------------------------------

	def build_gpu_memory(profiling, device: str) -> Optional[dict]:
	    """Return the `gpu_memory` block, or None when `device` is not "gpu".

	    The device comparison is case-insensitive. Both VRAM figures are read
	    from `profiling` (0.0 when the attribute is missing) and rounded to two
	    decimals.
	    """
	    if device.lower() == "gpu":
	        peak_mb = getattr(profiling, "gpu_peak_vram_mb", 0.0)
	        reserved_mb = getattr(profiling, "gpu_reserved_vram_mb", 0.0)
	        return {
	            "peak_vram_mb": _r(peak_mb, 2),
	            "reserved_vram_mb": _r(reserved_mb, 2),
	        }
	    return None