# Quran-multi-aligner / config.py
# (Hugging Face Hub page header captured with this file: uploaded by hetchyy
#  via huggingface_hub, commit 97471e8 verified)
"""
Configuration settings for the Segments App.
"""
import os
from pathlib import Path
# HF Spaces detection
IS_HF_SPACE = os.environ.get("SPACE_ID") is not None
DEV_TAB_VISIBLE = not IS_HF_SPACE
# Get project root directory
PROJECT_ROOT = Path(__file__).parent.absolute()
# Port for local development
PORT = 6902
# =============================================================================
# Audio settings
# =============================================================================
RESAMPLE_TYPE = "soxr_lq"  # Resampler identifier — presumably librosa's soxr low-quality preset; confirm against audio loader
SEGMENT_AUDIO_DIR = Path("/tmp/segments")  # WAV files written here per request
URL_DOWNLOAD_DIR = Path("/tmp/url_downloads")  # Audio downloaded from URLs via yt-dlp
DEFAULT_INPUT_MODE = "Upload"  # "Link", "Upload", or "Record"
DELETE_CACHE_FREQUENCY = 3600*5  # Gradio cache cleanup interval (seconds) — 5 hours
DELETE_CACHE_AGE = 3600*5  # Delete cached files older than this (seconds) — 5 hours
# =============================================================================
# Session API settings
# =============================================================================
SESSION_DIR = Path("/tmp/aligner_sessions")  # Per-session cached data (audio, VAD, metadata)
SESSION_EXPIRY_SECONDS = 3600*5  # 5 hours — matches DELETE_CACHE_AGE
# =============================================================================
# CPU dispatch strategy — which path runs @gpu_with_fallback funcs when the
# user selects device=CPU (or when GPU quota is exhausted).
# =============================================================================
# Routing (valid values for CPU_STRATEGY):
#   "subprocess" — spawn a local subprocess on the main Space (fast on zero-a10g,
#                  ~10s for 112.mp3 Base; isolates CUDA state). Requires main
#                  Space hardware to be ZeroGPU-capable.
#   "workers"    — dispatch to remote CPU Spaces listed in WORKER_SPACES
#                  (isolates load off main; ~40–80s per request on cpu-basic).
#   "both"       — prefer local subprocess (concurrency 1), overflow to remote.
#                  NOT IMPLEMENTED yet — reserved for future orchestration work.
CPU_STRATEGY = os.environ.get("CPU_STRATEGY", "subprocess").lower()
# Max seconds a subprocess CPU job can run before SIGKILL (used by "subprocess" and "both" strategies).
CPU_SUBPROCESS_TIMEOUT = int(os.environ.get("CPU_SUBPROCESS_TIMEOUT", str(3600 * 2)))  # default 2 hours
# Max concurrent CPU subprocesses on the main Space.
CPU_SUBPROCESS_CONCURRENCY = int(os.environ.get("CPU_SUBPROCESS_CONCURRENCY", "2"))
# CPU_WORKER_MODE — when CPU_STRATEGY="subprocess", chooses between:
#   "spawn"      — legacy: fork a fresh subprocess per request (cpu_subprocess.py).
#   "persistent" — new: route to a pool of long-lived workers (cpu_worker_pool.py).
# Semaphore capacity stays = CPU_SUBPROCESS_CONCURRENCY either way.
CPU_WORKER_MODE = os.environ.get("CPU_WORKER_MODE", "persistent").lower()
# Whether the persistent pool preloads ASR Large at boot. If False, Large is
# loaded on-demand inside the worker and cached there. Env value "1" = True.
CPU_POOL_PRELOAD_LARGE = os.environ.get("CPU_POOL_PRELOAD_LARGE", "1") == "1"
# Model dtype for CPU inference.
#   "bfloat16" — default. Routes attention through PyTorch's chunked CPU flash
#                kernel (`_scaled_dot_product_flash_attention_for_cpu`), which
#                does NOT materialise the full `(batch, heads, seq, seq)` QK^T
#                tensor per layer. Avoids the L3-cache cliff that fp16 triggers
#                at large batch shapes (observed 24× slowdown on 22 min audio).
#                Measured ~32% faster than fp16 on the same input.
#   "float16"  — fast on CPUs with AVX512_FP16 (zero-a10g host) but CATASTROPHIC
#                on CPUs without it (cpu-basic workers: 10-100× slower) AND
#                hits the cache cliff at large batches even on supported CPUs.
#   "float32"  — safe fallback, ~2× slower than bf16 on modern hosts.
CPU_DTYPE = os.environ.get("CPU_DTYPE", "bfloat16").lower()
# =============================================================================
# CPU worker pool settings (remote dispatch to duplicate CPU Spaces)
# =============================================================================
# NOTE(review): several env-var names below differ from the constant names
# (WORKER_SPACES, CPU_TRANSPORT, CPU_WORKER_TIMEOUT, CPU_WORKER_MAX_QUEUE) —
# keep the env names stable; renaming them would break deployed Spaces.
# Comma-separated HF Space slugs, e.g. "owner/space-a,owner/space-b".
# Empty = no pool (dispatch falls back to local subprocess).
CPU_WORKER_SPACES = os.environ.get("WORKER_SPACES", "").strip()
# Audio encoding on the wire: "float32" | "int16" | "ogg". OGG is ~17x smaller
# than float32 for speech and fastest end-to-end at every tested size.
CPU_WORKER_TRANSPORT_DEFAULT = os.environ.get("CPU_TRANSPORT", "ogg").lower()
# Per-job HTTP read timeout (seconds). Long because CPU pipelines are slow and
# workers may cold-start from sleep. Default 2 hours.
CPU_WORKER_HTTP_TIMEOUT = int(os.environ.get("CPU_WORKER_TIMEOUT", str(3600 * 2)))
# Max wait for a free worker before failing with PoolExhaustedError. User-facing.
CPU_WORKER_ACQUIRE_TIMEOUT = int(os.environ.get("CPU_WORKER_ACQUIRE_TIMEOUT", "900"))  # 15 mins
# Admission control: reject when busy_workers + queued_waiters exceeds this and
# no worker is immediately free. Prevents runaway pile-up under bursty load.
CPU_WORKER_MAX_QUEUE_DEPTH = int(os.environ.get("CPU_WORKER_MAX_QUEUE", "10"))
# Background thread ping interval for unhealthy workers (seconds).
CPU_WORKER_HEALTH_INTERVAL = int(os.environ.get("CPU_WORKER_HEALTH_INTERVAL", "600"))
# Max retry attempts on dispatch failure (retries land on a different worker if available).
CPU_WORKER_MAX_RETRIES = 1
# Idle-read timeout on the SSE stream from a worker. If no bytes arrive within
# this window, either the worker is stuck or the client has disconnected and
# we give the watchdog a chance to abort. Must be > longest silent compute block.
CPU_WORKER_SSE_IDLE_TIMEOUT = int(os.environ.get("CPU_WORKER_SSE_IDLE_TIMEOUT", "120"))
# Client-disconnect poll interval (seconds) for the cancel watchdog thread.
CPU_WORKER_CANCEL_POLL_INTERVAL = float(os.environ.get("CPU_WORKER_CANCEL_POLL_INTERVAL", "2.0"))
# =============================================================================
# Model and data paths
# =============================================================================
# VAD segmenter model (HF Hub repo id)
SEGMENTER_MODEL = "obadx/recitation-segmenter-v2"
# Phoneme ASR models (wav2vec2 CTC) — display name -> HF Hub repo id
PHONEME_ASR_MODELS = {
    "Base": "hetchyy/r15_95m",
    "Large": "hetchyy/r7",
}
PHONEME_ASR_MODEL_DEFAULT = "Base"  # must be a key of PHONEME_ASR_MODELS
DATA_PATH = PROJECT_ROOT / "data"
SURAH_INFO_PATH = DATA_PATH / "surah_info.json"
# Quran script paths — separate scripts for alignment compute vs. UI display
QURAN_SCRIPT_PATH_COMPUTE = DATA_PATH / "qpc_hafs.json"
QURAN_SCRIPT_PATH_DISPLAY = DATA_PATH / "digital_khatt_v2_script.json"
# Pre-built phoneme cache (all 114 chapters)
PHONEME_CACHE_PATH = DATA_PATH / "phoneme_cache.pkl"
# Phoneme n-gram index for anchor detection; file name embeds NGRAM_SIZE so
# changing the size requires a rebuilt index file.
NGRAM_SIZE = 5
NGRAM_INDEX_PATH = DATA_PATH / f"phoneme_ngram_index_{NGRAM_SIZE}.pkl"
# =============================================================================
# ZeroGPU Lease Timings
# =============================================================================
ZEROGPU_MAX_DURATION = 120  # Hard cap enforced by HF ZeroGPU (seconds per lease)
AUDIO_DURATION_WARNING_MINUTES = 300  # Warn user on upload if audio exceeds this (minutes) — 5 hours
def get_vad_duration(minutes):
    """Return the GPU lease duration (seconds) to request for VAD.

    Linear in audio length, padded by a fixed safety buffer and clamped
    to a 3-second floor.

    Args:
        minutes: Audio duration in minutes.
    """
    buffer_seconds = 5  # headroom on top of the measured linear fit
    estimate = 0.28 * minutes + 1.66 + buffer_seconds
    if estimate < 3:
        estimate = 3
    return estimate
def get_asr_duration(minutes, model_name="Base"):
    """Return the GPU lease duration (seconds) to request for ASR.

    Scales linearly with audio length; the Large model uses a steeper
    slope and a bigger safety buffer. Clamped to a 3-second floor.

    Args:
        minutes: Audio duration in minutes.
        model_name: "Base" (default) or "Large".
    """
    if model_name == "Large":
        # slope * minutes + intercept + lease buffer (6.54 s)
        return max(3, 0.0579 * minutes + 1.72 + 6.54)
    # Base model: smaller slope, 4.5 s lease buffer
    return max(3, 0.0198 * minutes + 0.32 + 4.5)
# =============================================================================
# Estimations
# =============================================================================
MFA_PROGRESS_SEGMENT_RATE = 0.05  # seconds per segment for progress bar animation
# Linear runtime estimate coefficients per (device, model) pair.
# NOTE(review): units presumably wall-clock seconds per audio minute (slope)
# plus fixed seconds (intercept) — confirm against the estimator code.
ESTIMATE_GPU_BASE_SLOPE = 0.45
ESTIMATE_GPU_BASE_INTERCEPT = 7.6
ESTIMATE_GPU_LARGE_SLOPE = 0.53
ESTIMATE_GPU_LARGE_INTERCEPT = 7.2
ESTIMATE_CPU_BASE_SLOPE = 11.2
ESTIMATE_CPU_BASE_INTERCEPT = 20.9
ESTIMATE_CPU_LARGE_SLOPE = 25.2
ESTIMATE_CPU_LARGE_INTERCEPT = 24.4
# =============================================================================
# Inference Settings
# =============================================================================
# Batching strategy
BATCHING_STRATEGY = "dynamic"  # "naive" (fixed count) or "dynamic" (seconds + pad waste)
# Naive batching
INFERENCE_BATCH_SIZE = 32  # Fixed segments per batch (used when BATCHING_STRATEGY="naive")
# Dynamic batching constraints
MAX_BATCH_SECONDS = 600  # GPU: max total audio seconds per batch (sum of durations)
MAX_BATCH_SECONDS_CPU = 300  # CPU: tighter cap. SDPA materialises the QK^T tensor per encoder layer
MAX_PAD_WASTE = 0.2  # Max fraction of padded tensor that is wasted (0=no waste, 1=all waste)
MIN_BATCH_SIZE = 8  # Minimum segments per batch (prevents underutilization)
# Model precision (GPU path; the CPU path uses CPU_DTYPE above)
DTYPE = "float16"
TORCH_COMPILE = True  # Apply torch.compile() to GPU models (reduce-overhead mode)
# AOTInductor compilation (ZeroGPU optimization)
AOTI_ENABLED = True  # Enable AOT compilation for VAD model on HF Space
AOTI_MIN_AUDIO_MINUTES = 15  # Min audio duration for dynamic shapes
AOTI_MAX_AUDIO_MINUTES = 90  # Max audio duration for dynamic shapes
AOTI_HUB_ENABLED = True  # Enable Hub persistence (upload/download compiled models)
AOTI_HUB_REPO = "hetchyy/quran-aligner-aoti"  # Hub repo for compiled model cache
# =============================================================================
# Phoneme-based alignment settings
# =============================================================================
ANCHOR_SEGMENTS = 5  # N-gram voting uses first N Quran segments
ANCHOR_RARITY_WEIGHTING = True  # Weight votes by 1/count (rarity); False = equal weight
ANCHOR_RUN_TRIM_RATIO = 0.2  # Trim leading/trailing ayahs whose weight < ratio * max weight in run
ANCHOR_TOP_CANDIDATES = 20  # Evaluate top N surahs by total weight for contiguous run comparison
# Edit operation costs (Levenshtein hyperparameters)
COST_SUBSTITUTION = 1.0  # Default phoneme substitution cost
COST_INSERTION = 1.0  # Insert phoneme from reference (R)
COST_DELETION = 1.0  # Delete phoneme from ASR (P)
# Repetition detection (wraparound DP)
WRAP_PENALTY = 3.5  # Cost per wrap transition in DP
WRAP_SPAN_WEIGHT = 0.1  # Per-word cost for wrap span width (penalizes wide jumps)
MAX_WRAPS = 5  # Max wraps for all segments
# Scoring mode for wraparound candidate selection:
#   "no_subtract" — WRAP_PENALTY stays in the raw cost before normalizing, so wraps
#                   are penalized proportionally to segment length. WRAP_SCORE_COST ignored.
#   "additive"    — WRAP_PENALTY is subtracted from cost before normalizing (so it doesn't
#                   inflate the edit distance), then WRAP_SCORE_COST * k is added to the
#                   final score as a flat per-wrap penalty.
#   "subtract"    — WRAP_PENALTY subtracted from cost before normalizing, but nothing
#                   replaces it. Wraps are essentially free after subtraction — useful
#                   as a debug/baseline mode only.
WRAP_SCORING_MODE = "no_subtract"
WRAP_SCORE_COST = 0.005  # Per-wrap additive penalty in scoring (only used with "additive" mode)
# Alignment thresholds (normalized edit distance: 0 = identical, 1 = completely different)
LOOKBACK_WORDS = 30  # Window words to look back from pointer for starting positions
LOOKAHEAD_WORDS = 10  # Window words to look ahead after expected end position
MAX_EDIT_DISTANCE = 0.25  # Max normalized edit distance for valid ayah match
MAX_SPECIAL_EDIT_DISTANCE = 0.35  # Max normalized edit distance for Basmala/Isti'adha detection
MAX_TRANSITION_EDIT_DISTANCE = 0.45  # Max normalized edit distance for transition segments (Amin/Takbir/Tahmeed)
START_PRIOR_WEIGHT = 0.005  # Penalty per word away from expected position
# Failed Segments — retry tiers widen the search window and relax thresholds
RETRY_LOOKBACK_WORDS = 70  # Expanded lookback for retry tier 1+2
RETRY_LOOKAHEAD_WORDS = 40  # Expanded lookahead for retry tier 1+2
MAX_EDIT_DISTANCE_RELAXED = 0.45  # Relaxed threshold for retry tier 2
MAX_CONSECUTIVE_FAILURES = 2  # Re-anchor within surah after this many DP failures
# Debug output
ANCHOR_DEBUG = False  # Show detailed n-gram voting info (votes, top candidates)
PHONEME_ALIGNMENT_DEBUG = False  # Show detailed alignment info (R, P, edit costs)
PHONEME_ALIGNMENT_PROFILING = True  # Track and log timing breakdown (DP, window setup, etc.)
# =============================================================================
# Segmentation slider settings
# =============================================================================
# Segmentation presets: (min_silence_ms, min_speech_ms, pad_ms)
PRESET_MUJAWWAD = (600, 1500, 300)  # Slow / Mujawwad recitation
PRESET_MURATTAL = (200, 750, 100)  # Normal pace (default)
PRESET_FAST = (75, 750, 40)  # Fast recitation
# Slider ranges (defaults come from PRESET_MURATTAL, so each default must be
# reachable from MIN/MAX with the given STEP)
MIN_SILENCE_MIN = 25
MIN_SILENCE_MAX = 1000
MIN_SILENCE_STEP = 25
MIN_SPEECH_MIN = 500
MIN_SPEECH_MAX = 2000
MIN_SPEECH_STEP = 250
PAD_MIN = 0
PAD_MAX = 300
PAD_STEP = 25
# =============================================================================
# Confidence thresholds for color coding
# =============================================================================
CONFIDENCE_HIGH = 0.8  # >= this: Green
CONFIDENCE_MED = 0.6  # >= this: Yellow, below: Red
REVIEW_SUMMARY_MAX_SEGMENTS = 15  # Max segment numbers to list before truncating
# =============================================================================
# MFA forced alignment (word-level timestamps via HF Space)
# =============================================================================
MFA_SPACE_URL = "https://hetchyy-quran-phoneme-mfa.hf.space"
MFA_TIMEOUT = 240  # seconds per MFA request
MFA_METHOD = "kalpy"  # "kalpy", "align_one", "python_api", "cli"
MFA_BEAM = 15  # Viterbi beam width
MFA_RETRY_BEAM = 40  # Retry beam width (used when initial alignment fails)
MFA_SHARED_CMVN = False  # Compute shared CMVN across batch (kalpy only)
# =============================================================================
# Usage logging (pushed to HF Hub via ParquetScheduler)
# =============================================================================
# Subset naming: HF config per dataset. Bump on breaking column changes
# (column added / dropped / renamed). Patch-level changes stay in the same
# subset and are filtered at read-time via the row-level `schema_version`.
# Flush cadences default to 60 min on prod; dev Space overrides to 1 via env.
# --- Logs dataset (per-request metadata) ---
USAGE_LOG_LOGS_REPO = os.environ.get("USAGE_LOG_LOGS_REPO", "hetchyy/quran-aligner-logs")
USAGE_LOG_LOGS_SUBSET = "v3.1"
USAGE_LOG_SCHEMA_VERSION = "3.1.1"  # row-level tag; also inherited by audio rows
USAGE_LOG_FLUSH_MINUTES = int(os.environ.get("USAGE_LOG_FLUSH_MINUTES", "60"))
# --- Audio dataset (source audio, deduped by content hash) ---
USAGE_LOG_AUDIO_REPO = os.environ.get("USAGE_LOG_AUDIO_REPO", "hetchyy/quran-aligner-audio")
USAGE_LOG_AUDIO_SUBSET = "v3.0"
USAGE_LOG_AUDIO_FLUSH_MINUTES = int(os.environ.get("USAGE_LOG_AUDIO_FLUSH_MINUTES", str(USAGE_LOG_FLUSH_MINUTES)))
# --- Errors dataset (per-error-event rows) ---
USAGE_LOG_ERRORS_REPO = os.environ.get("USAGE_LOG_ERRORS_REPO", "hetchyy/quran-aligner-errors")
USAGE_LOG_ERRORS_SUBSET = "v1.0"
USAGE_LOG_ERRORS_SCHEMA_VERSION = "1.0.0"
USAGE_LOG_ERRORS_FLUSH_MINUTES = int(os.environ.get("USAGE_LOG_ERRORS_FLUSH_MINUTES", str(USAGE_LOG_FLUSH_MINUTES)))
# --- Telemetry dataset (periodic host + CPU pool samples) ---
USAGE_LOG_TELEMETRY_REPO = os.environ.get("USAGE_LOG_TELEMETRY_REPO", "hetchyy/quran-aligner-telemetry")
USAGE_LOG_TELEMETRY_SUBSET = "v1.0"
TELEMETRY_SCHEMA_VERSION = "1.0.3"
TELEMETRY_ENABLED = os.environ.get("TELEMETRY_ENABLED", "1") == "1"
TELEMETRY_SAMPLE_SECONDS = int(os.environ.get("TELEMETRY_SAMPLE_SECONDS", "60"))
TELEMETRY_FLUSH_MINUTES = int(os.environ.get("TELEMETRY_FLUSH_MINUTES", "60"))
# Temporary kill-switch for the per-segment DP replay strings. Set to "1" to disable.
USAGE_LOG_DISABLE_DP_DEBUG = os.environ.get("USAGE_LOG_DISABLE_DP_DEBUG", "0") == "1"
# =============================================================================
# UI settings
# =============================================================================
# Main layout column scales
LEFT_COLUMN_SCALE = 4
RIGHT_COLUMN_SCALE = 6
QURAN_TEXT_SIZE_PX = 24  # Size for Quran text in segment cards
ARABIC_WORD_SPACING = "0.2em"  # Word spacing for Arabic text
# =============================================================================
# Animation settings
# =============================================================================
# Animation granularity
ANIM_GRANULARITIES = ["Words", "Characters"]
ANIM_GRANULARITY_DEFAULT = "Words"
ANIM_WORD_COLOR = "#49c3b3"  # Green highlight for active word
ANIM_STYLE_ROW_SCALES = (2, 6, 1, 1)  # Granularity, Style, Verse Only, Color
ANIM_OPACITY_PREV_DEFAULT = 0.3  # Default "before" opacity
ANIM_OPACITY_AFTER_DEFAULT = 0.3  # Default "after" opacity
ANIM_OPACITY_STEP = 0.1  # Opacity slider step size
# Mega-card text styling sliders
MEGA_WORD_SPACING_MIN = 0.0
MEGA_WORD_SPACING_MAX = 1.0
MEGA_WORD_SPACING_STEP = 0.05
MEGA_WORD_SPACING_DEFAULT = 0.2  # matches ARABIC_WORD_SPACING
MEGA_TEXT_SIZE_MIN = 12
MEGA_TEXT_SIZE_MAX = 60
MEGA_TEXT_SIZE_STEP = 2
MEGA_TEXT_SIZE_DEFAULT = 30  # NOTE(review): previous comment claimed this matches QURAN_TEXT_SIZE_PX, but that is 24 — confirm which value is intended
MEGA_SURAH_LIGATURE_SIZE = 2  # em — surah name ligature font size in megacard
MEGA_LINE_SPACING_MIN = 1.5
MEGA_LINE_SPACING_MAX = 3.0
MEGA_LINE_SPACING_STEP = 0.1
MEGA_LINE_SPACING_DEFAULT = 2  # matches mega-card line-height
# Window engine settings (all modes use the window engine internally)
ANIM_WINDOW_PREV_DEFAULT = 4  # Default number of visible previous words/chars
ANIM_WINDOW_AFTER_DEFAULT = 4  # Default number of visible after words/chars
ANIM_WINDOW_PREV_MIN = 0
ANIM_WINDOW_AFTER_MIN = 0
ANIM_WINDOW_PREV_MAX = 15
ANIM_WINDOW_AFTER_MAX = 15
# Presets map mode names to window engine parameter values
ANIM_DISPLAY_MODE_DEFAULT = "Reveal"
ANIM_DISPLAY_MODES = ["Reveal", "Fade", "Spotlight", "Isolate", "Consume", "Custom"]
# Per-mode window-engine parameters, stored compactly as
# (prev_opacity, prev_words, after_opacity, after_words) rows.
# "Custom" is intentionally absent: it keeps whatever the user last set.
_ANIM_PRESET_ROWS = {
    "Reveal":    (1.0, ANIM_WINDOW_PREV_MAX, 0.0, 0),
    "Fade":      (1.0, ANIM_WINDOW_PREV_MAX, 0.3, ANIM_WINDOW_AFTER_MAX),
    "Spotlight": (0.3, ANIM_WINDOW_PREV_MAX, 0.3, ANIM_WINDOW_AFTER_MAX),
    "Isolate":   (0, 0, 0, 0),
    "Consume":   (0, 0, 0.3, ANIM_WINDOW_AFTER_MAX),
}
# Expand the rows into the dict-of-dicts shape the UI code consumes.
ANIM_PRESETS = {
    mode: {
        "prev_opacity": prev_opacity,
        "prev_words": prev_words,
        "after_opacity": after_opacity,
        "after_words": after_words,
    }
    for mode, (prev_opacity, prev_words, after_opacity, after_words)
    in _ANIM_PRESET_ROWS.items()
}