# Spaces: Running on Zero
"""
Configuration settings for the Segments App.
"""
import os
from pathlib import Path

# HF Spaces detection: treated as "on a Space" whenever the SPACE_ID
# environment variable is present (any value, including empty).
IS_HF_SPACE = os.environ.get("SPACE_ID") is not None
# The dev tab is shown only when NOT running on a Space.
DEV_TAB_VISIBLE = not IS_HF_SPACE

# Project root directory = absolute path of the directory containing this file.
PROJECT_ROOT = Path(__file__).parent.absolute()

# Port for local development
PORT = 6902
# =============================================================================
# Audio settings
# =============================================================================
RESAMPLE_TYPE = "soxr_lq"
SEGMENT_AUDIO_DIR = Path("/tmp/segments")  # WAV files written here per request
URL_DOWNLOAD_DIR = Path("/tmp/url_downloads")  # Audio downloaded from URLs via yt-dlp
DEFAULT_INPUT_MODE = "Upload"  # "Link", "Upload", or "Record"
DELETE_CACHE_FREQUENCY = 3600 * 5  # Gradio cache cleanup interval (seconds)
DELETE_CACHE_AGE = 3600 * 5  # Delete cached files older than this (seconds)

# =============================================================================
# Session API settings
# =============================================================================
SESSION_DIR = Path("/tmp/aligner_sessions")  # Per-session cached data (audio, VAD, metadata)
# 5 hours. Derived from DELETE_CACHE_AGE (rather than repeating the literal) so
# the session lifetime and the cache age cannot drift apart.
SESSION_EXPIRY_SECONDS = DELETE_CACHE_AGE
# =============================================================================
# CPU dispatch strategy — which path runs @gpu_with_fallback funcs when the
# user selects device=CPU (or when GPU quota is exhausted).
# =============================================================================
# Routing:
#   "subprocess" — spawn a local subprocess on the main Space (fast on zero-a10g,
#                  ~10s for 112.mp3 Base; isolates CUDA state). Requires main
#                  Space hardware to be ZeroGPU-capable.
#   "workers"    — dispatch to remote CPU Spaces listed in CPU_WORKER_SPACES
#                  (read from the WORKER_SPACES env var; isolates load off main;
#                  ~40–80s per request on cpu-basic).
#   "both"       — prefer local subprocess (concurrency 1), overflow to remote.
#                  NOT IMPLEMENTED yet — reserved for future orchestration work.
# The value is lower-cased but not validated here.
CPU_STRATEGY = os.environ.get("CPU_STRATEGY", "subprocess").lower()

# Max seconds a subprocess CPU job can run before SIGKILL (used by "subprocess" and "both" strategies).
# Default: 2 hours.
CPU_SUBPROCESS_TIMEOUT = int(os.environ.get("CPU_SUBPROCESS_TIMEOUT", str(3600 * 2)))

# Max concurrent CPU subprocesses on the main Space.
CPU_SUBPROCESS_CONCURRENCY = int(os.environ.get("CPU_SUBPROCESS_CONCURRENCY", "2"))

# CPU_WORKER_MODE — when CPU_STRATEGY="subprocess", chooses between:
#   "spawn"      — legacy: fork a fresh subprocess per request (cpu_subprocess.py).
#   "persistent" — new: route to a pool of long-lived workers (cpu_worker_pool.py).
# Semaphore capacity stays = CPU_SUBPROCESS_CONCURRENCY either way.
CPU_WORKER_MODE = os.environ.get("CPU_WORKER_MODE", "persistent").lower()

# Whether the persistent pool preloads ASR Large at boot. If False, Large is
# loaded on-demand inside the worker and cached there.
# Only the exact string "1" enables it; any other value disables preloading.
CPU_POOL_PRELOAD_LARGE = os.environ.get("CPU_POOL_PRELOAD_LARGE", "1") == "1"

# Model dtype for CPU inference.
#   "bfloat16" — default. Routes attention through PyTorch's chunked CPU flash
#                kernel (`_scaled_dot_product_flash_attention_for_cpu`), which
#                does NOT materialise the full `(batch, heads, seq, seq)` QK^T
#                tensor per layer. Avoids the L3-cache cliff that fp16 triggers
#                at large batch shapes (observed 24× slowdown on 22 min audio).
#                Measured ~32% faster than fp16 on the same input.
#   "float16"  — fast on CPUs with AVX512_FP16 (zero-a10g host) but CATASTROPHIC
#                on CPUs without it (cpu-basic workers: 10-100× slower) AND
#                hits the cache cliff at large batches even on supported CPUs.
#   "float32"  — safe fallback, ~2× slower than bf16 on modern hosts.
CPU_DTYPE = os.environ.get("CPU_DTYPE", "bfloat16").lower()
# =============================================================================
# CPU worker pool settings (remote dispatch to duplicate CPU Spaces)
# =============================================================================
# Comma-separated HF Space slugs, e.g. "owner/space-a,owner/space-b".
# Empty = no pool (dispatch falls back to local subprocess).
# NOTE: the env var is WORKER_SPACES (no CPU_ prefix).
CPU_WORKER_SPACES = os.environ.get("WORKER_SPACES", "").strip()

# Audio encoding on the wire: "float32" | "int16" | "ogg". OGG is ~17x smaller
# than float32 for speech and fastest end-to-end at every tested size.
# NOTE: the env var is CPU_TRANSPORT.
CPU_WORKER_TRANSPORT_DEFAULT = os.environ.get("CPU_TRANSPORT", "ogg").lower()

# Per-job HTTP read timeout (seconds). Long because CPU pipelines are slow and
# workers may cold-start from sleep. Default: 2 hours.
# NOTE: the env var is CPU_WORKER_TIMEOUT.
CPU_WORKER_HTTP_TIMEOUT = int(os.environ.get("CPU_WORKER_TIMEOUT", str(3600 * 2)))

# Max wait (seconds) for a free worker before failing with PoolExhaustedError. User-facing.
CPU_WORKER_ACQUIRE_TIMEOUT = int(os.environ.get("CPU_WORKER_ACQUIRE_TIMEOUT", "900"))  # 15 mins

# Admission control: reject when busy_workers + queued_waiters exceeds this and
# no worker is immediately free. Prevents runaway pile-up under bursty load.
# NOTE: the env var is CPU_WORKER_MAX_QUEUE.
CPU_WORKER_MAX_QUEUE_DEPTH = int(os.environ.get("CPU_WORKER_MAX_QUEUE", "10"))

# Background thread ping interval for unhealthy workers (seconds).
CPU_WORKER_HEALTH_INTERVAL = int(os.environ.get("CPU_WORKER_HEALTH_INTERVAL", "600"))

# Max retry attempts on dispatch failure (retries land on a different worker if available).
# Not overridable via the environment.
CPU_WORKER_MAX_RETRIES = 1

# Idle-read timeout (seconds) on the SSE stream from a worker. If no bytes arrive
# within this window, either the worker is stuck or the client has disconnected
# and we give the watchdog a chance to abort. Must be > longest silent compute block.
CPU_WORKER_SSE_IDLE_TIMEOUT = int(os.environ.get("CPU_WORKER_SSE_IDLE_TIMEOUT", "120"))

# Client-disconnect poll interval (seconds) for the cancel watchdog thread.
CPU_WORKER_CANCEL_POLL_INTERVAL = float(os.environ.get("CPU_WORKER_CANCEL_POLL_INTERVAL", "2.0"))
# =============================================================================
# Model and data paths
# =============================================================================
# VAD segmenter model
SEGMENTER_MODEL = "obadx/recitation-segmenter-v2"

# Phoneme ASR models (wav2vec2 CTC), keyed by the model name shown to users.
PHONEME_ASR_MODELS = {
    "Base": "hetchyy/r15_95m",
    "Large": "hetchyy/r7",
}
PHONEME_ASR_MODEL_DEFAULT = "Base"  # must be a key of PHONEME_ASR_MODELS

# All data files live under <project root>/data.
DATA_PATH = PROJECT_ROOT / "data"
SURAH_INFO_PATH = DATA_PATH / "surah_info.json"

# Quran script paths
QURAN_SCRIPT_PATH_COMPUTE = DATA_PATH / "qpc_hafs.json"
QURAN_SCRIPT_PATH_DISPLAY = DATA_PATH / "digital_khatt_v2_script.json"

# Pre-built phoneme cache (all 114 chapters)
PHONEME_CACHE_PATH = DATA_PATH / "phoneme_cache.pkl"

# Phoneme n-gram index for anchor detection.
# The index filename embeds NGRAM_SIZE, so changing the size selects a different file.
NGRAM_SIZE = 5
NGRAM_INDEX_PATH = DATA_PATH / f"phoneme_ngram_index_{NGRAM_SIZE}.pkl"
# =============================================================================
# ZeroGPU Lease Timings
# =============================================================================
# Hard cap enforced by HF ZeroGPU.
# NOTE(review): presumably seconds, like the lease helpers in this section — confirm.
ZEROGPU_MAX_DURATION = 120
AUDIO_DURATION_WARNING_MINUTES = 300  # Warn user on upload if audio exceeds this (minutes)
def get_vad_duration(minutes: float) -> float:
    """Return the GPU seconds needed for VAD on ``minutes`` of audio.

    The estimate is linear in the audio length (0.28 s per audio minute plus
    1.66 s), padded with a fixed 5-second lease buffer and floored at 3 seconds.

    Args:
        minutes: Audio duration in minutes.

    Returns:
        Seconds of GPU lease to request; never less than 3.
    """
    lease_buffer = 5  # fixed headroom added on top of the linear estimate
    return max(3, 0.28 * minutes + 1.66 + lease_buffer)
def get_asr_duration(minutes: float, model_name: str = "Base") -> float:
    """Return the GPU seconds needed for ASR; scales linearly with audio duration.

    Args:
        minutes: Audio duration in minutes.
        model_name: ``"Large"`` selects the Large-model coefficients; any other
            value (including the default ``"Base"``) uses the Base coefficients.

    Returns:
        Seconds of GPU lease to request; never less than 3.
    """
    if model_name == "Large":
        lease_buffer = 6.54  # headroom for the Large model
        return max(3, 0.0579 * minutes + 1.72 + lease_buffer)

    lease_buffer = 4.5  # headroom for the Base model
    return max(3, 0.0198 * minutes + 0.32 + lease_buffer)
# =============================================================================
# Estimations
# =============================================================================
MFA_PROGRESS_SEGMENT_RATE = 0.05  # seconds per segment for progress bar animation

# Linear estimate coefficients (slope, intercept) per device and ASR model.
# NOTE(review): units are not documented here — presumably seconds per audio
# minute (slope) and seconds (intercept); confirm against the estimator.
ESTIMATE_GPU_BASE_SLOPE = 0.45
ESTIMATE_GPU_BASE_INTERCEPT = 7.6
ESTIMATE_GPU_LARGE_SLOPE = 0.53
ESTIMATE_GPU_LARGE_INTERCEPT = 7.2
ESTIMATE_CPU_BASE_SLOPE = 11.2
ESTIMATE_CPU_BASE_INTERCEPT = 20.9
ESTIMATE_CPU_LARGE_SLOPE = 25.2
ESTIMATE_CPU_LARGE_INTERCEPT = 24.4
# =============================================================================
# Inference Settings
# =============================================================================
# Batching strategy
BATCHING_STRATEGY = "dynamic"  # "naive" (fixed count) or "dynamic" (seconds + pad waste)

# Naive batching
INFERENCE_BATCH_SIZE = 32  # Fixed segments per batch (used when BATCHING_STRATEGY="naive")

# Dynamic batching constraints
MAX_BATCH_SECONDS = 600  # GPU: max total audio seconds per batch (sum of durations)
MAX_BATCH_SECONDS_CPU = 300  # CPU: tighter cap. SDPA materialises the QK^T tensor per encoder layer
MAX_PAD_WASTE = 0.2  # Max fraction of padded tensor that is wasted (0=no waste, 1=all waste)
MIN_BATCH_SIZE = 8  # Minimum segments per batch (prevents underutilization)

# Model precision
DTYPE = "float16"
TORCH_COMPILE = True  # Apply torch.compile() to GPU models (reduce-overhead mode)

# AOTInductor compilation (ZeroGPU optimization)
AOTI_ENABLED = True  # Enable AOT compilation for VAD model on HF Space
AOTI_MIN_AUDIO_MINUTES = 15  # Min audio duration for dynamic shapes
AOTI_MAX_AUDIO_MINUTES = 90  # Max audio duration for dynamic shapes
AOTI_HUB_ENABLED = True  # Enable Hub persistence (upload/download compiled models)
AOTI_HUB_REPO = "hetchyy/quran-aligner-aoti"  # Hub repo for compiled model cache
# =============================================================================
# Phoneme-based alignment settings
# =============================================================================
ANCHOR_SEGMENTS = 5  # N-gram voting uses first N Quran segments
ANCHOR_RARITY_WEIGHTING = True  # Weight votes by 1/count (rarity); False = equal weight
ANCHOR_RUN_TRIM_RATIO = 0.2  # Trim leading/trailing ayahs whose weight < ratio * max weight in run
ANCHOR_TOP_CANDIDATES = 20  # Evaluate top N surahs by total weight for contiguous run comparison

# Edit operation costs (Levenshtein hyperparameters)
COST_SUBSTITUTION = 1.0  # Default phoneme substitution cost
COST_INSERTION = 1.0  # Insert phoneme from reference (R)
COST_DELETION = 1.0  # Delete phoneme from ASR (P)

# Repetition detection (wraparound DP)
WRAP_PENALTY = 3.5  # Cost per wrap transition in DP
WRAP_SPAN_WEIGHT = 0.1  # Per-word cost for wrap span width (penalizes wide jumps)
MAX_WRAPS = 5  # Max wraps for all segments

# Scoring mode for wraparound candidate selection:
#   "no_subtract" — WRAP_PENALTY stays in the raw cost before normalizing, so wraps
#                   are penalized proportionally to segment length. WRAP_SCORE_COST ignored.
#   "additive"    — WRAP_PENALTY is subtracted from cost before normalizing (so it doesn't
#                   inflate the edit distance), then WRAP_SCORE_COST * k is added to the
#                   final score as a flat per-wrap penalty.
#   "subtract"    — WRAP_PENALTY subtracted from cost before normalizing, but nothing
#                   replaces it. Wraps are essentially free after subtraction — useful
#                   as a debug/baseline mode only.
WRAP_SCORING_MODE = "no_subtract"
WRAP_SCORE_COST = 0.005  # Per-wrap additive penalty in scoring (only used with "additive" mode)

# Alignment search window (in words, relative to the current pointer)
LOOKBACK_WORDS = 30  # Window words to look back from pointer for starting positions
LOOKAHEAD_WORDS = 10  # Window words to look ahead after expected end position

# Alignment thresholds (normalized edit distance: 0 = identical, 1 = completely different)
MAX_EDIT_DISTANCE = 0.25  # Max normalized edit distance for valid ayah match
MAX_SPECIAL_EDIT_DISTANCE = 0.35  # Max normalized edit distance for Basmala/Isti'adha detection
MAX_TRANSITION_EDIT_DISTANCE = 0.45  # Max normalized edit distance for transition segments (Amin/Takbir/Tahmeed)
START_PRIOR_WEIGHT = 0.005  # Penalty per word away from expected position

# Failed Segments
RETRY_LOOKBACK_WORDS = 70  # Expanded lookback for retry tier 1+2
RETRY_LOOKAHEAD_WORDS = 40  # Expanded lookahead for retry tier 1+2
MAX_EDIT_DISTANCE_RELAXED = 0.45  # Relaxed threshold for retry tier 2
MAX_CONSECUTIVE_FAILURES = 2  # Re-anchor within surah after this many DP failures

# Debug output
ANCHOR_DEBUG = False  # Show detailed n-gram voting info (votes, top candidates)
PHONEME_ALIGNMENT_DEBUG = False  # Show detailed alignment info (R, P, edit costs)
PHONEME_ALIGNMENT_PROFILING = True  # Track and log timing breakdown (DP, window setup, etc.)
# =============================================================================
# Segmentation slider settings
# =============================================================================
# Segmentation presets: (min_silence_ms, min_speech_ms, pad_ms)
PRESET_MUJAWWAD = (600, 1500, 300)  # Slow / Mujawwad recitation
PRESET_MURATTAL = (200, 750, 100)  # Normal pace (default)
PRESET_FAST = (75, 750, 40)  # Fast recitation

# Slider ranges (defaults come from PRESET_MURATTAL)
MIN_SILENCE_MIN = 25
MIN_SILENCE_MAX = 1000
MIN_SILENCE_STEP = 25

MIN_SPEECH_MIN = 500
MIN_SPEECH_MAX = 2000
MIN_SPEECH_STEP = 250

PAD_MIN = 0
PAD_MAX = 300
PAD_STEP = 25

# =============================================================================
# Confidence thresholds for color coding
# =============================================================================
CONFIDENCE_HIGH = 0.8  # >= this: Green
CONFIDENCE_MED = 0.6  # >= this: Yellow, below: Red
REVIEW_SUMMARY_MAX_SEGMENTS = 15  # Max segment numbers to list before truncating
# =============================================================================
# MFA forced alignment (word-level timestamps via HF Space)
# =============================================================================
MFA_SPACE_URL = "https://hetchyy-quran-phoneme-mfa.hf.space"
MFA_TIMEOUT = 240  # NOTE(review): presumably seconds — confirm at the call site
MFA_METHOD = "kalpy"  # "kalpy", "align_one", "python_api", "cli"
MFA_BEAM = 15  # Viterbi beam width
MFA_RETRY_BEAM = 40  # Retry beam width (used when initial alignment fails)
MFA_SHARED_CMVN = False  # Compute shared CMVN across batch (kalpy only)
# =============================================================================
# Usage logging (pushed to HF Hub via ParquetScheduler)
# =============================================================================
# Subset naming: HF config per dataset. Bump on breaking column changes
# (column added / dropped / renamed). Patch-level changes stay in the same
# subset and are filtered at read-time via the row-level `schema_version`.
# Flush cadences default to 60 min on prod; dev Space overrides to 1 via env.

# --- Logs dataset (per-request metadata) ---
USAGE_LOG_LOGS_REPO = os.environ.get("USAGE_LOG_LOGS_REPO", "hetchyy/quran-aligner-logs")
USAGE_LOG_LOGS_SUBSET = "v3.1"
USAGE_LOG_SCHEMA_VERSION = "3.1.1"  # row-level tag; also inherited by audio rows
USAGE_LOG_FLUSH_MINUTES = int(os.environ.get("USAGE_LOG_FLUSH_MINUTES", "60"))

# --- Audio dataset (source audio, deduped by content hash) ---
USAGE_LOG_AUDIO_REPO = os.environ.get("USAGE_LOG_AUDIO_REPO", "hetchyy/quran-aligner-audio")
USAGE_LOG_AUDIO_SUBSET = "v3.0"
# Falls back to the logs flush cadence when not set explicitly.
USAGE_LOG_AUDIO_FLUSH_MINUTES = int(
    os.environ.get("USAGE_LOG_AUDIO_FLUSH_MINUTES", str(USAGE_LOG_FLUSH_MINUTES))
)

# --- Errors dataset (per-error-event rows) ---
USAGE_LOG_ERRORS_REPO = os.environ.get("USAGE_LOG_ERRORS_REPO", "hetchyy/quran-aligner-errors")
USAGE_LOG_ERRORS_SUBSET = "v1.0"
USAGE_LOG_ERRORS_SCHEMA_VERSION = "1.0.0"
# Falls back to the logs flush cadence when not set explicitly.
USAGE_LOG_ERRORS_FLUSH_MINUTES = int(
    os.environ.get("USAGE_LOG_ERRORS_FLUSH_MINUTES", str(USAGE_LOG_FLUSH_MINUTES))
)

# --- Telemetry dataset (periodic host + CPU pool samples) ---
USAGE_LOG_TELEMETRY_REPO = os.environ.get("USAGE_LOG_TELEMETRY_REPO", "hetchyy/quran-aligner-telemetry")
USAGE_LOG_TELEMETRY_SUBSET = "v1.0"
TELEMETRY_SCHEMA_VERSION = "1.0.3"
TELEMETRY_ENABLED = os.environ.get("TELEMETRY_ENABLED", "1") == "1"  # only "1" enables
TELEMETRY_SAMPLE_SECONDS = int(os.environ.get("TELEMETRY_SAMPLE_SECONDS", "60"))
# Independent default (60); unlike audio/errors it does not inherit USAGE_LOG_FLUSH_MINUTES.
TELEMETRY_FLUSH_MINUTES = int(os.environ.get("TELEMETRY_FLUSH_MINUTES", "60"))

# Temporary kill-switch for the per-segment DP replay strings. Set to "1" to disable.
USAGE_LOG_DISABLE_DP_DEBUG = os.environ.get("USAGE_LOG_DISABLE_DP_DEBUG", "0") == "1"
# =============================================================================
# UI settings
# =============================================================================
# Main layout column scales
LEFT_COLUMN_SCALE = 4
RIGHT_COLUMN_SCALE = 6

QURAN_TEXT_SIZE_PX = 24  # Size for Quran text in segment cards
ARABIC_WORD_SPACING = "0.2em"  # Word spacing for Arabic text

# =============================================================================
# Animation settings
# =============================================================================
# Animation granularity
ANIM_GRANULARITIES = ["Words", "Characters"]
ANIM_GRANULARITY_DEFAULT = "Words"

ANIM_WORD_COLOR = "#49c3b3"  # Green highlight for active word
ANIM_STYLE_ROW_SCALES = (2, 6, 1, 1)  # Granularity, Style, Verse Only, Color
ANIM_OPACITY_PREV_DEFAULT = 0.3  # Default "before" opacity
ANIM_OPACITY_AFTER_DEFAULT = 0.3  # Default "after" opacity
ANIM_OPACITY_STEP = 0.1  # Opacity slider step size

# Mega-card text styling sliders
MEGA_WORD_SPACING_MIN = 0.0
MEGA_WORD_SPACING_MAX = 1.0
MEGA_WORD_SPACING_STEP = 0.05
MEGA_WORD_SPACING_DEFAULT = 0.2  # matches ARABIC_WORD_SPACING

MEGA_TEXT_SIZE_MIN = 12
MEGA_TEXT_SIZE_MAX = 60
MEGA_TEXT_SIZE_STEP = 2
# NOTE(review): this was annotated "matches QURAN_TEXT_SIZE_PX", but that
# constant is 24 while this default is 30 — confirm which value is intended.
MEGA_TEXT_SIZE_DEFAULT = 30
MEGA_SURAH_LIGATURE_SIZE = 2  # em — surah name ligature font size in megacard

MEGA_LINE_SPACING_MIN = 1.5
MEGA_LINE_SPACING_MAX = 3.0
MEGA_LINE_SPACING_STEP = 0.1
MEGA_LINE_SPACING_DEFAULT = 2  # matches mega-card line-height
# Window engine settings (all modes use the window engine internally)
ANIM_WINDOW_PREV_DEFAULT = 4  # Default number of visible previous words/chars
ANIM_WINDOW_AFTER_DEFAULT = 4  # Default number of visible after words/chars
ANIM_WINDOW_PREV_MIN = 0
ANIM_WINDOW_AFTER_MIN = 0
ANIM_WINDOW_PREV_MAX = 15
ANIM_WINDOW_AFTER_MAX = 15

# Display modes offered to the user and the one selected by default.
ANIM_DISPLAY_MODE_DEFAULT = "Reveal"
ANIM_DISPLAY_MODES = ["Reveal", "Fade", "Spotlight", "Isolate", "Consume", "Custom"]

# Presets map mode names to window engine parameter values.
# Each preset sets the opacity and the number of visible words before/after
# the active word. "Custom" is listed in ANIM_DISPLAY_MODES but has no entry here.
ANIM_PRESETS = {
    "Reveal": {
        "prev_opacity": 1.0,
        "prev_words": ANIM_WINDOW_PREV_MAX,
        "after_opacity": 0.0,
        "after_words": 0,
    },
    "Fade": {
        "prev_opacity": 1.0,
        "prev_words": ANIM_WINDOW_PREV_MAX,
        "after_opacity": 0.3,
        "after_words": ANIM_WINDOW_AFTER_MAX,
    },
    "Spotlight": {
        "prev_opacity": 0.3,
        "prev_words": ANIM_WINDOW_PREV_MAX,
        "after_opacity": 0.3,
        "after_words": ANIM_WINDOW_AFTER_MAX,
    },
    "Isolate": {
        "prev_opacity": 0,
        "prev_words": 0,
        "after_opacity": 0,
        "after_words": 0,
    },
    "Consume": {
        "prev_opacity": 0,
        "prev_words": 0,
        "after_opacity": 0.3,
        "after_words": ANIM_WINDOW_AFTER_MAX,
    },
}