hetchyy committed on
Commit
058f17e
·
1 Parent(s): d5f2531

Add estimate duration API

Browse files
.gitignore CHANGED
@@ -52,4 +52,5 @@ captures/
52
  docs/api.md
53
  docs/lease_duration_history.md
54
  scripts/
55
- tests/
 
 
52
  docs/api.md
53
  docs/lease_duration_history.md
54
  scripts/
55
+ tests/
56
+ align_config.py
align_config.py CHANGED
@@ -4,13 +4,31 @@ Only params that differ from the quran_aligner defaults.
4
  """
5
 
6
  # Window sizes
7
- LOOKBACK_WORDS = 10
8
- LOOKAHEAD_WORDS = 5
9
 
10
  # Retry windows
11
  RETRY_LOOKBACK_WORDS = 80
12
  RETRY_LOOKAHEAD_WORDS = 60
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  # Debug/profiling -- off for batch CLI
15
  ANCHOR_DEBUG = False
16
  PHONEME_ALIGNMENT_DEBUG = False
 
4
  """
5
 
6
  # Window sizes
7
+ LOOKBACK_WORDS = 30
8
+ LOOKAHEAD_WORDS = 8
9
 
10
  # Retry windows
11
  RETRY_LOOKBACK_WORDS = 80
12
  RETRY_LOOKAHEAD_WORDS = 60
13
 
14
+ # Inference settings
15
+ DTYPE = "float16"
16
+ TORCH_COMPILE = False # Skip torch.compile() overhead for batch jobs
17
+
18
+ # Download parallelism
19
+ DOWNLOAD_WORKERS = 16 # Parallel download+decode threads (I/O-bound, safe to oversubscribe CPUs)
20
+
21
+ # VAD batching (number of audio files to VAD together)
22
+ VAD_BATCH_SIZE_AYAH = 256
23
+ VAD_BATCH_SIZE_SURA = 4
24
+
25
+ # ASR batching
26
+ BATCHING_STRATEGY = "dynamic"
27
+ MAX_BATCH_SECONDS = 800
28
+ MAX_PAD_WASTE = 0.25
29
+ MIN_BATCH_SIZE = 16
30
+ INFERENCE_BATCH_SIZE = 32 # Only used when BATCHING_STRATEGY="naive"
31
+
32
  # Debug/profiling -- off for batch CLI
33
  ANCHOR_DEBUG = False
34
  PHONEME_ALIGNMENT_DEBUG = False
app.py CHANGED
@@ -54,6 +54,7 @@ if __name__ == "__main__":
54
  parser = argparse.ArgumentParser()
55
  parser.add_argument("--share", action="store_true", help="Create public link")
56
  parser.add_argument("--port", type=int, default=PORT, help="Port to run on")
 
57
  args = parser.parse_args()
58
 
59
  port = 7860
@@ -61,22 +62,25 @@ if __name__ == "__main__":
61
  print(f"ZeroGPU available: {ZERO_GPU_AVAILABLE}")
62
  print(f"Launching Gradio on port {port}")
63
 
64
- # Preload models and caches at startup so first request is fast
65
- print("Preloading models...")
66
- load_segmenter()
67
- load_phoneme_asr("Base")
68
- load_phoneme_asr("Large")
69
- print("Models preloaded.")
70
- print("Preloading caches...")
71
- get_ngram_index()
72
- preload_all_chapters()
73
- print("Caches preloaded.")
 
 
 
74
 
75
- # Warm up soxr resampler so first request doesn't pay initialization cost
76
- _dummy = librosa.resample(np.zeros(1600, dtype=np.float32),
77
- orig_sr=44100, target_sr=16000, res_type=RESAMPLE_TYPE)
78
- del _dummy
79
- print("Resampler warmed up.")
80
 
81
  # AoT compilation for VAD model (requires GPU lease)
82
  if IS_HF_SPACE and ZERO_GPU_AVAILABLE:
 
54
  parser = argparse.ArgumentParser()
55
  parser.add_argument("--share", action="store_true", help="Create public link")
56
  parser.add_argument("--port", type=int, default=PORT, help="Port to run on")
57
+ parser.add_argument("--dev", action="store_true", help="Dev mode: skip model preloading for fast startup")
58
  args = parser.parse_args()
59
 
60
  port = 7860
 
62
  print(f"ZeroGPU available: {ZERO_GPU_AVAILABLE}")
63
  print(f"Launching Gradio on port {port}")
64
 
65
+ if args.dev:
66
+ print("Dev mode: skipping model preloading (models load on first request)")
67
+ else:
68
+ # Preload models and caches at startup so first request is fast
69
+ print("Preloading models...")
70
+ load_segmenter()
71
+ load_phoneme_asr("Base")
72
+ load_phoneme_asr("Large")
73
+ print("Models preloaded.")
74
+ print("Preloading caches...")
75
+ get_ngram_index()
76
+ preload_all_chapters()
77
+ print("Caches preloaded.")
78
 
79
+ # Warm up soxr resampler so first request doesn't pay initialization cost
80
+ _dummy = librosa.resample(np.zeros(1600, dtype=np.float32),
81
+ orig_sr=44100, target_sr=16000, res_type=RESAMPLE_TYPE)
82
+ del _dummy
83
+ print("Resampler warmed up.")
84
 
85
  # AoT compilation for VAD model (requires GPU lease)
86
  if IS_HF_SPACE and ZERO_GPU_AVAILABLE:
config.py CHANGED
@@ -64,36 +64,21 @@ NGRAM_INDEX_PATH = DATA_PATH / f"phoneme_ngram_index_{NGRAM_SIZE}.pkl"
64
  # Inference settings
65
  # =============================================================================
66
 
 
67
  def get_vad_duration(minutes):
68
- """GPU seconds needed for VAD based on audio minutes.
69
-
70
- VAD GPU time scales linearly at ~0.28s per audio minute.
71
- Tuned from 50-run log analysis (Feb 2026): previous leases were tight
72
- at 30-60 min (15s lease vs 17s actual) and 60-120 min (25s vs 26s).
73
- """
74
- if minutes > 180:
75
- return 60
76
- elif minutes > 120:
77
- return 45 # was 40 — 137 min audio hit 38.3s (95% of old lease)
78
- elif minutes > 60:
79
- return 30 # was 25 — 89 min audio hit 25.8s (exceeded old lease)
80
- elif minutes > 30:
81
- return 20 # was 15 — 58 min audio hit 17s (exceeded old lease)
82
- elif minutes > 15:
83
- return 10
84
- else:
85
- return 5
86
 
87
  def get_asr_duration(minutes, model_name="Base"):
88
  """GPU seconds needed for ASR.
89
-
90
- ASR GPU time is nearly constant regardless of audio length due to batch
91
- processing — no range tiers needed. Tuned from 50-run log analysis
92
- (Feb 2026): Base uses 0.2-2.5s (warm), Large uses 0.8-5.6s (warm).
93
  """
94
  if model_name == "Large":
95
- return 10 # max warm 5.6s, cold start 10.4s
96
- return 3 # max warm 2.5s, cold start 5.6s
 
 
 
97
 
98
  # Batching strategy
99
  BATCHING_STRATEGY = "dynamic" # "naive" (fixed count) or "dynamic" (seconds + pad waste)
@@ -195,7 +180,7 @@ MFA_TIMEOUT = 240
195
  MFA_METHOD = "kalpy" # "kalpy", "align_one", "python_api", "cli"
196
  MFA_BEAM = 10 # Viterbi beam width
197
  MFA_RETRY_BEAM = 40 # Retry beam width (used when initial alignment fails)
198
- MFA_SHARED_CMVN = True # Compute shared CMVN across batch (kalpy only)
199
 
200
  # =============================================================================
201
  # Usage logging (pushed to HF Hub via ParquetScheduler)
 
64
  # Inference settings
65
  # =============================================================================
66
 
67
def get_vad_duration(minutes):
    """Return the GPU seconds to lease for VAD, given audio length in minutes.

    Linear model fitted on 121 GPU runs (R²=0.992), plus a fixed safety
    buffer over the regression line, floored at 3 seconds.
    """
    safety_buffer_s = 3  # margin over the regression estimate (seconds)
    lease_s = 0.282 * minutes + safety_buffer_s
    return lease_s if lease_s > 3 else 3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
def get_asr_duration(minutes, model_name="Base"):
    """Return the GPU seconds to lease for ASR.

    Only the model size matters here; *minutes* is accepted for interface
    symmetry with get_vad_duration but does not affect the lease.
    """
    return 7 if model_name == "Large" else 3
79
+
80
+ ESTIMATE_ALIGNMENT_OVERHEAD_S = 3 # DP alignment + result building
81
+ ESTIMATE_CPU_MULTIPLIER = 50
82
 
83
  # Batching strategy
84
  BATCHING_STRATEGY = "dynamic" # "naive" (fixed count) or "dynamic" (seconds + pad waste)
 
180
  MFA_METHOD = "kalpy" # "kalpy", "align_one", "python_api", "cli"
181
  MFA_BEAM = 10 # Viterbi beam width
182
  MFA_RETRY_BEAM = 40 # Retry beam width (used when initial alignment fails)
183
+ MFA_SHARED_CMVN = True # Compute shared CMVN across batch (kalpy only)
184
 
185
  # =============================================================================
186
  # Usage logging (pushed to HF Hub via ParquetScheduler)
docs/client_api.md CHANGED
@@ -7,6 +7,10 @@ from gradio_client import Client
7
 
8
  client = Client("https://your-space.hf.space")
9
 
 
 
 
 
10
  # Full pipeline
11
  result = client.predict(
12
  "recitation.mp3", # audio file path
@@ -68,6 +72,54 @@ If `audio_id` is missing, expired, or invalid:
68
 
69
  ## Endpoints
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  ### `POST /process_audio_session`
72
 
73
  Full pipeline: preprocess → VAD → ASR → alignment. Creates a server-side session.
 
7
 
8
  client = Client("https://your-space.hf.space")
9
 
10
+ # Estimate processing time before starting
11
+ est = client.predict("process_audio_session", 60.0, None, "Base", "GPU", api_name="/estimate_duration")
12
+ print(f"Estimated time: {est['estimated_duration_s']}s")
13
+
14
  # Full pipeline
15
  result = client.predict(
16
  "recitation.mp3", # audio file path
 
72
 
73
  ## Endpoints
74
 
75
+ ### `POST /estimate_duration`
76
+
77
+ Estimate how long a processing endpoint will take before calling it.
78
+
79
+ | Parameter | Type | Default | Description |
80
+ |---|---|---|---|
81
+ | `endpoint` | str | required | Target endpoint name (e.g. `"process_audio_session"`) |
82
+ | `audio_duration_s` | float | `None` | Audio length in seconds. Required if no `audio_id` |
83
+ | `audio_id` | str | `None` | Session ID — infers audio duration from session metadata |
84
+ | `model_name` | str | `"Base"` | `"Base"` or `"Large"` |
85
+ | `device` | str | `"GPU"` | `"GPU"` or `"CPU"` |
86
+
87
+ **Example — before first processing call:**
88
+ ```python
89
+ est = client.predict(
90
+ "process_audio_session", # endpoint
91
+ 60.0, # audio_duration_s (seconds)
92
+ None, # audio_id (not yet available)
93
+ "Base", # model_name
94
+ "GPU", # device
95
+ api_name="/estimate_duration",
96
+ )
97
+ print(f"Estimated time: {est['estimated_duration_s']}s")
98
+ ```
99
+
100
+ **Example — with existing session (e.g. before MFA):**
101
+ ```python
102
+ est = client.predict(
103
+ "mfa_timestamps_session", # endpoint
104
+ None, # audio_duration_s (inferred from session)
105
+ audio_id, # audio_id
106
+ "Base", # model_name
107
+ "GPU", # device
108
+ api_name="/estimate_duration",
109
+ )
110
+ ```
111
+
112
+ **Response:**
113
+ ```json
114
+ {
115
+ "endpoint": "process_audio_session",
116
+ "estimated_duration_s": 28.0,
117
+ "device": "GPU",
118
+ "model_name": "Base"
119
+ }
120
+ ```
121
+ ---
122
+
123
  ### `POST /process_audio_session`
124
 
125
  Full pipeline: preprocess → VAD → ASR → alignment. Creates a server-side session.
src/api/session_api.py CHANGED
@@ -7,6 +7,7 @@ re-uploads and re-inference.
7
 
8
  import hashlib
9
  import json
 
10
  import os
11
  import pickle
12
  import re
@@ -88,6 +89,7 @@ def create_session(audio, speech_intervals, is_complete, intervals, model_name):
88
  "intervals": intervals,
89
  "model_name": model_name,
90
  "intervals_hash": _intervals_hash(intervals),
 
91
  }
92
  with open(path / "metadata.json", "w") as f:
93
  json.dump(meta, f)
@@ -180,6 +182,106 @@ def _load_segments(audio_id):
180
  _SESSION_ERROR = {"error": "Session not found or expired", "segments": []}
181
 
182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  def _format_response(audio_id, json_output, warning=None):
184
  """Convert pipeline json_output to the documented API response schema."""
185
  segments = []
 
7
 
8
  import hashlib
9
  import json
10
+ import math
11
  import os
12
  import pickle
13
  import re
 
89
  "intervals": intervals,
90
  "model_name": model_name,
91
  "intervals_hash": _intervals_hash(intervals),
92
+ "audio_duration_s": round(len(audio) / 16000, 2),
93
  }
94
  with open(path / "metadata.json", "w") as f:
95
  json.dump(meta, f)
 
182
  _SESSION_ERROR = {"error": "Session not found or expired", "segments": []}
183
 
184
 
185
+ # ---------------------------------------------------------------------------
186
+ # Duration estimation
187
+ # ---------------------------------------------------------------------------
188
+
189
+ _ESTIMABLE_ENDPOINTS = {
190
+ "process_audio_session",
191
+ "resegment_session",
192
+ "retranscribe_session",
193
+ "realign_from_timestamps",
194
+ "mfa_timestamps_session",
195
+ "mfa_timestamps_direct",
196
+ }
197
+
198
+ _MFA_ENDPOINTS = {"mfa_timestamps_session", "mfa_timestamps_direct"}
199
+ _VAD_ENDPOINTS = {"process_audio_session"}
200
+
201
+
202
def _load_session_metadata(audio_id):
    """Load only metadata.json (no audio/VAD). Returns dict or None."""
    if not _validate_id(audio_id):
        return None
    session_path = _session_dir(audio_id)
    meta_file = session_path / "metadata.json"
    ts_file = session_path / "created_at"
    # Missing metadata or a missing/expired creation timestamp both mean
    # the session is unusable.
    if not meta_file.exists():
        return None
    if not ts_file.exists():
        return None
    if _is_expired(float(ts_file.read_text())):
        return None
    with open(meta_file) as fh:
        return json.load(fh)
215
+
216
+
217
def estimate_duration(endpoint, audio_duration_s=None, audio_id=None,
                      model_name="Base", device="GPU"):
    """Estimate processing duration for a given endpoint.

    Parameters:
        endpoint: Target endpoint name; must be in _ESTIMABLE_ENDPOINTS.
        audio_duration_s: Audio length in seconds. Required for pipeline
            endpoints when no session metadata provides the duration.
        audio_id: Existing session id. Used to infer the audio duration
            and, for MFA endpoints, the stored segment count.
        model_name: "Base" or "Large".
        device: "GPU" or "CPU"; CPU estimates are scaled by
            ESTIMATE_CPU_MULTIPLIER.

    Returns a dict with "estimated_duration_s" rounded up to a multiple of
    5 seconds, or a dict with an "error" key on invalid input.
    """
    from config import (
        get_vad_duration, get_asr_duration,
        ESTIMATE_ALIGNMENT_OVERHEAD_S, ESTIMATE_CPU_MULTIPLIER,
        MFA_PROGRESS_SEGMENT_RATE,
    )

    _error = {"estimated_duration_s": None}

    if endpoint not in _ESTIMABLE_ENDPOINTS:
        _error["error"] = (
            f"Unknown endpoint '{endpoint}'. "
            f"Valid: {', '.join(sorted(_ESTIMABLE_ENDPOINTS))}"
        )
        return _error

    if endpoint in _MFA_ENDPOINTS:
        # --- MFA endpoints: estimate depends only on stored segment count.
        # Fix: audio duration is NOT required here (it was previously
        # resolved — and could error — before this branch even though it
        # was never used by it).
        if not audio_id:
            _error["error"] = "MFA estimation requires audio_id with existing segments"
            return _error
        segments = _load_segments(audio_id)
        if not segments:
            _error["error"] = "No segments found in session — run an alignment endpoint first"
            return _error
        estimate = MFA_PROGRESS_SEGMENT_RATE * len(segments)
    else:
        # --- Pipeline endpoints: VAD + ASR + alignment overhead ---
        # Resolve audio duration: explicit argument wins, otherwise fall
        # back to the session metadata.
        meta = _load_session_metadata(audio_id) if audio_id else None
        if audio_duration_s is not None and audio_duration_s > 0:
            duration_s = float(audio_duration_s)
        elif meta and meta.get("audio_duration_s"):
            duration_s = meta["audio_duration_s"]
        else:
            _error["error"] = (
                "audio_duration_s is required (or provide audio_id with an existing session)"
            )
            return _error

        minutes = duration_s / 60.0
        estimate = 0.0
        if endpoint in _VAD_ENDPOINTS:
            estimate += get_vad_duration(minutes)
        estimate += get_asr_duration(minutes, model_name)
        estimate += ESTIMATE_ALIGNMENT_OVERHEAD_S

    # CPU runs are dramatically slower than GPU leases.
    if device == "CPU":
        estimate *= ESTIMATE_CPU_MULTIPLIER

    # Round up to the nearest 5 seconds so clients get a stable, coarse figure.
    rounded = math.ceil(estimate / 5) * 5

    return {
        "endpoint": endpoint,
        "estimated_duration_s": rounded,
        "device": device,
        "model_name": model_name,
    }
283
+
284
+
285
  def _format_response(audio_id, json_output, warning=None):
286
  """Convert pipeline json_output to the documented API response schema."""
287
  segments = []
src/ui/event_wiring.py CHANGED
@@ -8,6 +8,7 @@ from src.pipeline import (
8
  _retranscribe_wrapper, save_json_export,
9
  )
10
  from src.api.session_api import (
 
11
  process_audio_session, resegment_session,
12
  retranscribe_session, realign_from_timestamps,
13
  mfa_timestamps_session, mfa_timestamps_direct,
@@ -461,6 +462,13 @@ def _wire_settings_restoration(app, c):
461
 
462
  def _wire_api_endpoint(c):
463
  """Hidden API-only endpoints for session-based programmatic access."""
 
 
 
 
 
 
 
464
  gr.Button(visible=False).click(
465
  fn=process_audio_session,
466
  inputs=[c.api_audio, c.api_silence, c.api_speech, c.api_pad,
 
8
  _retranscribe_wrapper, save_json_export,
9
  )
10
  from src.api.session_api import (
11
+ estimate_duration,
12
  process_audio_session, resegment_session,
13
  retranscribe_session, realign_from_timestamps,
14
  mfa_timestamps_session, mfa_timestamps_direct,
 
462
 
463
  def _wire_api_endpoint(c):
464
  """Hidden API-only endpoints for session-based programmatic access."""
465
+ gr.Button(visible=False).click(
466
+ fn=estimate_duration,
467
+ inputs=[c.api_estimate_endpoint, c.api_estimate_audio_duration,
468
+ c.api_audio_id, c.api_model, c.api_device],
469
+ outputs=[c.api_result],
470
+ api_name="estimate_duration",
471
+ )
472
  gr.Button(visible=False).click(
473
  fn=process_audio_session,
474
  inputs=[c.api_audio, c.api_silence, c.api_speech, c.api_pad,
src/ui/interface.py CHANGED
@@ -89,6 +89,8 @@ def build_interface():
89
  c.api_timestamps = gr.JSON(visible=False)
90
  c.api_mfa_segments = gr.JSON(visible=False)
91
  c.api_mfa_granularity = gr.Textbox(visible=False)
 
 
92
  c.api_result = gr.JSON(visible=False)
93
 
94
  wire_events(app, c)
 
89
  c.api_timestamps = gr.JSON(visible=False)
90
  c.api_mfa_segments = gr.JSON(visible=False)
91
  c.api_mfa_granularity = gr.Textbox(visible=False)
92
+ c.api_estimate_endpoint = gr.Textbox(visible=False)
93
+ c.api_estimate_audio_duration = gr.Number(visible=False)
94
  c.api_result = gr.JSON(visible=False)
95
 
96
  wire_events(app, c)