Spaces:
Running on Zero
Running on Zero
Update walltime estimation
Browse files
- config.py +22 -10
- src/api/session_api.py +30 -13
config.py
CHANGED
|
@@ -63,21 +63,33 @@ NGRAM_INDEX_PATH = DATA_PATH / f"phoneme_ngram_index_{NGRAM_SIZE}.pkl"
|
|
| 63 |
# Inference settings
|
| 64 |
# =============================================================================
|
| 65 |
|
| 66 |
-
# VAD lease: linear regression from
|
|
|
|
| 67 |
def get_vad_duration(minutes):
|
| 68 |
"""GPU seconds needed for VAD based on audio minutes."""
|
| 69 |
-
VAD_LEASE_BUFFER =
|
| 70 |
-
return max(3, 0.
|
| 71 |
|
| 72 |
def get_asr_duration(minutes, model_name="Base"):
|
| 73 |
-
"""GPU seconds needed for ASR.
|
| 74 |
-
"""
|
| 75 |
if model_name == "Large":
|
| 76 |
-
return
|
| 77 |
-
return
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
# Batching strategy
|
| 83 |
BATCHING_STRATEGY = "dynamic" # "naive" (fixed count) or "dynamic" (seconds + pad waste)
|
|
|
|
| 63 |
# Inference settings
|
| 64 |
# =============================================================================
|
| 65 |
|
| 66 |
+
# VAD lease: linear regression from 203 GPU runs (R²=0.996)
|
| 67 |
+
# vad_gpu = 0.284 * minutes + 1.80; max residual +2.17s
|
| 68 |
def get_vad_duration(minutes):
|
| 69 |
"""GPU seconds needed for VAD based on audio minutes."""
|
| 70 |
+
VAD_LEASE_BUFFER = 3 # covers max residual (2.17s) with margin
|
| 71 |
+
return max(3, 0.284 * minutes + 1.80 + VAD_LEASE_BUFFER)
|
| 72 |
|
| 73 |
def get_asr_duration(minutes, model_name="Base"):
|
| 74 |
+
"""GPU seconds needed for ASR (constant, independent of audio duration)."""
|
|
|
|
| 75 |
if model_name == "Large":
|
| 76 |
+
return 10 # max observed 8.43s (n=32)
|
| 77 |
+
return 5 # max observed 4.63s (n=177)
|
| 78 |
+
|
| 79 |
+
# Wall-time estimation: direct regression on total time (not sum of leases)
|
| 80 |
+
# GPU Base: total = 0.43 * minutes + 5.5 (R²=0.89, n=177)
|
| 81 |
+
# GPU Large: total = 0.50 * minutes + 11.2 (R²=0.20, n=32)
|
| 82 |
+
# CPU Base: total = 11.2 * minutes + 20.9 (R²=0.46, n=37)
|
| 83 |
+
# CPU Large: total = 25.2 * minutes + 24.4 (R²=0.67, n=11)
|
| 84 |
+
ESTIMATE_GPU_BASE_SLOPE = 0.43
|
| 85 |
+
ESTIMATE_GPU_BASE_INTERCEPT = 5.5
|
| 86 |
+
ESTIMATE_GPU_LARGE_SLOPE = 0.50
|
| 87 |
+
ESTIMATE_GPU_LARGE_INTERCEPT = 11.2
|
| 88 |
+
ESTIMATE_CPU_BASE_SLOPE = 11.2
|
| 89 |
+
ESTIMATE_CPU_BASE_INTERCEPT = 20.9
|
| 90 |
+
ESTIMATE_CPU_LARGE_SLOPE = 25.2
|
| 91 |
+
ESTIMATE_CPU_LARGE_INTERCEPT = 24.4
|
| 92 |
+
ESTIMATE_WALL_BUFFER = 1.5 # multiplier on regression to cover variance
|
| 93 |
|
| 94 |
# Batching strategy
|
| 95 |
BATCHING_STRATEGY = "dynamic" # "naive" (fixed count) or "dynamic" (seconds + pad waste)
|
src/api/session_api.py
CHANGED
|
@@ -216,10 +216,17 @@ def _load_session_metadata(audio_id):
|
|
| 216 |
|
| 217 |
def estimate_duration(endpoint, audio_duration_s=None, audio_id=None,
|
| 218 |
model_name="Base", device="GPU"):
|
| 219 |
-
"""Estimate processing duration for a given endpoint.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
from config import (
|
| 221 |
-
|
| 222 |
-
|
|
|
|
|
|
|
|
|
|
| 223 |
MFA_PROGRESS_SEGMENT_RATE,
|
| 224 |
)
|
| 225 |
|
|
@@ -261,16 +268,26 @@ def estimate_duration(endpoint, audio_duration_s=None, audio_id=None,
|
|
| 261 |
num_segments = len(segments)
|
| 262 |
estimate = MFA_PROGRESS_SEGMENT_RATE * num_segments
|
| 263 |
else:
|
| 264 |
-
# --- Pipeline endpoints:
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
|
| 275 |
rounded = max(5, math.ceil(estimate / 5) * 5)
|
| 276 |
|
|
|
|
| 216 |
|
| 217 |
def estimate_duration(endpoint, audio_duration_s=None, audio_id=None,
|
| 218 |
model_name="Base", device="GPU"):
|
| 219 |
+
"""Estimate processing duration for a given endpoint.
|
| 220 |
+
|
| 221 |
+
Uses direct wall-time regression (not sum of lease components) fitted on
|
| 222 |
+
257 runs from the hetchyy/quran-aligner-logs v1 dataset.
|
| 223 |
+
"""
|
| 224 |
from config import (
|
| 225 |
+
ESTIMATE_GPU_BASE_SLOPE, ESTIMATE_GPU_BASE_INTERCEPT,
|
| 226 |
+
ESTIMATE_GPU_LARGE_SLOPE, ESTIMATE_GPU_LARGE_INTERCEPT,
|
| 227 |
+
ESTIMATE_CPU_BASE_SLOPE, ESTIMATE_CPU_BASE_INTERCEPT,
|
| 228 |
+
ESTIMATE_CPU_LARGE_SLOPE, ESTIMATE_CPU_LARGE_INTERCEPT,
|
| 229 |
+
ESTIMATE_WALL_BUFFER,
|
| 230 |
MFA_PROGRESS_SEGMENT_RATE,
|
| 231 |
)
|
| 232 |
|
|
|
|
| 268 |
num_segments = len(segments)
|
| 269 |
estimate = MFA_PROGRESS_SEGMENT_RATE * num_segments
|
| 270 |
else:
|
| 271 |
+
# --- Pipeline endpoints: direct wall-time regression ---
|
| 272 |
+
device_upper = (device or "GPU").upper()
|
| 273 |
+
is_large = model_name == "Large"
|
| 274 |
+
|
| 275 |
+
if device_upper == "CPU":
|
| 276 |
+
if is_large:
|
| 277 |
+
estimate = ESTIMATE_CPU_LARGE_SLOPE * minutes + ESTIMATE_CPU_LARGE_INTERCEPT
|
| 278 |
+
else:
|
| 279 |
+
estimate = ESTIMATE_CPU_BASE_SLOPE * minutes + ESTIMATE_CPU_BASE_INTERCEPT
|
| 280 |
+
else:
|
| 281 |
+
if is_large:
|
| 282 |
+
estimate = ESTIMATE_GPU_LARGE_SLOPE * minutes + ESTIMATE_GPU_LARGE_INTERCEPT
|
| 283 |
+
else:
|
| 284 |
+
estimate = ESTIMATE_GPU_BASE_SLOPE * minutes + ESTIMATE_GPU_BASE_INTERCEPT
|
| 285 |
+
|
| 286 |
+
# Retranscribe/realign skip VAD — scale down by ~50% (ASR+DP only)
|
| 287 |
+
if endpoint not in _VAD_ENDPOINTS:
|
| 288 |
+
estimate *= 0.5
|
| 289 |
+
|
| 290 |
+
estimate *= ESTIMATE_WALL_BUFFER
|
| 291 |
|
| 292 |
rounded = max(5, math.ceil(estimate / 5) * 5)
|
| 293 |
|