Spaces:
Running on Zero
Running on Zero
Simplify client API docs and rename endpoints
Browse files
Rewrite client_api.md descriptions to be less technical (remove internal
details like storage formats, VAD/ASR jargon). Rename API endpoints:
resegment_session → resegment, retranscribe_session → retranscribe,
mfa_timestamps_session → timestamps, mfa_timestamps_direct →
timestamps_direct. Update function names, event wiring, and api.md to
match.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- docs/client_api.md +50 -52
- src/api/session_api.py +10 -10
- src/ui/event_wiring.py +13 -13
docs/client_api.md
CHANGED
|
@@ -24,10 +24,10 @@ result = client.predict(
|
|
| 24 |
audio_id = result["audio_id"]
|
| 25 |
|
| 26 |
# Re-segment with different params (reuses cached audio + VAD)
|
| 27 |
-
result = client.predict(audio_id, 600, 1500, 300, "Base", "GPU", api_name="/resegment_session")
|
| 28 |
|
| 29 |
# Re-transcribe with a different model (reuses cached segments)
|
| 30 |
-
result = client.predict(audio_id, "Large", "GPU", api_name="/retranscribe_session")
|
| 31 |
|
| 32 |
# Realign with custom timestamps
|
| 33 |
result = client.predict(
|
|
@@ -37,14 +37,14 @@ result = client.predict(
|
|
| 37 |
api_name="/realign_from_timestamps"
|
| 38 |
)
|
| 39 |
|
| 40 |
-
#
|
| 41 |
-
mfa = client.predict(audio_id, None, "words", api_name="/mfa_timestamps_session")
|
| 42 |
|
| 43 |
-
#
|
| 44 |
-
mfa = client.predict(audio_id, None, "words+chars", api_name="/
|
| 45 |
|
| 46 |
-
#
|
| 47 |
-
mfa = client.predict("recitation.mp3", result["segments"], "words", api_name="/mfa_timestamps_direct")
|
| 48 |
```
|
| 49 |
|
| 50 |
---
|
|
@@ -55,13 +55,13 @@ The first call returns an `audio_id` (32-character hex string). Pass it to subse
|
|
| 55 |
|
| 56 |
**What the server caches per session:**
|
| 57 |
|
| 58 |
-
| Data |
|
| 59 |
-
|---|---|
|
| 60 |
-
| Preprocessed audio
|
| 61 |
-
| Raw VAD speech intervals |
|
| 62 |
-
| Cleaned segment boundaries |
|
| 63 |
-
| Model name |
|
| 64 |
-
| Alignment segments |
|
| 65 |
|
| 66 |
If `audio_id` is missing, expired, or invalid:
|
| 67 |
```json
|
|
@@ -74,13 +74,13 @@ If `audio_id` is missing, expired, or invalid:
|
|
| 74 |
|
| 75 |
### `POST /estimate_duration`
|
| 76 |
|
| 77 |
-
Estimate
|
| 78 |
|
| 79 |
| Parameter | Type | Default | Description |
|
| 80 |
|---|---|---|---|
|
| 81 |
| `endpoint` | str | required | Target endpoint name (e.g. `"process_audio_session"`) |
|
| 82 |
| `audio_duration_s` | float | `None` | Audio length in seconds. Required if no `audio_id` |
|
| 83 |
-
| `audio_id` | str | `None` | Session ID —
|
| 84 |
| `model_name` | str | `"Base"` | `"Base"` or `"Large"` |
|
| 85 |
| `device` | str | `"GPU"` | `"GPU"` or `"CPU"` |
|
| 86 |
|
|
@@ -97,11 +97,11 @@ est = client.predict(
|
|
| 97 |
print(f"Estimated time: {est['estimated_duration_s']}s")
|
| 98 |
```
|
| 99 |
|
| 100 |
-
**Example — with existing session (e.g. before
|
| 101 |
```python
|
| 102 |
est = client.predict(
|
| 103 |
-
"
|
| 104 |
-
None, # audio_duration_s (
|
| 105 |
audio_id, # audio_id
|
| 106 |
"Base", # model_name
|
| 107 |
"GPU", # device
|
|
@@ -122,18 +122,18 @@ est = client.predict(
|
|
| 122 |
|
| 123 |
### `POST /process_audio_session`
|
| 124 |
|
| 125 |
-
|
| 126 |
|
| 127 |
| Parameter | Type | Default | Description |
|
| 128 |
|---|---|---|---|
|
| 129 |
-
| `audio` | file | required | Audio file (any
|
| 130 |
| `min_silence_ms` | int | 200 | Minimum silence gap to split segments |
|
| 131 |
| `min_speech_ms` | int | 1000 | Minimum speech duration to keep a segment |
|
| 132 |
| `pad_ms` | int | 100 | Padding added to each side of a segment |
|
| 133 |
-
| `model_name` | str | `"Base"` | `"Base"` (
|
| 134 |
| `device` | str | `"GPU"` | `"GPU"` or `"CPU"` |
|
| 135 |
|
| 136 |
-
If GPU
|
| 137 |
|
| 138 |
**Segmentation presets:**
|
| 139 |
|
|
@@ -185,8 +185,8 @@ If GPU quota is exhausted, automatically falls back to CPU processing rather tha
|
|
| 185 |
| `ref_from` | str | First matched word as `"surah:ayah:word"`. Empty string for special segments |
|
| 186 |
| `ref_to` | str | Last matched word as `"surah:ayah:word"`. Empty string for special segments |
|
| 187 |
| `matched_text` | str | Quran text for the matched range (or special segment text) |
|
| 188 |
-
| `confidence` | float | 0.0–1.0
|
| 189 |
-
| `has_missing_words` | bool |
|
| 190 |
| `special_type` | str | Only present for special (non-Quranic) segments — see below. Absent for normal segments |
|
| 191 |
| `error` | str? | Error message if alignment failed, else `null` |
|
| 192 |
|
|
@@ -208,7 +208,7 @@ Non-Quranic segments detected within recitations. When `special_type` is present
|
|
| 208 |
|
| 209 |
## GPU Fallback Warning
|
| 210 |
|
| 211 |
-
When
|
| 212 |
|
| 213 |
```json
|
| 214 |
{
|
|
@@ -231,18 +231,18 @@ All errors follow the same shape: `{"error": "...", "segments": []}`. Endpoints
|
|
| 231 |
| Session not found or expired | `"Session not found or expired"` | No |
|
| 232 |
| No speech detected (process) | `"No speech detected in audio"` | No (no session created) |
|
| 233 |
| No segments after resegment | `"No segments with these settings"` | Yes |
|
| 234 |
-
| Retranscribe with same model | `"Model and boundaries unchanged. Change model_name or call /
|
| 235 |
| Retranscription failed | `"Retranscription failed"` | Yes |
|
| 236 |
| Realignment failed | `"Alignment failed"` | Yes |
|
| 237 |
-
| No segments in session (
|
| 238 |
-
|
|
| 239 |
-
| No segments provided (
|
| 240 |
|
| 241 |
---
|
| 242 |
|
| 243 |
-
### `POST /
|
| 244 |
|
| 245 |
-
Re-
|
| 246 |
|
| 247 |
| Parameter | Type | Default | Description |
|
| 248 |
|---|---|---|---|
|
|
@@ -257,9 +257,9 @@ Re-cleans VAD boundaries with new segmentation parameters and re-runs ASR. Skips
|
|
| 257 |
|
| 258 |
---
|
| 259 |
|
| 260 |
-
### `POST /
|
| 261 |
|
| 262 |
-
Re-
|
| 263 |
|
| 264 |
| Parameter | Type | Default | Description |
|
| 265 |
|---|---|---|---|
|
|
@@ -269,13 +269,13 @@ Re-runs ASR with a different model on the current segment boundaries. Skips audi
|
|
| 269 |
|
| 270 |
**Response:** Same shape as `/process_audio_session`. Session model and results are updated.
|
| 271 |
|
| 272 |
-
> **Note:** Returns an error if `model_name` is the same as the current session's model. To re-run with the same model on different boundaries, use `/
|
| 273 |
|
| 274 |
---
|
| 275 |
|
| 276 |
### `POST /realign_from_timestamps`
|
| 277 |
|
| 278 |
-
|
| 279 |
|
| 280 |
| Parameter | Type | Default | Description |
|
| 281 |
|---|---|---|---|
|
|
@@ -300,13 +300,11 @@ Accepts arbitrary `(start, end)` timestamp pairs and runs ASR + alignment on eac
|
|
| 300 |
|
| 301 |
**Response:** Same shape as `/process_audio_session`. Session boundaries are replaced with the provided timestamps.
|
| 302 |
|
| 303 |
-
This endpoint subsumes split, merge, and boundary adjustment — the client computes the desired timestamps locally and sends them in one call.
|
| 304 |
-
|
| 305 |
---
|
| 306 |
|
| 307 |
-
### `POST /
|
| 308 |
|
| 309 |
-
|
| 310 |
|
| 311 |
| Parameter | Type | Default | Description |
|
| 312 |
|---|---|---|---|
|
|
@@ -320,7 +318,7 @@ result = client.predict(
|
|
| 320 |
"a1b2c3d4e5f67890a1b2c3d4e5f67890", # audio_id
|
| 321 |
None, # segments (null = use stored)
|
| 322 |
"words", # granularity
|
| 323 |
-
api_name="/
|
| 324 |
)
|
| 325 |
```
|
| 326 |
|
|
@@ -332,8 +330,8 @@ result = client.predict(
|
|
| 332 |
{"time_from": 0.48, "time_to": 2.88, "ref_from": "112:1:1", "ref_to": "112:1:4"},
|
| 333 |
{"time_from": 3.12, "time_to": 5.44, "ref_from": "112:2:1", "ref_to": "112:2:3"},
|
| 334 |
],
|
| 335 |
-
"words+chars",
|
| 336 |
-
api_name="/
|
| 337 |
)
|
| 338 |
```
|
| 339 |
|
|
@@ -347,8 +345,8 @@ result = client.predict(
|
|
| 347 |
|
| 348 |
| Field | Type | Required | Description |
|
| 349 |
|---|---|---|---|
|
| 350 |
-
| `time_from` | float | yes | Start time in seconds
|
| 351 |
-
| `time_to` | float | yes | End time in seconds
|
| 352 |
| `ref_from` | str | yes | First word as `"surah:ayah:word"`. Empty for special segments |
|
| 353 |
| `ref_to` | str | yes | Last word as `"surah:ayah:word"`. Empty for special segments |
|
| 354 |
| `segment` | int | no | 1-indexed segment number. Auto-assigned from position if omitted |
|
|
@@ -389,17 +387,17 @@ With `granularity="words+chars"`, each word includes a 4th element — letter ti
|
|
| 389 |
|
| 390 |
---
|
| 391 |
|
| 392 |
-
### `POST /
|
| 393 |
|
| 394 |
-
|
| 395 |
|
| 396 |
| Parameter | Type | Default | Description |
|
| 397 |
|---|---|---|---|
|
| 398 |
-
| `audio` | file | required | Audio file (any format) |
|
| 399 |
| `segments` | list | required | Segment list with `time_from`/`time_to` boundaries |
|
| 400 |
| `granularity` | str | `"words"` | `"words"` or `"words+chars"` |
|
| 401 |
|
| 402 |
-
**Response:** Same shape as `/
|
| 403 |
|
| 404 |
**Example (minimal):**
|
| 405 |
```python
|
|
@@ -410,8 +408,8 @@ result = client.predict(
|
|
| 410 |
{"time_from": 3.12, "time_to": 5.44, "ref_from": "112:2:1", "ref_to": "112:2:3"},
|
| 411 |
],
|
| 412 |
"words+chars",
|
| 413 |
-
api_name="/
|
| 414 |
)
|
| 415 |
```
|
| 416 |
|
| 417 |
-
Segment input format is the same as for `/
|
|
|
|
| 24 |
audio_id = result["audio_id"]
|
| 25 |
|
| 26 |
# Re-segment with different params (reuses cached audio + VAD)
|
| 27 |
+
result = client.predict(audio_id, 600, 1500, 300, "Base", "GPU", api_name="/resegment")
|
| 28 |
|
| 29 |
# Re-transcribe with a different model (reuses cached segments)
|
| 30 |
+
result = client.predict(audio_id, "Large", "GPU", api_name="/retranscribe")
|
| 31 |
|
| 32 |
# Realign with custom timestamps
|
| 33 |
result = client.predict(
|
|
|
|
| 37 |
api_name="/realign_from_timestamps"
|
| 38 |
)
|
| 39 |
|
| 40 |
+
# Get word-level timestamps (uses stored session segments)
|
| 41 |
+
mfa = client.predict(audio_id, None, "words", api_name="/timestamps")
|
| 42 |
|
| 43 |
+
# Get word + letter timestamps
|
| 44 |
+
mfa = client.predict(audio_id, None, "words+chars", api_name="/timestamps")
|
| 45 |
|
| 46 |
+
# Get timestamps without a session (standalone)
|
| 47 |
+
mfa = client.predict("recitation.mp3", result["segments"], "words", api_name="/timestamps_direct")
|
| 48 |
```
|
| 49 |
|
| 50 |
---
|
|
|
|
| 55 |
|
| 56 |
**What the server caches per session:**
|
| 57 |
|
| 58 |
+
| Data | Updated by |
|
| 59 |
+
|---|---|
|
| 60 |
+
| Preprocessed audio | — |
|
| 61 |
+
| Raw VAD speech intervals | — |
|
| 62 |
+
| Cleaned segment boundaries | `/resegment`, `/realign_from_timestamps` |
|
| 63 |
+
| Model name | `/retranscribe` |
|
| 64 |
+
| Alignment segments | Any alignment call |
|
| 65 |
|
| 66 |
If `audio_id` is missing, expired, or invalid:
|
| 67 |
```json
|
|
|
|
| 74 |
|
| 75 |
### `POST /estimate_duration`
|
| 76 |
|
| 77 |
+
Estimate processing time before starting a request.
|
| 78 |
|
| 79 |
| Parameter | Type | Default | Description |
|
| 80 |
|---|---|---|---|
|
| 81 |
| `endpoint` | str | required | Target endpoint name (e.g. `"process_audio_session"`) |
|
| 82 |
| `audio_duration_s` | float | `None` | Audio length in seconds. Required if no `audio_id` |
|
| 83 |
+
| `audio_id` | str | `None` | Session ID — looks up audio duration from the session |
|
| 84 |
| `model_name` | str | `"Base"` | `"Base"` or `"Large"` |
|
| 85 |
| `device` | str | `"GPU"` | `"GPU"` or `"CPU"` |
|
| 86 |
|
|
|
|
| 97 |
print(f"Estimated time: {est['estimated_duration_s']}s")
|
| 98 |
```
|
| 99 |
|
| 100 |
+
**Example — with existing session (e.g. before getting timestamps):**
|
| 101 |
```python
|
| 102 |
est = client.predict(
|
| 103 |
+
"timestamps", # endpoint
|
| 104 |
+
None, # audio_duration_s (looked up from session)
|
| 105 |
audio_id, # audio_id
|
| 106 |
"Base", # model_name
|
| 107 |
"GPU", # device
|
|
|
|
| 122 |
|
| 123 |
### `POST /process_audio_session`
|
| 124 |
|
| 125 |
+
Processes a recitation audio file: detects speech segments, recognizes text, and aligns with the Quran. Creates a session for follow-up calls.
|
| 126 |
|
| 127 |
| Parameter | Type | Default | Description |
|
| 128 |
|---|---|---|---|
|
| 129 |
+
| `audio` | file | required | Audio file (any common format) |
|
| 130 |
| `min_silence_ms` | int | 200 | Minimum silence gap to split segments |
|
| 131 |
| `min_speech_ms` | int | 1000 | Minimum speech duration to keep a segment |
|
| 132 |
| `pad_ms` | int | 100 | Padding added to each side of a segment |
|
| 133 |
+
| `model_name` | str | `"Base"` | `"Base"` (faster) or `"Large"` (more accurate) |
|
| 134 |
| `device` | str | `"GPU"` | `"GPU"` or `"CPU"` |
|
| 135 |
|
| 136 |
+
If the GPU is temporarily unavailable, processing continues on CPU (slower). When this happens, a `"warning"` field is included in the response (see [GPU Fallback Warning](#gpu-fallback-warning) below).
|
| 137 |
|
| 138 |
**Segmentation presets:**
|
| 139 |
|
|
|
|
| 185 |
| `ref_from` | str | First matched word as `"surah:ayah:word"`. Empty string for special segments |
|
| 186 |
| `ref_to` | str | Last matched word as `"surah:ayah:word"`. Empty string for special segments |
|
| 187 |
| `matched_text` | str | Quran text for the matched range (or special segment text) |
|
| 188 |
+
| `confidence` | float | 0.0–1.0 — how well the segment matched the Quran text |
|
| 189 |
+
| `has_missing_words` | bool | Whether some expected words were not found in the audio |
|
| 190 |
| `special_type` | str | Only present for special (non-Quranic) segments — see below. Absent for normal segments |
|
| 191 |
| `error` | str? | Error message if alignment failed, else `null` |
|
| 192 |
|
|
|
|
| 208 |
|
| 209 |
## GPU Fallback Warning
|
| 210 |
|
| 211 |
+
When the server's GPU is temporarily unavailable, processing continues on CPU (slower). In that case, the response from any endpoint includes a `"warning"` field:
|
| 212 |
|
| 213 |
```json
|
| 214 |
{
|
|
|
|
| 231 |
| Session not found or expired | `"Session not found or expired"` | No |
|
| 232 |
| No speech detected (process) | `"No speech detected in audio"` | No (no session created) |
|
| 233 |
| No segments after resegment | `"No segments with these settings"` | Yes |
|
| 234 |
+
| Retranscribe with same model | `"Model and boundaries unchanged. Change model_name or call /resegment first."` | Yes |
|
| 235 |
| Retranscription failed | `"Retranscription failed"` | Yes |
|
| 236 |
| Realignment failed | `"Alignment failed"` | Yes |
|
| 237 |
+
| No segments in session (timestamps) | `"No segments found in session"` | Yes |
|
| 238 |
+
| Timestamp alignment failed | `"MFA alignment failed: ..."` | Yes (session) / No (direct) |
|
| 239 |
+
| No segments provided (timestamps direct) | `"No segments provided"` | No |
|
| 240 |
|
| 241 |
---
|
| 242 |
|
| 243 |
+
### `POST /resegment`
|
| 244 |
|
| 245 |
+
Re-splits the audio into segments using different silence/speech settings, then re-aligns. Reuses the uploaded audio.
|
| 246 |
|
| 247 |
| Parameter | Type | Default | Description |
|
| 248 |
|---|---|---|---|
|
|
|
|
| 257 |
|
| 258 |
---
|
| 259 |
|
| 260 |
+
### `POST /retranscribe`
|
| 261 |
|
| 262 |
+
Re-recognizes text using a different model on the same segments, then re-aligns.
|
| 263 |
|
| 264 |
| Parameter | Type | Default | Description |
|
| 265 |
|---|---|---|---|
|
|
|
|
| 269 |
|
| 270 |
**Response:** Same shape as `/process_audio_session`. Session model and results are updated.
|
| 271 |
|
| 272 |
+
> **Note:** Returns an error if `model_name` is the same as the current session's model and the segment boundaries are unchanged. To re-run with the same model on different boundaries, use `/resegment` or `/realign_from_timestamps` instead (they already include recognition + alignment).
|
| 273 |
|
| 274 |
---
|
| 275 |
|
| 276 |
### `POST /realign_from_timestamps`
|
| 277 |
|
| 278 |
+
Aligns audio using custom time boundaries you provide. Useful for manually adjusting where segments start and end.
|
| 279 |
|
| 280 |
| Parameter | Type | Default | Description |
|
| 281 |
|---|---|---|---|
|
|
|
|
| 300 |
|
| 301 |
**Response:** Same shape as `/process_audio_session`. Session boundaries are replaced with the provided timestamps.
|
| 302 |
|
|
|
|
|
|
|
| 303 |
---
|
| 304 |
|
| 305 |
+
### `POST /timestamps`
|
| 306 |
|
| 307 |
+
Gets precise word-level (and optionally letter-level) timing for each word in the aligned segments.
|
| 308 |
|
| 309 |
| Parameter | Type | Default | Description |
|
| 310 |
|---|---|---|---|
|
|
|
|
| 318 |
"a1b2c3d4e5f67890a1b2c3d4e5f67890", # audio_id
|
| 319 |
None, # segments (null = use stored)
|
| 320 |
"words", # granularity
|
| 321 |
+
api_name="/timestamps",
|
| 322 |
)
|
| 323 |
```
|
| 324 |
|
|
|
|
| 330 |
{"time_from": 0.48, "time_to": 2.88, "ref_from": "112:1:1", "ref_to": "112:1:4"},
|
| 331 |
{"time_from": 3.12, "time_to": 5.44, "ref_from": "112:2:1", "ref_to": "112:2:3"},
|
| 332 |
],
|
| 333 |
+
"words+chars",
|
| 334 |
+
api_name="/timestamps",
|
| 335 |
)
|
| 336 |
```
|
| 337 |
|
|
|
|
| 345 |
|
| 346 |
| Field | Type | Required | Description |
|
| 347 |
|---|---|---|---|
|
| 348 |
+
| `time_from` | float | yes | Start time in seconds |
|
| 349 |
+
| `time_to` | float | yes | End time in seconds |
|
| 350 |
| `ref_from` | str | yes | First word as `"surah:ayah:word"`. Empty for special segments |
|
| 351 |
| `ref_to` | str | yes | Last word as `"surah:ayah:word"`. Empty for special segments |
|
| 352 |
| `segment` | int | no | 1-indexed segment number. Auto-assigned from position if omitted |
|
|
|
|
| 387 |
|
| 388 |
---
|
| 389 |
|
| 390 |
+
### `POST /timestamps_direct`
|
| 391 |
|
| 392 |
+
Same as `/timestamps` but accepts an audio file directly — no session needed.
|
| 393 |
|
| 394 |
| Parameter | Type | Default | Description |
|
| 395 |
|---|---|---|---|
|
| 396 |
+
| `audio` | file | required | Audio file (any common format) |
|
| 397 |
| `segments` | list | required | Segment list with `time_from`/`time_to` boundaries |
|
| 398 |
| `granularity` | str | `"words"` | `"words"` or `"words+chars"` |
|
| 399 |
|
| 400 |
+
**Response:** Same shape as `/timestamps` but without `audio_id`.
|
| 401 |
|
| 402 |
**Example (minimal):**
|
| 403 |
```python
|
|
|
|
| 408 |
{"time_from": 3.12, "time_to": 5.44, "ref_from": "112:2:1", "ref_to": "112:2:3"},
|
| 409 |
],
|
| 410 |
"words+chars",
|
| 411 |
+
api_name="/timestamps_direct",
|
| 412 |
)
|
| 413 |
```
|
| 414 |
|
| 415 |
+
Segment input format is the same as for `/timestamps` — see above.
|
src/api/session_api.py
CHANGED
|
@@ -188,14 +188,14 @@ _SESSION_ERROR = {"error": "Session not found or expired", "segments": []}
|
|
| 188 |
|
| 189 |
_ESTIMABLE_ENDPOINTS = {
|
| 190 |
"process_audio_session",
|
| 191 |
-
"
|
| 192 |
-
"
|
| 193 |
"realign_from_timestamps",
|
| 194 |
-
"
|
| 195 |
-
"
|
| 196 |
}
|
| 197 |
|
| 198 |
-
_MFA_ENDPOINTS = {"
|
| 199 |
_VAD_ENDPOINTS = {"process_audio_session"}
|
| 200 |
|
| 201 |
|
|
@@ -347,7 +347,7 @@ def process_audio_session(audio_data, min_silence_ms, min_speech_ms, pad_ms,
|
|
| 347 |
return _format_response(audio_id, json_output, warning=quota_warning)
|
| 348 |
|
| 349 |
|
| 350 |
-
def resegment_session(audio_id, min_silence_ms, min_speech_ms, pad_ms,
|
| 351 |
model_name="Base", device="GPU",
|
| 352 |
request: gr.Request = None):
|
| 353 |
"""Re-clean VAD boundaries with new params and re-run ASR + alignment."""
|
|
@@ -383,7 +383,7 @@ def resegment_session(audio_id, min_silence_ms, min_speech_ms, pad_ms,
|
|
| 383 |
return _format_response(audio_id, json_output, warning=quota_warning)
|
| 384 |
|
| 385 |
|
| 386 |
-
def retranscribe_session(audio_id, model_name="Base", device="GPU",
|
| 387 |
request: gr.Request = None):
|
| 388 |
"""Re-run ASR with a different model on current segment boundaries."""
|
| 389 |
session = load_session(audio_id)
|
|
@@ -395,7 +395,7 @@ def retranscribe_session(audio_id, model_name="Base", device="GPU",
|
|
| 395 |
and _intervals_hash(session["intervals"]) == session["intervals_hash"]):
|
| 396 |
return {
|
| 397 |
"audio_id": audio_id,
|
| 398 |
-
"error": "Model and boundaries unchanged. Change model_name or call /
|
| 399 |
"segments": [],
|
| 400 |
}
|
| 401 |
|
|
@@ -557,7 +557,7 @@ def _normalize_segments(segments):
|
|
| 557 |
# MFA timestamp endpoints
|
| 558 |
# ---------------------------------------------------------------------------
|
| 559 |
|
| 560 |
-
def mfa_timestamps_session(audio_id, segments_json=None, granularity="words"):
|
| 561 |
"""Compute MFA word/letter timestamps using session audio."""
|
| 562 |
session = load_session(audio_id)
|
| 563 |
if session is None:
|
|
@@ -590,7 +590,7 @@ def mfa_timestamps_session(audio_id, segments_json=None, granularity="words"):
|
|
| 590 |
return result
|
| 591 |
|
| 592 |
|
| 593 |
-
def mfa_timestamps_direct(audio_data, segments_json, granularity="words"):
|
| 594 |
"""Compute MFA word/letter timestamps with provided audio and segments."""
|
| 595 |
# Parse segments
|
| 596 |
if isinstance(segments_json, str):
|
|
|
|
| 188 |
|
| 189 |
_ESTIMABLE_ENDPOINTS = {
|
| 190 |
"process_audio_session",
|
| 191 |
+
"resegment",
|
| 192 |
+
"retranscribe",
|
| 193 |
"realign_from_timestamps",
|
| 194 |
+
"timestamps",
|
| 195 |
+
"timestamps_direct",
|
| 196 |
}
|
| 197 |
|
| 198 |
+
_MFA_ENDPOINTS = {"timestamps", "timestamps_direct"}
|
| 199 |
_VAD_ENDPOINTS = {"process_audio_session"}
|
| 200 |
|
| 201 |
|
|
|
|
| 347 |
return _format_response(audio_id, json_output, warning=quota_warning)
|
| 348 |
|
| 349 |
|
| 350 |
+
def resegment(audio_id, min_silence_ms, min_speech_ms, pad_ms,
|
| 351 |
model_name="Base", device="GPU",
|
| 352 |
request: gr.Request = None):
|
| 353 |
"""Re-clean VAD boundaries with new params and re-run ASR + alignment."""
|
|
|
|
| 383 |
return _format_response(audio_id, json_output, warning=quota_warning)
|
| 384 |
|
| 385 |
|
| 386 |
+
def retranscribe(audio_id, model_name="Base", device="GPU",
|
| 387 |
request: gr.Request = None):
|
| 388 |
"""Re-run ASR with a different model on current segment boundaries."""
|
| 389 |
session = load_session(audio_id)
|
|
|
|
| 395 |
and _intervals_hash(session["intervals"]) == session["intervals_hash"]):
|
| 396 |
return {
|
| 397 |
"audio_id": audio_id,
|
| 398 |
+
"error": "Model and boundaries unchanged. Change model_name or call /resegment first.",
|
| 399 |
"segments": [],
|
| 400 |
}
|
| 401 |
|
|
|
|
| 557 |
# MFA timestamp endpoints
|
| 558 |
# ---------------------------------------------------------------------------
|
| 559 |
|
| 560 |
+
def timestamps(audio_id, segments_json=None, granularity="words"):
|
| 561 |
"""Compute MFA word/letter timestamps using session audio."""
|
| 562 |
session = load_session(audio_id)
|
| 563 |
if session is None:
|
|
|
|
| 590 |
return result
|
| 591 |
|
| 592 |
|
| 593 |
+
def timestamps_direct(audio_data, segments_json, granularity="words"):
|
| 594 |
"""Compute MFA word/letter timestamps with provided audio and segments."""
|
| 595 |
# Parse segments
|
| 596 |
if isinstance(segments_json, str):
|
src/ui/event_wiring.py
CHANGED
|
@@ -9,9 +9,9 @@ from src.pipeline import (
|
|
| 9 |
)
|
| 10 |
from src.api.session_api import (
|
| 11 |
estimate_duration,
|
| 12 |
-
process_audio_session,
|
| 13 |
-
|
| 14 |
-
|
| 15 |
)
|
| 16 |
from src.mfa import compute_mfa_timestamps
|
| 17 |
from src.ui.progress_bar import pipeline_progress_bar_html
|
|
@@ -186,7 +186,7 @@ def _wire_resegment_chain(c):
|
|
| 186 |
request: gr.Request = None):
|
| 187 |
# Compute estimate and show progress bar
|
| 188 |
audio_dur = len(audio) / 16000 if audio is not None and hasattr(audio, '__len__') else None
|
| 189 |
-
est = estimate_duration("
|
| 190 |
est_s = est.get("estimated_duration_s") or 15
|
| 191 |
bar_html = pipeline_progress_bar_html(est_s)
|
| 192 |
|
|
@@ -261,7 +261,7 @@ def _wire_retranscribe_chain(c):
|
|
| 261 |
request: gr.Request = None):
|
| 262 |
# Compute estimate and show progress bar
|
| 263 |
audio_dur = len(audio) / 16000 if audio is not None and hasattr(audio, '__len__') else None
|
| 264 |
-
est = estimate_duration("
|
| 265 |
est_s = est.get("estimated_duration_s") or 15
|
| 266 |
bar_html = pipeline_progress_bar_html(est_s)
|
| 267 |
|
|
@@ -540,17 +540,17 @@ def _wire_api_endpoint(c):
|
|
| 540 |
api_name="process_audio_session",
|
| 541 |
)
|
| 542 |
gr.Button(visible=False).click(
|
| 543 |
-
fn=
|
| 544 |
inputs=[c.api_audio_id, c.api_silence, c.api_speech, c.api_pad,
|
| 545 |
c.api_model, c.api_device],
|
| 546 |
outputs=[c.api_result],
|
| 547 |
-
api_name="
|
| 548 |
)
|
| 549 |
gr.Button(visible=False).click(
|
| 550 |
-
fn=
|
| 551 |
inputs=[c.api_audio_id, c.api_model, c.api_device],
|
| 552 |
outputs=[c.api_result],
|
| 553 |
-
api_name="
|
| 554 |
)
|
| 555 |
gr.Button(visible=False).click(
|
| 556 |
fn=realign_from_timestamps,
|
|
@@ -559,16 +559,16 @@ def _wire_api_endpoint(c):
|
|
| 559 |
api_name="realign_from_timestamps",
|
| 560 |
)
|
| 561 |
gr.Button(visible=False).click(
|
| 562 |
-
fn=
|
| 563 |
inputs=[c.api_audio_id, c.api_mfa_segments, c.api_mfa_granularity],
|
| 564 |
outputs=[c.api_result],
|
| 565 |
-
api_name="
|
| 566 |
)
|
| 567 |
gr.Button(visible=False).click(
|
| 568 |
-
fn=
|
| 569 |
inputs=[c.api_audio, c.api_mfa_segments, c.api_mfa_granularity],
|
| 570 |
outputs=[c.api_result],
|
| 571 |
-
api_name="
|
| 572 |
)
|
| 573 |
|
| 574 |
|
|
|
|
| 9 |
)
|
| 10 |
from src.api.session_api import (
|
| 11 |
estimate_duration,
|
| 12 |
+
process_audio_session, resegment,
|
| 13 |
+
retranscribe, realign_from_timestamps,
|
| 14 |
+
timestamps, timestamps_direct,
|
| 15 |
)
|
| 16 |
from src.mfa import compute_mfa_timestamps
|
| 17 |
from src.ui.progress_bar import pipeline_progress_bar_html
|
|
|
|
| 186 |
request: gr.Request = None):
|
| 187 |
# Compute estimate and show progress bar
|
| 188 |
audio_dur = len(audio) / 16000 if audio is not None and hasattr(audio, '__len__') else None
|
| 189 |
+
est = estimate_duration("resegment", audio_dur, model_name=model, device=device)
|
| 190 |
est_s = est.get("estimated_duration_s") or 15
|
| 191 |
bar_html = pipeline_progress_bar_html(est_s)
|
| 192 |
|
|
|
|
| 261 |
request: gr.Request = None):
|
| 262 |
# Compute estimate and show progress bar
|
| 263 |
audio_dur = len(audio) / 16000 if audio is not None and hasattr(audio, '__len__') else None
|
| 264 |
+
est = estimate_duration("retranscribe", audio_dur, model_name=model_name, device=device)
|
| 265 |
est_s = est.get("estimated_duration_s") or 15
|
| 266 |
bar_html = pipeline_progress_bar_html(est_s)
|
| 267 |
|
|
|
|
| 540 |
api_name="process_audio_session",
|
| 541 |
)
|
| 542 |
gr.Button(visible=False).click(
|
| 543 |
+
fn=resegment,
|
| 544 |
inputs=[c.api_audio_id, c.api_silence, c.api_speech, c.api_pad,
|
| 545 |
c.api_model, c.api_device],
|
| 546 |
outputs=[c.api_result],
|
| 547 |
+
api_name="resegment",
|
| 548 |
)
|
| 549 |
gr.Button(visible=False).click(
|
| 550 |
+
fn=retranscribe,
|
| 551 |
inputs=[c.api_audio_id, c.api_model, c.api_device],
|
| 552 |
outputs=[c.api_result],
|
| 553 |
+
api_name="retranscribe",
|
| 554 |
)
|
| 555 |
gr.Button(visible=False).click(
|
| 556 |
fn=realign_from_timestamps,
|
|
|
|
| 559 |
api_name="realign_from_timestamps",
|
| 560 |
)
|
| 561 |
gr.Button(visible=False).click(
|
| 562 |
+
fn=timestamps,
|
| 563 |
inputs=[c.api_audio_id, c.api_mfa_segments, c.api_mfa_granularity],
|
| 564 |
outputs=[c.api_result],
|
| 565 |
+
api_name="timestamps",
|
| 566 |
)
|
| 567 |
gr.Button(visible=False).click(
|
| 568 |
+
fn=timestamps_direct,
|
| 569 |
inputs=[c.api_audio, c.api_mfa_segments, c.api_mfa_granularity],
|
| 570 |
outputs=[c.api_result],
|
| 571 |
+
api_name="timestamps_direct",
|
| 572 |
)
|
| 573 |
|
| 574 |
|