Update README.md
#4
by
GradientDescent2718
- opened
README.md
CHANGED
|
@@ -7,7 +7,7 @@
|
|
| 7 |
|
| 8 |
| Variant | File | Latency | Use Case |
|
| 9 |
|---------|------|---------|----------|
|
| 10 |
-
| **Default** | `Sortformer.mlmodelc` | ~1.
|
| 11 |
| **NVIDIA Low** | `SortformerNvidiaLow.mlmodelc` | ~1.04s | Low latency streaming |
|
| 12 |
| **NVIDIA High** | `SortformerNvidiaHigh.mlmodelc` | ~30.4s | Best quality, offline |
|
| 13 |
|
|
@@ -23,22 +23,43 @@
|
|
| 23 |
|
| 24 |
## Model Input/Output Shapes
|
| 25 |
|
| 26 |
-
|
| 27 |
|
| 28 |
| Input | Shape | Description |
|
| 29 |
|-------|-------|-------------|
|
| 30 |
-
| chunk | [1,
|
| 31 |
-
| chunk_lengths | [1] | Actual chunk length |
|
| 32 |
-
| spkcache | [1,
|
| 33 |
-
| spkcache_lengths | [1] | Actual cache length |
|
| 34 |
-
| fifo | [1,
|
| 35 |
-
| fifo_lengths | [1] | Actual FIFO length |
|
| 36 |
|
| 37 |
| Output | Shape | Description |
|
| 38 |
|--------|-------|-------------|
|
| 39 |
-
| speaker_preds | [
|
| 40 |
-
| chunk_pre_encoder_embs | [
|
| 41 |
-
| chunk_pre_encoder_lengths | [1] | Actual embedding count |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
## Usage with FluidAudio (Swift)
|
| 44 |
|
|
@@ -61,7 +82,7 @@
|
|
| 61 |
}
|
| 62 |
}
|
| 63 |
|
| 64 |
-
// Or
|
| 65 |
let timeline = try diarizer.processComplete(audioSamples)
|
| 66 |
for (speakerIndex, segments) in timeline.segments.enumerated() {
|
| 67 |
for segment in segments {
|
|
@@ -74,7 +95,7 @@
|
|
| 74 |
| Metric | Default | NVIDIA High |
|
| 75 |
|---------------|---------|-------------|
|
| 76 |
| Latency | ~1.12s | ~30.4s |
|
| 77 |
-
| RTFx (M4
|
| 78 |
|
| 79 |
Files
|
| 80 |
|
|
|
|
| 7 |
|
| 8 |
| Variant | File | Latency | Use Case |
|
| 9 |
|---------|------|---------|----------|
|
| 10 |
+
| **Default** | `Sortformer.mlmodelc` | ~1.04s | Low latency streaming |
|
| 11 |
| **NVIDIA Low** | `SortformerNvidiaLow.mlmodelc` | ~1.04s | Low latency streaming |
|
| 12 |
| **NVIDIA High** | `SortformerNvidiaHigh.mlmodelc` | ~30.4s | Best quality, offline |
|
| 13 |
|
|
|
|
| 23 |
|
| 24 |
## Model Input/Output Shapes
|
| 25 |
|
| 26 |
+
**General**:
|
| 27 |
|
| 28 |
| Input | Shape | Description |
|
| 29 |
|-------|-------|-------------|
|
| 30 |
+
| chunk | `[1, 8*(C+L+R), 128]` | Mel spectrogram features |
|
| 31 |
+
| chunk_lengths | `[1]` | Actual chunk length |
|
| 32 |
+
| spkcache | `[1, S, 512]` | Speaker cache embeddings |
|
| 33 |
+
| spkcache_lengths | `[1]` | Actual cache length |
|
| 34 |
+
| fifo | `[1, F, 512]` | FIFO queue embeddings |
|
| 35 |
+
| fifo_lengths | `[1]` | Actual FIFO length |
|
| 36 |
|
| 37 |
| Output | Shape | Description |
|
| 38 |
|--------|-------|-------------|
|
| 39 |
+
| speaker_preds | `[1, C+L+R+S+F, 4]` | Speaker probabilities (4 speakers) |
|
| 40 |
+
| chunk_pre_encoder_embs | `[1, C+L+R, 512]` | Embeddings for state update |
|
| 41 |
+
| chunk_pre_encoder_lengths | `[1]` | Actual embedding count |
|
| 42 |
+
| nest_encoder_embs | `[1, C+L+R+S+F, 192]` | Embeddings for speaker discrimination |
|
| 43 |
+
| nest_encoder_lengths | `[1]` | Actual speaker embedding count |
|
| 44 |
+
|
| 45 |
+
Note: `C = chunk_len`, `L = chunk_left_context`, `R = chunk_right_context`, `S = spkcache_len`, `F = fifo_len`.
|
| 46 |
+
|
| 47 |
+
**Configuration-Specific Shapes**:
|
| 48 |
+
|
| 49 |
+
| Input | Default | NVIDIA Low | NVIDIA High |
|-------|---------|------------|-------------|
|
| 50 |
+
| chunk | `[1, 112, 128]` | `[1, 112, 128]` | `[1, 3048, 128]` |
|
| 51 |
+
| chunk_lengths | `[1]` | `[1]` | `[1]` |
|
| 52 |
+
| spkcache | `[1, 188, 512]` | `[1, 188, 512]` | `[1, 188, 512]` |
|
| 53 |
+
| spkcache_lengths | `[1]` | `[1]` | `[1]` |
|
| 54 |
+
| fifo | `[1, 40, 512]` | `[1, 188, 512]` | `[1, 40, 512]` |
|
| 55 |
+
| fifo_lengths | `[1]` | `[1]` | `[1]` |
|
| 56 |
+
|
| 57 |
+
| Output | Default | NVIDIA Low | NVIDIA High |
|--------|---------|------------|-------------|
|
| 58 |
+
| speaker_preds | `[1, 242, 4]` | `[1, 390, 4]` | `[1, 609, 4]` |
|
| 59 |
+
| chunk_pre_encoder_embs | `[1, 14, 512]` | `[1, 14, 512]` | `[1, 381, 512]` |
|
| 60 |
+
| chunk_pre_encoder_lengths | `[1]` | `[1]` | `[1]` |
|
| 61 |
+
| nest_encoder_embs | `[1, 242, 192]` | `[1, 390, 192]` | `[1, 609, 192]` |
|
| 62 |
+
| nest_encoder_lengths | `[1]` | `[1]` | `[1]` |
|
| 63 |
|
| 64 |
## Usage with FluidAudio (Swift)
|
| 65 |
|
|
|
|
| 82 |
}
|
| 83 |
}
|
| 84 |
|
| 85 |
+
// Or process a complete file in a single call
|
| 86 |
let timeline = try diarizer.processComplete(audioSamples)
|
| 87 |
for (speakerIndex, segments) in timeline.segments.enumerated() {
|
| 88 |
for segment in segments {
|
|
|
|
| 95 |
| Metric | Default | NVIDIA High |
|
| 96 |
|---------------|---------|-------------|
|
| 97 |
| Latency | ~1.12s | ~30.4s |
|
| 98 |
+
| RTFx (M4 Max) | ~5.7x | ~125.3x |
|
| 99 |
|
| 100 |
Files
|
| 101 |
|