File size: 1,798 Bytes
09c4aca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
"""Turn-taking benchmark submission interface. See README.md for usage.

Implement ONE track:
- DiscriminativeModel — you declare the floor per timestep; we run it streaming.
- GenerativeModel — you produce audio; we VAD it into the floor track.
"""

from typing import Literal, Protocol, runtime_checkable

import numpy as np
from numpy.typing import NDArray

# Benchmark feed rate, shared by the corpus and the model input.
# (Generative output may be at any rate.)
SAMPLE_RATE = 48_000

# One chunk of audio: 1-D mono float32, shape (n_samples,), values in [-1, 1].
AudioChunk = NDArray[np.float32]

# Floor bit for the `subject` channel: 1 = subject holds the floor,
# 0 = subject does not. `subject` is the channel whose floor you predict and
# `other` is the conversation partner. Subject is a role, not a fixed
# speaker — the runner puts each speaker in the subject slot in turn.
FloorBit = Literal[0, 1]


@runtime_checkable
class DiscriminativeModel(Protocol):
    """Discriminative track: the model declares the floor per timestep.

    The benchmark runs the model in a streaming fashion, one `step` call
    at a time.
    """

    # Sample rate the runner resamples the audio to before every call.
    input_sample_rate: int

    def reset(self) -> None:
        """Clear ALL streaming state.

        Called before each pass over a conversation (once per subject slot).
        """

    def step(
        self,
        subject_audio: AudioChunk,
        other_audio: AudioChunk,
    ) -> FloorBit:
        """Take one chunk from each channel and return the subject's floor bit."""


@runtime_checkable
class GenerativeModel(Protocol):
    """Generative track: the model produces audio.

    The benchmark runs VAD over that audio to obtain the floor track.
    """

    # Sample rate of the audio returned by generate(). Any value is fine;
    # it only has to be reported here.
    output_sample_rate: int

    def reset(self) -> None:
        """Clear ALL streaming state.

        Called before each pass over a conversation (once per subject slot).
        """

    def generate(self, subject_audio: AudioChunk) -> AudioChunk:
        """Return the model's raw audio response for `subject_audio`.

        The response must cover the same DURATION as `subject_audio`; the
        benchmark asserts
        len(output) / output_sample_rate == len(subject_audio) / SAMPLE_RATE.
        """