| """Turn-taking benchmark submission interface. See README.md for usage. | |
| Implement ONE track: | |
| - DiscriminativeModel — you declare the floor per timestep; we run it streaming. | |
| - GenerativeModel — you produce audio; we VAD it into the floor track. | |
| """ | |
| from typing import Literal, Protocol, runtime_checkable | |
| import numpy as np | |
| from numpy.typing import NDArray | |
# Rate at which the benchmark feeds audio: it applies to the corpus and to
# model input. Generative *output* is exempt and may be at any rate.
SAMPLE_RATE = 48_000

# One chunk of audio: a 1-D mono float32 array of shape (n_samples,), with
# sample values in [-1, 1].
AudioChunk = NDArray[np.float32]

# Channel roles: `subject` is the channel whose floor you predict; `other` is
# the conversation partner. Subject is a role, not a fixed speaker — the runner
# puts each speaker in the subject slot in turn.
#
# Floor label: 1 = subject holds the floor, 0 = subject does not.
FloorBit = Literal[0, 1]
@runtime_checkable
class DiscriminativeModel(Protocol):
    """Discriminative track: the model itself declares the floor bit.

    The runner drives the model in a streaming fashion and collects one floor
    bit per `step` call.

    The protocol is runtime-checkable, so
    `isinstance(model, DiscriminativeModel)` can be used to sanity-check a
    submission. That check only verifies that the members below exist; it does
    not validate their signatures or types.
    """

    # Sample rate the model wants its input at; the runner resamples to this
    # rate before every call.
    input_sample_rate: int

    def reset(self) -> None:
        """Clear ALL streaming state.

        Called before each pass over a conversation (once per subject slot).
        """
        ...

    def step(self, subject_audio: AudioChunk, other_audio: AudioChunk) -> FloorBit:
        """Consume the next chunk of both channels and return the floor bit.

        Args:
            subject_audio: Audio of the channel whose floor is being predicted.
            other_audio: Audio of the conversation partner.

        Returns:
            1 if the subject holds the floor, 0 if it does not.
        """
        ...
@runtime_checkable
class GenerativeModel(Protocol):
    """Generative track: the model produces audio.

    The runner applies VAD to the returned audio to obtain the floor track.

    The protocol is runtime-checkable, so
    `isinstance(model, GenerativeModel)` can be used to sanity-check a
    submission. That check only verifies that the members below exist; it does
    not validate their signatures or types.
    """

    # The rate your generate() output is at — any value; just report it.
    output_sample_rate: int

    def reset(self) -> None:
        """Clear ALL streaming state.

        Called before each pass over a conversation (once per subject slot).
        """
        ...

    def generate(self, subject_audio: AudioChunk) -> AudioChunk:
        """Return the model's raw audio response to `subject_audio`.

        The output must cover the same DURATION as the input. The runner
        asserts:

            len(output) / output_sample_rate == len(subject_audio) / SAMPLE_RATE

        NOTE(review): the parameter is named `subject_audio`, while the
        module-level comment defines `subject` as the channel whose floor is
        predicted — confirm which channel the runner actually feeds here.

        Args:
            subject_audio: Input audio chunk.

        Returns:
            Generated audio at `output_sample_rate`.
        """
        ...