# Snapshot metadata: commit 09c4aca, file size 1,798 bytes.
"""Turn-taking benchmark submission interface. See README.md for usage.

Implement ONE track:

- DiscriminativeModel — you declare the floor per timestep; we run it streaming.
- GenerativeModel — you produce audio; we VAD it into the floor track.
"""
from typing import Literal, Protocol, runtime_checkable
import numpy as np
from numpy.typing import NDArray
# Benchmark feed rate, used for both the corpus and the model input.
# (Generative output may be at any rate.)
SAMPLE_RATE = 48_000

# One chunk of audio: 1-D mono float32, shape (n_samples,), values in [-1, 1].
AudioChunk = NDArray[np.float32]

# `subject` is the channel whose floor you predict; `other` is the conversation
# partner. Subject is a role, not a fixed speaker — the runner puts each speaker
# in the subject slot in turn.
#
# Floor bit values: 1 = subject holds the floor, 0 = subject does not.
FloorBit = Literal[0, 1]
@runtime_checkable
class DiscriminativeModel(Protocol):
    """Discriminative track: the model declares the subject's floor bit itself.

    The runner drives the model in a streaming fashion, calling ``step`` once
    per timestep with the audio of both channels.
    """

    # The runner resamples audio to this rate before every call.
    input_sample_rate: int

    def reset(self) -> None:
        """Clear ALL streaming state.

        Called before each pass over a conversation (once per subject slot).
        """
        ...

    def step(self, subject_audio: AudioChunk, other_audio: AudioChunk) -> FloorBit:
        """Return the floor bit for the current timestep.

        Args:
            subject_audio: Audio chunk of the channel whose floor is predicted.
            other_audio: Audio chunk of the conversation partner.

        Returns:
            1 if the subject holds the floor, 0 if it does not.
        """
        ...
@runtime_checkable
class GenerativeModel(Protocol):
    """Generative track: the model produces audio rather than floor bits.

    The benchmark runs VAD over the returned audio to derive the floor track.
    """

    # The rate your generate() output is at — any value; just report it.
    output_sample_rate: int

    def reset(self) -> None:
        """Clear ALL streaming state.

        Called before each pass over a conversation (once per subject slot).
        """
        ...

    def generate(self, subject_audio: AudioChunk) -> AudioChunk:
        """Return the model's raw audio response for one input chunk.

        The output must cover the same DURATION as ``subject_audio``; the
        runner asserts
        ``len(output) / output_sample_rate == len(subject_audio) / SAMPLE_RATE``.

        Args:
            subject_audio: Input audio chunk, at the benchmark feed rate.

        Returns:
            Generated audio at ``output_sample_rate``.
        """
        # NOTE(review): the input is named `subject_audio` even though the
        # model's own output is what gets VAD'd into the floor track — confirm
        # against README.md which channel the runner actually feeds here.
        ...
|