viks66
/

mimi-endpointer

Model card Files Files and versions

mimi-endpointer / contract.py

viks66's picture

add contract.py

09c4aca verified 21 days ago

History Blame Contribute Delete

1.8 kB

	"""Turn-taking benchmark submission interface. See README.md for usage.

	Implement ONE track:
	- DiscriminativeModel — you declare the floor per timestep; we run it streaming.
	- GenerativeModel — you produce audio; we VAD it into the floor track.
	"""

	from typing import Literal, Protocol, runtime_checkable

	import numpy as np
	from numpy.typing import NDArray

	# Benchmark feed rate: used for the corpus and for model input.
	# (Generative output may be at any rate.)
	SAMPLE_RATE = 48_000

	# One chunk of audio: 1-D mono float32, shape (n_samples,), values in [-1, 1].
	AudioChunk = NDArray[np.float32]
	# `subject` is the channel whose floor you predict; `other` is the conversation
	# partner. Subject is a role, not a fixed speaker — the runner puts each speaker
	# in the subject slot in turn.
	# Floor decision for the subject: 1 = subject holds the floor, 0 = it does not.
	FloorBit = Literal[0, 1]


	@runtime_checkable
	class DiscriminativeModel(Protocol):
	    """Discriminative track: the model declares the floor per timestep.

	    The runner runs the model in streaming fashion. `subject` is the channel
	    whose floor is predicted; `other` is the conversation partner.
	    """

	    # Sample rate the model wants; the runner resamples to this before every
	    # call.
	    input_sample_rate: int

	    def reset(self) -> None:
	        """Clear ALL streaming state.

	        Called before each pass over a conversation (once per subject slot).
	        """
	        ...

	    def step(self, subject_audio: AudioChunk, other_audio: AudioChunk) -> FloorBit:
	        """Take one chunk per channel and return the subject's floor bit.

	        Args:
	            subject_audio: Audio chunk from the subject channel.
	            other_audio: Audio chunk from the conversation partner's channel.

	        Returns:
	            1 if the subject holds the floor, 0 if the subject does not.
	        """
	        ...


	@runtime_checkable
	class GenerativeModel(Protocol):
	    """Generative track: the model produces audio for the subject channel.

	    The benchmark runs VAD over the produced audio to derive the floor track.
	    """

	    # The rate your generate() output is at — any value; just report it.
	    output_sample_rate: int

	    def reset(self) -> None:
	        """Clear ALL streaming state.

	        Called before each pass over a conversation (once per subject slot).
	        """
	        ...

	    def generate(self, subject_audio: AudioChunk) -> AudioChunk:
	        """Return the model's raw audio response to `subject_audio`.

	        The output must cover the same DURATION as the input. The runner
	        asserts:
	        len(output) / output_sample_rate == len(subject_audio) / SAMPLE_RATE.

	        Args:
	            subject_audio: Input audio chunk.

	        Returns:
	            Raw response audio at `output_sample_rate`.
	        """
	        ...