mazesmazes committed on
Commit
627eea2
·
verified ·
1 Parent(s): a207581

Update custom model files, README, and requirements

Browse files
Files changed (3) hide show
  1. .gitattributes +0 -1
  2. asr_config.py +8 -1
  3. s2s_pipeline.py +532 -0
.gitattributes CHANGED
@@ -1,4 +1,3 @@
1
  *.safetensors filter=lfs diff=lfs merge=lfs -text
2
  *.bin filter=lfs diff=lfs merge=lfs -text
3
  tokenizer_config.json -filter -diff -merge text
4
- tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
1
  *.safetensors filter=lfs diff=lfs merge=lfs -text
2
  *.bin filter=lfs diff=lfs merge=lfs -text
3
  tokenizer_config.json -filter -diff -merge text
 
asr_config.py CHANGED
@@ -186,9 +186,16 @@ class ASRConfig(transformers.PretrainedConfig):
186
  "pt": ["AutoModelForSpeechSeq2Seq"],
187
  "tf": [],
188
  "type": "audio",
189
- }
 
 
 
 
 
 
190
  }
191
  self.architectures = ["ASRModel"]
 
192
  self.pipeline_tag = "automatic-speech-recognition"
193
 
194
 
 
186
  "pt": ["AutoModelForSpeechSeq2Seq"],
187
  "tf": [],
188
  "type": "audio",
189
+ },
190
+ "speech-to-speech": {
191
+ "impl": "s2s_pipeline.SpeechToSpeechPipeline",
192
+ "pt": ["AutoModelForSpeechSeq2Seq"],
193
+ "tf": [],
194
+ "type": "audio",
195
+ },
196
  }
197
  self.architectures = ["ASRModel"]
198
+ # Default pipeline is ASR, but model also supports speech-to-speech
199
  self.pipeline_tag = "automatic-speech-recognition"
200
 
201
 
s2s_pipeline.py ADDED
@@ -0,0 +1,532 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Speech-to-Speech pipeline for audio-in, audio-out generation.
2
+
3
+ This pipeline combines ASR (speech-to-text) with TTS (text-to-speech) to create
4
+ a unified speech-to-speech interface that can be used with HuggingFace's pipeline API.
5
+
6
+ Usage:
7
+ from transformers import pipeline
8
+
9
+ # Load as speech-to-speech pipeline
10
+ pipe = pipeline("speech-to-speech", model="mazesmazes/tiny-audio-omni", trust_remote_code=True)
11
+
12
+ # Process audio (outputs 48kHz by default for browser compatibility)
13
+ result = pipe("audio.wav")
14
+ # Returns: {"text": "transcription", "audio": np.array, "sampling_rate": 48000}
15
+
16
+ # With custom TTS voice
17
+ result = pipe("audio.wav", tts_voice="af_bella")
18
+
19
+ # Output at native TTS rate (24kHz) without resampling
20
+ result = pipe("audio.wav", output_sample_rate=24000)
21
+
22
+ # Get only audio output (for streaming/playback)
23
+ audio, sr = result["audio"], result["sampling_rate"]
24
+
25
+ # Streaming with built-in VAD (Voice Activity Detection)
26
+ for result in pipe.stream(audio_chunk_generator()):
27
+ print(result["text"])
28
+ play_audio(result["audio"], result["sampling_rate"])
29
+ """
30
+
31
+ from collections.abc import Generator, Iterator
32
+ from dataclasses import dataclass, field
33
+ from pathlib import Path
34
+ from typing import Any
35
+
36
+ import numpy as np
37
+ import scipy.signal
38
+ import torch
39
+ from transformers import Pipeline
40
+ from transformers.pipelines.audio_utils import ffmpeg_read
41
+
42
+ try:
43
+ from .asr_modeling import ASRModel
44
+ from .asr_pipeline import _truncate_repetitions, strip_thinking
45
+ except ImportError:
46
+ from asr_modeling import ASRModel # type: ignore[no-redef]
47
+ from asr_pipeline import _truncate_repetitions, strip_thinking # type: ignore[no-redef]
48
+
49
+ __all__ = ["SpeechToSpeechPipeline", "VADConfig"]
50
+
51
+ # Default TTS settings
52
+ DEFAULT_TTS_VOICE = "af_heart"
53
+ TTS_SAMPLE_RATE = 24000 # Native Kokoro TTS sample rate
54
+ DEFAULT_OUTPUT_SAMPLE_RATE = 48000 # Browser-friendly sample rate
55
+
56
+ # Default VAD settings
57
+ DEFAULT_VAD_THRESHOLD = 0.5
58
+ DEFAULT_SILENCE_DURATION_MS = 700
59
+ DEFAULT_INPUT_SAMPLE_RATE = 16000
60
+
61
+
62
+ @dataclass
63
+ class VADConfig:
64
+ """Configuration for Voice Activity Detection.
65
+
66
+ Args:
67
+ threshold: VAD probability threshold (0.0-1.0). Higher = stricter.
68
+ silence_duration_ms: Milliseconds of silence before end-of-speech.
69
+ sample_rate: Expected input audio sample rate.
70
+ """
71
+
72
+ threshold: float = DEFAULT_VAD_THRESHOLD
73
+ silence_duration_ms: int = DEFAULT_SILENCE_DURATION_MS
74
+ sample_rate: int = DEFAULT_INPUT_SAMPLE_RATE
75
+
76
+
77
+ @dataclass
78
+ class _VADState:
79
+ """Internal state for VAD streaming."""
80
+
81
+ is_speaking: bool = False
82
+ silence_frames: int = 0
83
+ audio_buffer: list[np.ndarray] = field(default_factory=list)
84
+
85
+ def reset(self):
86
+ """Reset state after processing an utterance."""
87
+ self.is_speaking = False
88
+ self.silence_frames = 0
89
+ self.audio_buffer = []
90
+
91
+
92
class SpeechToSpeechPipeline(Pipeline):
    """HuggingFace pipeline for speech-to-speech generation.

    This pipeline takes audio input, transcribes it using an ASR model,
    and synthesizes the response as speech using Kokoro TTS.

    Args:
        model: ASRModel instance for transcription
        tts_voice: Default Kokoro TTS voice ID (default: "af_heart")
        output_sample_rate: Output audio sample rate (default: 48000 for browser compatibility)
        vad_config: Default VAD settings used by :meth:`stream`
        **kwargs: Additional arguments passed to Pipeline base class

    Example:
        >>> from transformers import pipeline
        >>> pipe = pipeline("speech-to-speech", model="mazesmazes/tiny-audio-omni", trust_remote_code=True)
        >>> result = pipe("audio.wav")
        >>> result["text"]  # Transcription/response text
        >>> result["audio"]  # Audio as numpy array (48kHz)
        >>> result["sampling_rate"]  # 48000
    """

    model: ASRModel

    def __init__(
        self,
        model: ASRModel,
        tts_voice: str = DEFAULT_TTS_VOICE,
        output_sample_rate: int = DEFAULT_OUTPUT_SAMPLE_RATE,
        vad_config: VADConfig | None = None,
        **kwargs,
    ):
        """Initialize Speech-to-Speech pipeline."""
        feature_extractor = kwargs.pop("feature_extractor", None)
        tokenizer = kwargs.pop("tokenizer", model.tokenizer)

        if feature_extractor is None:
            feature_extractor = model.get_processor().feature_extractor

        super().__init__(
            model=model,
            feature_extractor=feature_extractor,
            tokenizer=tokenizer,
            **kwargs,
        )
        self.tts_voice = tts_voice
        self.output_sample_rate = output_sample_rate
        self.vad_config = vad_config or VADConfig()
        # Heavy dependencies (Kokoro TTS, Silero VAD) are loaded lazily on
        # first use so text-only / ASR-only usage never pays their cost.
        self._tts_pipeline = None
        self._vad_model = None
        self._vad_utils = None

    @property
    def tts_pipeline(self):
        """Lazy-load Kokoro TTS pipeline on first use."""
        if self._tts_pipeline is None:
            try:
                from kokoro import KPipeline

                self._tts_pipeline = KPipeline(lang_code="a", repo_id="hexgrad/Kokoro-82M")
            except ImportError as e:
                raise ImportError(
                    "Kokoro TTS is required for speech-to-speech. "
                    "Install with: pip install kokoro>=0.9.2\n"
                    "Also requires espeak-ng: apt-get install espeak-ng"
                ) from e
        return self._tts_pipeline

    @property
    def vad_model(self):
        """Lazy-load Silero VAD model on first use."""
        if self._vad_model is None:
            # torch.hub caches the checkout; force_reload=False reuses it.
            self._vad_model, self._vad_utils = torch.hub.load(
                repo_or_dir="snakers4/silero-vad",
                model="silero_vad",
                force_reload=False,
            )
        return self._vad_model

    @property
    def vad_utils(self):
        """Get VAD utilities (loads model if needed)."""
        if self._vad_utils is None:
            # Access vad_model to trigger loading
            _ = self.vad_model
        return self._vad_utils

    def stream(
        self,
        audio_chunks: Iterator[np.ndarray],
        tts_voice: str | None = None,
        output_sample_rate: int | None = None,
        vad_config: VADConfig | None = None,
    ) -> Generator[dict[str, Any], None, None]:
        """Process streaming audio with VAD and yield responses.

        Takes an iterator of audio chunks, detects speech using Silero VAD,
        and yields responses when speech ends (after silence threshold).
        Any utterance still in progress when the iterator is exhausted is
        flushed as a final result.

        Args:
            audio_chunks: Iterator yielding audio chunks as numpy arrays (float32, 16kHz).
                Each chunk should be ~100-500ms of audio.
            tts_voice: Kokoro voice ID for TTS output (default: self.tts_voice)
            output_sample_rate: Output sample rate (default: self.output_sample_rate)
            vad_config: VAD configuration (default: self.vad_config)

        Yields:
            Dict with 'text', 'audio', and 'sampling_rate' for each detected utterance.

        Example:
            >>> def audio_generator():
            ...     while True:
            ...         chunk = get_audio_chunk()  # Get ~100ms of audio
            ...         if chunk is None:
            ...             break
            ...         yield chunk
            >>> for result in pipe.stream(audio_generator()):
            ...     print(result["text"])
            ...     play_audio(result["audio"], result["sampling_rate"])
        """
        config = vad_config or self.vad_config
        voice = tts_voice or self.tts_voice
        target_sr = output_sample_rate or self.output_sample_rate

        state = _VADState()
        vad_utils = self.vad_utils
        if vad_utils is None:
            raise RuntimeError("Failed to load Silero VAD model")
        # Silero's utils tuple puts get_speech_timestamps first.
        get_speech_timestamps = vad_utils[0]

        # Silence threshold expressed in chunks; refined per chunk below once
        # the actual chunk duration is known.
        chunk_duration_ms = 100
        silence_threshold = max(1, config.silence_duration_ms // chunk_duration_ms)

        for chunk in audio_chunks:
            # BUGFIX: an empty chunk would crash chunk.max() below with
            # ValueError (and divide by zero in the duration estimate).
            if chunk.size == 0:
                continue

            # Ensure chunk is float32
            if chunk.dtype != np.float32:
                chunk = chunk.astype(np.float32)

            # Normalize if needed (int16 range to float32)
            if chunk.max() > 1.0 or chunk.min() < -1.0:
                chunk = chunk / 32768.0

            # Update chunk duration estimate for silence threshold
            chunk_duration_ms = len(chunk) / config.sample_rate * 1000
            silence_threshold = max(1, int(config.silence_duration_ms / chunk_duration_ms))

            # Run VAD
            speech_timestamps = get_speech_timestamps(
                torch.from_numpy(chunk),
                self.vad_model,
                sampling_rate=config.sample_rate,
                threshold=config.threshold,
            )
            has_speech = len(speech_timestamps) > 0

            if has_speech:
                if not state.is_speaking:
                    state.is_speaking = True
                    state.audio_buffer = []
                state.audio_buffer.append(chunk)
                state.silence_frames = 0
            elif state.is_speaking:
                # Keep trailing silence in the buffer for natural utterance ends.
                state.audio_buffer.append(chunk)
                state.silence_frames += 1

                if state.silence_frames >= silence_threshold:
                    # End of speech detected - process the utterance
                    if state.audio_buffer:
                        yield self._process_utterance(state, config, voice, target_sr)
                    state.reset()

        # BUGFIX: flush an utterance still buffered when the input stream ends,
        # so the final utterance is not silently dropped.
        if state.is_speaking and state.audio_buffer:
            yield self._process_utterance(state, config, voice, target_sr)
            state.reset()

    def _process_utterance(
        self,
        state: _VADState,
        config: VADConfig,
        voice: str,
        target_sr: int,
    ) -> dict[str, Any]:
        """Run the full pipeline on the audio buffered in *state*."""
        full_audio = np.concatenate(state.audio_buffer)
        return self(
            {"array": full_audio, "sampling_rate": config.sample_rate},
            tts_voice=voice,
            output_sample_rate=target_sr,
        )

    def _sanitize_parameters(
        self,
        tts_voice: str | None = None,
        output_sample_rate: int | None = None,
        return_text_only: bool = False,
        user_prompt: str | None = None,
        **kwargs,
    ) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
        """Sanitize and route parameters to preprocessing, forward, and postprocessing."""
        preprocess_kwargs: dict[str, Any] = {}
        forward_kwargs: dict[str, Any] = {}
        postprocess_kwargs: dict[str, Any] = {}

        if tts_voice is not None:
            postprocess_kwargs["tts_voice"] = tts_voice
        if output_sample_rate is not None:
            postprocess_kwargs["output_sample_rate"] = output_sample_rate
        if return_text_only:
            postprocess_kwargs["return_text_only"] = return_text_only
        if user_prompt is not None:
            forward_kwargs["user_prompt"] = user_prompt

        return preprocess_kwargs, forward_kwargs, postprocess_kwargs

    def preprocess(self, inputs, **kwargs) -> dict[str, Any]:
        """Preprocess audio inputs for the model.

        Handles various input formats:
        - File path (str)
        - Dict with 'array' and 'sampling_rate'
        - Dict with 'raw' audio bytes
        - Raw numpy array
        - Bytes

        Returns:
            Dict with input_features and attention_mask for the model
        """
        # Extract audio array from various formats
        audio_array = self._extract_audio(inputs)

        if audio_array is None:
            raise ValueError(f"Could not extract audio from input type: {type(inputs)}")

        target_sr = self.feature_extractor.sampling_rate

        # BUGFIX: honor a caller-supplied sampling rate. Previously dict inputs
        # with a non-native rate were fed to the feature extractor unresampled.
        if isinstance(inputs, dict):
            input_sr = inputs.get("sampling_rate")
            if input_sr and input_sr != target_sr:
                audio_array = self._resample_audio(audio_array, input_sr, target_sr)

        # Use feature extractor to get mel features
        processed = self.feature_extractor(
            audio_array,
            sampling_rate=target_sr,
            return_tensors="pt",
            return_attention_mask=True,
        )

        return {
            "input_features": processed.input_features,
            "attention_mask": processed.attention_mask,
        }

    def _forward(self, model_inputs: dict, user_prompt: str | None = None) -> dict[str, Any]:
        """Run ASR model to generate text from audio.

        Args:
            model_inputs: Dict with input_features and attention_mask
            user_prompt: Optional custom prompt for the model

        Returns:
            Dict with generated token IDs
        """
        input_features = model_inputs["input_features"].to(self.model.device)
        attention_mask = model_inputs["attention_mask"].to(self.model.device)

        # Set custom prompt if provided; restored in `finally` even if
        # generation raises, so a failed call cannot leak the prompt override.
        original_prompt = None
        if user_prompt:
            original_prompt = self.model.TRANSCRIBE_PROMPT
            self.model.TRANSCRIBE_PROMPT = user_prompt

        try:
            generated_ids = self.model.generate(
                input_features=input_features,
                audio_attention_mask=attention_mask,
            )
        finally:
            if original_prompt is not None:
                self.model.TRANSCRIBE_PROMPT = original_prompt

        return {"tokens": generated_ids}

    def postprocess(
        self,
        model_outputs: dict,
        tts_voice: str | None = None,
        output_sample_rate: int | None = None,
        return_text_only: bool = False,
    ) -> dict[str, Any]:
        """Convert model output to text and synthesize speech.

        Args:
            model_outputs: Dict with 'tokens' containing generated IDs
            tts_voice: Kokoro voice ID (default: self.tts_voice)
            output_sample_rate: Output sample rate (default: self.output_sample_rate)
            return_text_only: If True, skip TTS and return only text

        Returns:
            Dict with 'text', 'audio' (numpy array), and 'sampling_rate'
        """
        target_sr = output_sample_rate or self.output_sample_rate
        tokens = model_outputs.get("tokens")

        if tokens is None:
            return {
                "text": "",
                "audio": np.array([], dtype=np.float32),
                "sampling_rate": target_sr,
            }

        # Convert tokens to text
        if torch.is_tensor(tokens):
            tokens = tokens.cpu()
            if tokens.dim() > 1:
                tokens = tokens[0]

        # Filter EOS tokens
        if hasattr(self.model, "generation_config") and self.model.generation_config is not None:
            eos_ids = self.model.generation_config.eos_token_id
            if eos_ids is not None:
                eos_set = set(eos_ids) if isinstance(eos_ids, list) else {eos_ids}
                # BUGFIX: tokens may already be a plain sequence here; only
                # tensors have .tolist().
                token_list = tokens.tolist() if torch.is_tensor(tokens) else list(tokens)
                tokens = [t for t in token_list if t not in eos_set]

        text = self.tokenizer.decode(tokens, skip_special_tokens=True).strip()
        text = strip_thinking(text)
        text = _truncate_repetitions(text)

        result = {"text": text}

        # Synthesize speech unless text-only requested
        if not return_text_only:
            voice = tts_voice or self.tts_voice
            audio = self._synthesize_speech(text, voice)
            # Resample if target sample rate differs from native TTS rate
            audio = self._resample_audio(audio, TTS_SAMPLE_RATE, target_sr)
            result["audio"] = audio
            result["sampling_rate"] = target_sr

        return result

    def _synthesize_speech(self, text: str, voice: str) -> np.ndarray:
        """Synthesize speech from text using Kokoro TTS.

        Args:
            text: Text to synthesize
            voice: Kokoro voice ID

        Returns:
            Audio as numpy array (float32, 24kHz native TTS rate).
            Empty audio on failure (best-effort), with a warning emitted.
        """
        if not text or not text.strip():
            return np.array([], dtype=np.float32)

        try:
            audio_chunks = []
            for _, _, audio in self.tts_pipeline(text, voice=voice):
                audio_chunks.append(audio)

            if audio_chunks:
                return np.concatenate(audio_chunks)
        except Exception as exc:
            # BUGFIX: this used to fail silently, making a TTS failure
            # indistinguishable from genuinely empty output. Keep best-effort
            # semantics (return empty audio) but surface the cause.
            import warnings

            warnings.warn(f"Kokoro TTS synthesis failed: {exc}", RuntimeWarning, stacklevel=2)

        return np.array([], dtype=np.float32)

    def _resample_audio(self, audio: np.ndarray, from_sr: int, to_sr: int) -> np.ndarray:
        """Resample audio to target sample rate.

        Args:
            audio: Input audio array
            from_sr: Source sample rate
            to_sr: Target sample rate

        Returns:
            Resampled audio array (float32); input returned unchanged when
            empty or already at the target rate.
        """
        if len(audio) == 0 or from_sr == to_sr:
            return audio

        num_samples = int(len(audio) * to_sr / from_sr)
        return scipy.signal.resample(audio, num_samples).astype(np.float32)

    def text_to_speech(
        self,
        text: str,
        voice: str | None = None,
        output_sample_rate: int | None = None,
    ) -> dict[str, Any]:
        """Convert text to speech using Kokoro TTS.

        This is a convenience method for generating audio from text without
        going through the full speech-to-speech pipeline.

        Args:
            text: Text to synthesize
            voice: Kokoro voice ID (default: self.tts_voice)
            output_sample_rate: Output sample rate (default: self.output_sample_rate)

        Returns:
            Dict with 'audio' (numpy array) and 'sampling_rate' keys
        """
        voice = voice or self.tts_voice
        target_sr = output_sample_rate or self.output_sample_rate
        audio = self._synthesize_speech(text, voice)
        audio = self._resample_audio(audio, TTS_SAMPLE_RATE, target_sr)
        return {"audio": audio, "sampling_rate": target_sr}

    def _extract_audio(self, inputs) -> np.ndarray | None:
        """Extract audio array from various input formats.

        Args:
            inputs: Audio input in various formats

        Returns:
            Audio as numpy array (float32) or None if extraction fails
        """
        if isinstance(inputs, dict):
            if "array" in inputs:
                audio = inputs["array"]
                if isinstance(audio, np.ndarray):
                    return audio.astype(np.float32) if audio.dtype != np.float32 else audio
                return np.array(audio, dtype=np.float32)
            if "raw" in inputs:
                audio = inputs["raw"]
                if isinstance(audio, np.ndarray):
                    return audio.astype(np.float32) if audio.dtype != np.float32 else audio
                return np.array(audio, dtype=np.float32)

        elif isinstance(inputs, str):
            # File path. Decode at the feature extractor's native rate
            # (previously hard-coded to 16000) so preprocess() gets audio at
            # the rate it actually feeds to the extractor.
            with Path(inputs).open("rb") as f:
                return ffmpeg_read(f.read(), sampling_rate=self.feature_extractor.sampling_rate)

        elif isinstance(inputs, bytes):
            return ffmpeg_read(inputs, sampling_rate=self.feature_extractor.sampling_rate)

        elif isinstance(inputs, np.ndarray):
            return inputs.astype(np.float32) if inputs.dtype != np.float32 else inputs

        return None

    def __call__(self, inputs, **kwargs) -> dict[str, Any]:
        """Process audio input and return speech output.

        Args:
            inputs: Audio input (file path, dict with array, numpy array, or bytes)
            tts_voice: Kokoro voice ID for TTS output (default: "af_heart")
            return_text_only: If True, skip TTS and return only transcription
            user_prompt: Custom prompt for the model

        Returns:
            Dict with:
                - 'text': Transcription/response text
                - 'audio': Synthesized speech as numpy array (float32)
                - 'sampling_rate': Output audio sample rate
        """
        return super().__call__(inputs, **kwargs)