lag information in real time even when no audio is detected
whisperlivekit/audio_processor.py
CHANGED

@@ -292,6 +292,7 @@ class AudioProcessor:
         """Process audio chunks for transcription."""
         self.full_transcription = ""
         self.sep = self.online.asr.sep
+        cumulative_pcm_duration_stream_time = 0.0
 
         while True:
             try:
@@ -315,25 +316,38 @@
                 )
 
                 # Process transcription
-                self.online.insert_audio_chunk(pcm_array)
-                new_tokens = self.online.process_iter()
+                duration_this_chunk = len(pcm_array) / self.sample_rate if isinstance(pcm_array, np.ndarray) else 0
+                cumulative_pcm_duration_stream_time += duration_this_chunk
+                stream_time_end_of_current_pcm = cumulative_pcm_duration_stream_time
+
+                self.online.insert_audio_chunk(pcm_array, stream_time_end_of_current_pcm)
+                new_tokens, current_audio_processed_upto = self.online.process_iter()
 
                 if new_tokens:
                     self.full_transcription += self.sep.join([t.text for t in new_tokens])
 
                 # Get buffer information
-                _buffer = self.online.get_buffer()
-                buffer = _buffer.text
-                end_buffer = _buffer.end if _buffer.end else (
-                    new_tokens[-1].end if new_tokens else 0
-                )
+                _buffer_transcript_obj = self.online.get_buffer()
+                buffer_text = _buffer_transcript_obj.text
+
+                candidate_end_times = [self.end_buffer]
+
+                if new_tokens:
+                    candidate_end_times.append(new_tokens[-1].end)
+
+                if _buffer_transcript_obj.end is not None:
+                    candidate_end_times.append(_buffer_transcript_obj.end)
+
+                candidate_end_times.append(current_audio_processed_upto)
+
+                new_end_buffer = max(candidate_end_times)
 
                 # Avoid duplicating content
-                if buffer in self.full_transcription:
-                    buffer = ""
+                if buffer_text in self.full_transcription:
+                    buffer_text = ""
 
                 await self.update_transcription(
-                    new_tokens, buffer, end_buffer, self.full_transcription, self.sep
+                    new_tokens, buffer_text, new_end_buffer, self.full_transcription, self.sep
                 )
                 self.transcription_queue.task_done()
 
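Note: the loop above now keeps two clocks. `cumulative_pcm_duration_stream_time` tracks how much PCM has arrived (stream time), while `new_end_buffer` is the maximum of every available end timestamp, including `current_audio_processed_upto`, which `process_iter` reports even when a silent chunk commits no tokens. That maximum is what lets the displayed position keep advancing during silence. A minimal sketch of how a caller could turn the two clocks into a lag readout (illustrative only; `compute_lag_seconds` is a hypothetical helper, not part of this commit):

    def compute_lag_seconds(stream_time_end_of_current_pcm: float,
                            new_end_buffer: float) -> float:
        """Lag = PCM received so far (stream time) minus the timestamp
        the pipeline has processed up to (the end buffer)."""
        return max(0.0, stream_time_end_of_current_pcm - new_end_buffer)

    # e.g. 12.8 s of audio received, processed up to 11.5 s -> ~1.3 s of lag
    print(compute_lag_seconds(12.8, 11.5))
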
whisperlivekit/whisper_streaming_custom/online_asr.py
CHANGED

@@ -144,7 +144,11 @@ class OnlineASRProcessor:
         self.transcript_buffer.last_committed_time = self.buffer_time_offset
         self.committed: List[ASRToken] = []
 
-    def insert_audio_chunk(self, audio: np.ndarray):
+    def get_audio_buffer_end_time(self) -> float:
+        """Returns the absolute end time of the current audio_buffer."""
+        return self.buffer_time_offset + (len(self.audio_buffer) / self.SAMPLING_RATE)
+
+    def insert_audio_chunk(self, audio: np.ndarray, audio_stream_end_time: Optional[float] = None):
         """Append an audio chunk (a numpy array) to the current audio buffer."""
         self.audio_buffer = np.append(self.audio_buffer, audio)
 
@@ -179,18 +183,19 @@
         return self.concatenate_tokens(self.transcript_buffer.buffer)
 
 
-    def process_iter(self) -> List[ASRToken]:
+    def process_iter(self) -> Tuple[List[ASRToken], float]:
         """
         Processes the current audio buffer.
 
-        Returns a list of committed ASRToken objects.
+        Returns a tuple: (list of committed ASRToken objects, float representing the audio processed up to time).
         """
+        current_audio_processed_upto = self.get_audio_buffer_end_time()
         prompt_text, _ = self.prompt()
         logger.debug(
             f"Transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:.2f} seconds from {self.buffer_time_offset:.2f}"
         )
         res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt_text)
-        tokens = self.asr.ts_words(res)
+        tokens = self.asr.ts_words(res)
         self.transcript_buffer.insert(tokens, self.buffer_time_offset)
         committed_tokens = self.transcript_buffer.flush()
         self.committed.extend(committed_tokens)
@@ -210,7 +215,7 @@
         logger.debug(
             f"Length of audio buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:.2f} seconds"
         )
-        return committed_tokens
+        return committed_tokens, current_audio_processed_upto
 
     def chunk_completed_sentence(self):
         """
@@ -344,14 +349,16 @@
         sentences.append(sentence)
         return sentences
 
-    def finish(self) -> List[ASRToken]:
+    def finish(self) -> Tuple[List[ASRToken], float]:
         """
         Flush the remaining transcript when processing ends.
+        Returns a tuple: (list of remaining ASRToken objects, float representing the final audio processed up to time).
         """
         remaining_tokens = self.transcript_buffer.buffer
         logger.debug(f"Final non-committed tokens: {remaining_tokens}")
-        self.buffer_time_offset += len(self.audio_buffer) / self.SAMPLING_RATE
-        return remaining_tokens
+        final_processed_upto = self.buffer_time_offset + (len(self.audio_buffer) / self.SAMPLING_RATE)
+        self.buffer_time_offset = final_processed_upto
+        return remaining_tokens, final_processed_upto
 
     def concatenate_tokens(
         self,
@@ -393,28 +400,35 @@ class VACOnlineASRProcessor:
 
         self.vac = FixedVADIterator(model)
         self.logfile = self.online.logfile
+        self.last_input_audio_stream_end_time: float = 0.0
         self.init()
 
     def init(self):
        self.online.init()
        self.vac.reset_states()
        self.current_online_chunk_buffer_size = 0
+       self.last_input_audio_stream_end_time = self.online.buffer_time_offset
        self.is_currently_final = False
        self.status: Optional[str] = None  # "voice" or "nonvoice"
        self.audio_buffer = np.array([], dtype=np.float32)
        self.buffer_offset = 0  # in frames
 
+    def get_audio_buffer_end_time(self) -> float:
+        """Returns the absolute end time of the audio processed by the underlying OnlineASRProcessor."""
+        return self.online.get_audio_buffer_end_time()
+
     def clear_buffer(self):
         self.buffer_offset += len(self.audio_buffer)
         self.audio_buffer = np.array([], dtype=np.float32)
 
-    def insert_audio_chunk(self, audio: np.ndarray):
+    def insert_audio_chunk(self, audio: np.ndarray, audio_stream_end_time: float):
         """
         Process an incoming small audio chunk:
         - run VAD on the chunk,
         - decide whether to send the audio to the online ASR processor immediately,
         - and/or to mark the current utterance as finished.
         """
+        self.last_input_audio_stream_end_time = audio_stream_end_time
         res = self.vac(audio)
         self.audio_buffer = np.append(self.audio_buffer, audio)
 
@@ -456,10 +470,11 @@
         self.buffer_offset += max(0, len(self.audio_buffer) - self.SAMPLING_RATE)
         self.audio_buffer = self.audio_buffer[-self.SAMPLING_RATE:]
 
-    def process_iter(self) -> List[ASRToken]:
+    def process_iter(self) -> Tuple[List[ASRToken], float]:
         """
         Depending on the VAD status and the amount of accumulated audio,
         process the current audio chunk.
+        Returns a tuple: (list of committed ASRToken objects, float representing the audio processed up to time).
         """
         if self.is_currently_final:
             return self.finish()
@@ -468,14 +483,17 @@
             return self.online.process_iter()
         else:
             logger.debug("No online update, only VAD")
-            return []
+            return [], self.last_input_audio_stream_end_time
 
-    def finish(self) -> List[ASRToken]:
-        """Finish processing by flushing any remaining text."""
-        result = self.online.finish()
+    def finish(self) -> Tuple[List[ASRToken], float]:
+        """
+        Finish processing by flushing any remaining text.
+        Returns a tuple: (list of remaining ASRToken objects, float representing the final audio processed up to time).
+        """
+        result_tokens, processed_upto = self.online.finish()
         self.current_online_chunk_buffer_size = 0
         self.is_currently_final = False
-        return result
+        return result_tokens, processed_upto
 
     def get_buffer(self):
         """
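Taken together: `insert_audio_chunk` now accepts the absolute stream-time end of the incoming PCM, and `process_iter`/`finish` return `(tokens, processed_upto)` instead of a bare token list; on the VAD-only branch, `processed_upto` falls back to `last_input_audio_stream_end_time`, so the value still advances when nothing is transcribed. A sketch of the updated calling convention (the `drive` helper and its arguments are hypothetical; it assumes an already-constructed `VACOnlineASRProcessor` and an iterable of float32 PCM chunks):

    import numpy as np
    from typing import Iterable

    def drive(vac_proc, chunks: Iterable[np.ndarray], sampling_rate: int = 16000):
        """Feed PCM chunks through the updated two-value API and print the lag."""
        stream_time = 0.0
        for chunk in chunks:
            stream_time += len(chunk) / sampling_rate
            vac_proc.insert_audio_chunk(chunk, stream_time)   # new second argument
            tokens, processed_upto = vac_proc.process_iter()  # now returns a tuple
            # On a silent chunk tokens == [], but processed_upto still tracks
            # the last input stream time, so the lag readout keeps updating.
            print(f"lag: {stream_time - processed_upto:.2f}s  new tokens: {len(tokens)}")
        return vac_proc.finish()  # finish() also returns (tokens, processed_upto)

Returning a high-water mark from every path, including the no-speech branch, is what allows AudioProcessor's `candidate_end_times` to keep growing while the microphone is silent.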