user4-33 committed on
Commit
c62a089
·
1 Parent(s): aa7cfd7

dynamic sample rate (#1)

Browse files

- disable autoplay (a7806663ddacb0a6a58b7bdab296d05309239020)
- Support dynamic sample rates (abfb82b92ca57b41ad8001a7e16e54d78e60d5a0)
- Remove unused function (b80bb2398d4d4dca208626a14581afd954465a65)
- Upload file always to wav (fix m4a bug) (b38ade9597e00e1b3845c8780d5f25ceb792228d)

app.py CHANGED
@@ -50,7 +50,8 @@ def process_with_live_transcript(
50
  sample_stem,
51
  stt_model,
52
  last_sample_stem,
53
- ):
 
54
  """Generator that runs the offline pipeline in real time (chunked): enhanced audio and
55
  both transcripts stream from the first chunk so playback and transcription start immediately."""
56
  progress_state = {}
@@ -60,10 +61,11 @@ def process_with_live_transcript(
60
  try:
61
  result_holder["result"] = run_offline_pipeline_streaming(
62
  input_array,
 
63
  enhancement_level,
64
  sample_stem,
65
  stt_model,
66
- progress_state,
67
  )
68
  except Exception as e:
69
  result_holder["error"] = e
@@ -153,6 +155,7 @@ with gr.Blocks() as demo:
153
  input_array = gr.State()
154
  enhanced_array = gr.State()
155
  precomputed_noisy_transcript = gr.State("")
 
156
 
157
  gr.HTML(
158
  '<a href="https://ai-coustics.com/" target="_blank">'
@@ -232,14 +235,14 @@ with gr.Blocks() as demo:
232
  choices=ALL_FILES, value="en_00412_i_h_36", label="Sample"
233
  )
234
  audio_file_from_dataset = gr.Audio(
235
- type="filepath", interactive=False, buttons=["download"], autoplay=True
236
  )
237
 
238
  with gr.Tab("Upload local file") as upload_tab:
239
  with gr.Row():
240
  gr.Markdown(open("docs/local_file.md", "r", encoding="utf-8").read())
241
  audio_file_upload = gr.Audio(
242
- type="filepath", sources=["upload"], buttons=["download"], autoplay=True
243
  )
244
 
245
  enhance_btn = gr.Button("Enhance with Quail Voice Focus 2.0", scale=2, visible=False)
@@ -250,7 +253,7 @@ with gr.Blocks() as demo:
250
  type="numpy",
251
  interactive=False,
252
  buttons=["download"],
253
- autoplay=True,
254
  )
255
 
256
  with gr.Row(equal_height=True, elem_classes="results-row"):
@@ -281,8 +284,8 @@ with gr.Blocks() as demo:
281
  def load_dataset_sample_on_tab_visit(dropdown_value):
282
  """Load the selected sample when visiting the Dataset tab; use default if dropdown is empty."""
283
  sample_id = dropdown_value or DEFAULT_DATASET_SAMPLE
284
- audio_path, arr, stem = load_file_from_dataset(sample_id)
285
- return sample_id, audio_path, arr, stem
286
 
287
  stream_tab.select(
288
  lambda: (
@@ -325,7 +328,7 @@ with gr.Blocks() as demo:
325
  ).then(
326
  load_dataset_sample_on_tab_visit,
327
  inputs=[dataset_dropdown],
328
- outputs=[dataset_dropdown, audio_file_from_dataset, input_array, sample_stem],
329
  )
330
 
331
  stt_model.change(
@@ -379,7 +382,7 @@ with gr.Blocks() as demo:
379
  ).then(
380
  load_file_from_dataset,
381
  inputs=dataset_dropdown,
382
- outputs=[audio_file_from_dataset, input_array, sample_stem],
383
  )
384
 
385
  # Uploading a local file triggers loading the audio file and hiding results until enhancement
@@ -390,13 +393,13 @@ with gr.Blocks() as demo:
390
  ).then(
391
  load_local_file,
392
  inputs=[audio_file_upload],
393
- outputs=[input_array, sample_stem]
394
  )
395
 
396
  # Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
397
  enhance_btn.click(
398
  process_with_live_transcript,
399
- inputs=[input_array, enhancement_level, sample_stem, stt_model, last_sample_stem],
400
  outputs=[
401
  results_card,
402
  enhanced_audio,
 
50
  sample_stem,
51
  stt_model,
52
  last_sample_stem,
53
+ current_sample_rate,
54
+ ):
55
  """Generator that runs the offline pipeline in real time (chunked): enhanced audio and
56
  both transcripts stream from the first chunk so playback and transcription start immediately."""
57
  progress_state = {}
 
61
  try:
62
  result_holder["result"] = run_offline_pipeline_streaming(
63
  input_array,
64
+ current_sample_rate,
65
  enhancement_level,
66
  sample_stem,
67
  stt_model,
68
+ progress_state
69
  )
70
  except Exception as e:
71
  result_holder["error"] = e
 
155
  input_array = gr.State()
156
  enhanced_array = gr.State()
157
  precomputed_noisy_transcript = gr.State("")
158
+ current_sample_rate = gr.State(16000) # default sample rate for dataset samples; updated on local file load if different
159
 
160
  gr.HTML(
161
  '<a href="https://ai-coustics.com/" target="_blank">'
 
235
  choices=ALL_FILES, value="en_00412_i_h_36", label="Sample"
236
  )
237
  audio_file_from_dataset = gr.Audio(
238
+ type="filepath", interactive=False, buttons=["download"], autoplay=False
239
  )
240
 
241
  with gr.Tab("Upload local file") as upload_tab:
242
  with gr.Row():
243
  gr.Markdown(open("docs/local_file.md", "r", encoding="utf-8").read())
244
  audio_file_upload = gr.Audio(
245
+ type="filepath", sources=["upload"], buttons=["download"], autoplay=False, format= "wav"
246
  )
247
 
248
  enhance_btn = gr.Button("Enhance with Quail Voice Focus 2.0", scale=2, visible=False)
 
253
  type="numpy",
254
  interactive=False,
255
  buttons=["download"],
256
+ autoplay=False,
257
  )
258
 
259
  with gr.Row(equal_height=True, elem_classes="results-row"):
 
284
  def load_dataset_sample_on_tab_visit(dropdown_value):
285
  """Load the selected sample when visiting the Dataset tab; use default if dropdown is empty."""
286
  sample_id = dropdown_value or DEFAULT_DATASET_SAMPLE
287
+ audio_path, arr, stem, sample_rate = load_file_from_dataset(sample_id)
288
+ return sample_id, audio_path, arr, stem, sample_rate
289
 
290
  stream_tab.select(
291
  lambda: (
 
328
  ).then(
329
  load_dataset_sample_on_tab_visit,
330
  inputs=[dataset_dropdown],
331
+ outputs=[dataset_dropdown, audio_file_from_dataset, input_array, sample_stem, current_sample_rate],
332
  )
333
 
334
  stt_model.change(
 
382
  ).then(
383
  load_file_from_dataset,
384
  inputs=dataset_dropdown,
385
+ outputs=[audio_file_from_dataset, input_array, sample_stem, current_sample_rate],
386
  )
387
 
388
  # Uploading a local file triggers loading the audio file and hiding results until enhancement
 
393
  ).then(
394
  load_local_file,
395
  inputs=[audio_file_upload],
396
+ outputs=[input_array, sample_stem, current_sample_rate]
397
  )
398
 
399
  # Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
400
  enhance_btn.click(
401
  process_with_live_transcript,
402
+ inputs=[input_array, enhancement_level, sample_stem, stt_model, last_sample_stem, current_sample_rate],
403
  outputs=[
404
  results_card,
405
  enhanced_audio,
offline_pipeline.py CHANGED
@@ -1,210 +1,14 @@
1
  import os
2
- from concurrent.futures import ThreadPoolExecutor
3
- from typing import Optional
4
 
5
  import gradio as gr
6
- import librosa
7
  from sdk import SDKWrapper
8
- from utils import spec_image, compute_wer, transcribe_audio, to_gradio_audio
9
  from hf_dataset_utils import get_audio, get_transcript
10
- from constants import DEFAULT_SR, APP_TMP_DIR, STREAMER_CLASSES
11
  import numpy as np
12
 
13
 
14
- def retrieve_audio_information(
15
- original_array: np.ndarray,
16
- enhanced_array: np.ndarray,
17
- sample_id: str,
18
- stt_model: str,
19
- noisy_transcript: str,
20
- progress_state: Optional[dict] = None,
21
- ) -> tuple[str, str, str, str, str, str]:
22
- """Build spectrograms, transcribe enhanced audio, and compute WER. Caller must supply
23
- noisy_transcript (transcription of original_array) so STT on the original is never run here.
24
- If progress_state is provided, progress_state['enhanced'] is updated with partial transcript as enhanced STT streams."""
25
- if original_array is None or enhanced_array is None:
26
- raise ValueError("Audio arrays are not available.")
27
- noisy_spec_path = f"{APP_TMP_DIR}/{sample_id}_noisy_spectrogram.png"
28
- enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
29
- spec_image(original_array).save(noisy_spec_path)
30
- spec_image(enhanced_array).save(enhanced_spec_path)
31
- on_enhanced = (lambda t: progress_state.__setitem__("enhanced", t)) if progress_state is not None else None
32
- enhanced_transcript = transcribe_audio(
33
- enhanced_array,
34
- DEFAULT_SR,
35
- stt_model,
36
- stream_name=f"{sample_id}_enhanced",
37
- on_update=on_enhanced,
38
- )
39
- try:
40
- original_transcript = get_transcript(sample_id)
41
- wer_enhanced = compute_wer(original_transcript, enhanced_transcript)
42
- wer_noisy = compute_wer(original_transcript, noisy_transcript)
43
- enhanced_transcript += f" (WER: {wer_enhanced * 100:.2f}%)"
44
- noisy_transcript += f" (WER: {wer_noisy * 100:.2f}%)"
45
- except Exception:
46
- original_transcript = "Unavailable"
47
- return enhanced_spec_path, noisy_spec_path, enhanced_transcript, original_transcript, noisy_transcript, sample_id
48
-
49
-
50
- def denoise_and_transcribe_noisy_parallel(
51
- sample_16k: np.ndarray,
52
- enhancement_level: float,
53
- sample_id: str,
54
- stt_model: str,
55
- progress_state: Optional[dict] = None,
56
- ) -> tuple[np.ndarray | None, tuple[int, np.ndarray] | None, str]:
57
- """Run denoising and noisy transcription in parallel. Returns (enhanced_array, gradio_audio, noisy_transcript).
58
- If progress_state is provided, progress_state['noisy'] is updated with partial transcript as noisy STT streams."""
59
- if sample_16k is None:
60
- raise ValueError("No audio to enhance. Please upload a file first.")
61
- on_noisy = (lambda t: progress_state.__setitem__("noisy", t)) if progress_state is not None else None
62
- with ThreadPoolExecutor(max_workers=2) as executor:
63
- future_denoise = executor.submit(
64
- _denoise_audio_impl, sample_16k, float(enhancement_level) / 100.0
65
- )
66
- future_noisy = executor.submit(
67
- transcribe_audio,
68
- sample_16k,
69
- DEFAULT_SR,
70
- stt_model,
71
- stream_name=f"{sample_id}_noisy",
72
- on_update=on_noisy,
73
- )
74
- enhanced_array, gradio_enhanced_audio = future_denoise.result()
75
- noisy_transcript = future_noisy.result()
76
- return enhanced_array, gradio_enhanced_audio, noisy_transcript
77
-
78
-
79
- def _denoise_audio_impl(
80
- sample_16k: np.ndarray, enhancement_level_frac: float
81
- ) -> tuple[np.ndarray, tuple[int, np.ndarray]]:
82
- """Run the enhancer on a single array and return both the array and Gradio-ready audio.
83
-
84
- Used by denoise_and_transcribe_noisy_parallel so the denoise step can run in a
85
- thread without Gradio UI calls. Expects enhancement_level_frac in [0, 1] (not percent).
86
- Returns (enhanced_array, (sample_rate, numpy_array)) for Gradio Audio.
87
- """
88
- sdk = SDKWrapper()
89
- sdk.init_processor(
90
- sample_rate=DEFAULT_SR,
91
- enhancement_level=enhancement_level_frac,
92
- )
93
- enhanced_array = sdk.process_sync(sample_16k)
94
- gradio_enhanced_audio = to_gradio_audio(enhanced_array, DEFAULT_SR)
95
- return enhanced_array, gradio_enhanced_audio
96
-
97
-
98
- def denoise_audio(
99
- sample_16k: np.ndarray,
100
- enhancement_level: float = 50.0,
101
- ) -> tuple[np.ndarray | None , tuple[int, np.ndarray]| None]:
102
- """Enhance-only entry point: run the SDK on the given audio and return enhanced array + Gradio audio.
103
-
104
- The main app uses denoise_and_transcribe_noisy_parallel (denoise + noisy STT in parallel)
105
- instead. This function remains for backward compatibility, scripts, or any caller that
106
- needs only enhancement without transcription (e.g. backup flows, tests).
107
- """
108
- if sample_16k is None:
109
- raise ValueError("No audio to enhance. Please upload a file first.")
110
- try:
111
- sdk = SDKWrapper()
112
- sdk.init_processor(sample_rate=DEFAULT_SR, enhancement_level=float(enhancement_level) / 100.0)
113
- enhanced_array = sdk.process_sync(sample_16k)
114
- except Exception as e:
115
- gr.Warning(f"{e}")
116
- raise e
117
- gradio_enhanced_audio = to_gradio_audio(enhanced_array, DEFAULT_SR)
118
- return enhanced_array, gradio_enhanced_audio
119
-
120
-
121
- def run_offline_pipeline_ordered(
122
- sample_16k: np.ndarray,
123
- enhancement_level: float,
124
- sample_id: str,
125
- stt_model: str,
126
- progress_state: dict,
127
- ) -> tuple[str, str, str, tuple[int, np.ndarray], str, np.ndarray, str]:
128
- """Run pipeline in UI order: 1) Denoise only. 2) When done, set progress_state['enhanced_spec_path']
129
- and progress_state['enhanced_audio']. 3) Run noisy STT and enhanced STT in parallel (both stream via
130
- progress_state['noisy'] and progress_state['enhanced']). 4) Return final transcripts with WER.
131
-
132
- Returns: (enhanced_spec_path, enhanced_transcript, noisy_transcript_with_wer, enhanced_audio,
133
- last_stem, enhanced_array, precomputed_noisy).
134
- """
135
- if sample_16k is None:
136
- raise ValueError("No audio to enhance. Please upload a file first.")
137
- # 1) Denoise only
138
- enhanced_array, gradio_enhanced_audio = _denoise_audio_impl(
139
- sample_16k, float(enhancement_level) / 100.0
140
- )
141
- # 2) As soon as enhanced audio is ready: build enhanced spectrogram and expose to UI
142
- enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
143
- spec_image(enhanced_array).save(enhanced_spec_path)
144
- progress_state["enhanced_spec_path"] = enhanced_spec_path
145
- progress_state["enhanced_audio"] = gradio_enhanced_audio
146
- # 3) Noisy and enhanced transcription starting both at the same time (parallel, both with on_update).
147
- # Sync so the UI shows both boxes updating together: only write to noisy/enhanced once both have sent at least one update.
148
- progress_state["noisy_pending"] = ""
149
- progress_state["enhanced_pending"] = ""
150
- progress_state["noisy_has_sent"] = False
151
- progress_state["enhanced_has_sent"] = False
152
-
153
- def _flush_both():
154
- if progress_state.get("noisy_has_sent") and progress_state.get("enhanced_has_sent"):
155
- progress_state["noisy"] = progress_state.get("noisy_pending", "")
156
- progress_state["enhanced"] = progress_state.get("enhanced_pending", "")
157
-
158
- def on_noisy(t: str):
159
- progress_state["noisy_pending"] = t
160
- progress_state["noisy_has_sent"] = True
161
- _flush_both()
162
-
163
- def on_enhanced(t: str):
164
- progress_state["enhanced_pending"] = t
165
- progress_state["enhanced_has_sent"] = True
166
- _flush_both()
167
-
168
- with ThreadPoolExecutor(max_workers=2) as executor:
169
- future_noisy = executor.submit(
170
- transcribe_audio,
171
- sample_16k,
172
- DEFAULT_SR,
173
- stt_model,
174
- stream_name=f"{sample_id}_noisy",
175
- on_update=on_noisy,
176
- )
177
- future_enhanced = executor.submit(
178
- transcribe_audio,
179
- enhanced_array,
180
- DEFAULT_SR,
181
- stt_model,
182
- stream_name=f"{sample_id}_enhanced",
183
- on_update=on_enhanced,
184
- )
185
- noisy_transcript = future_noisy.result()
186
- enhanced_transcript = future_enhanced.result()
187
- # 4) WER and final strings
188
- precomputed_noisy = noisy_transcript
189
- try:
190
- original_transcript = get_transcript(sample_id)
191
- wer_enhanced = compute_wer(original_transcript, enhanced_transcript)
192
- wer_noisy = compute_wer(original_transcript, noisy_transcript)
193
- enhanced_transcript += f" (WER: {wer_enhanced * 100:.2f}%)"
194
- noisy_transcript += f" (WER: {wer_noisy * 100:.2f}%)"
195
- except Exception:
196
- original_transcript = "Unavailable"
197
- return (
198
- enhanced_spec_path,
199
- enhanced_transcript,
200
- noisy_transcript,
201
- gradio_enhanced_audio,
202
- sample_id,
203
- enhanced_array,
204
- precomputed_noisy,
205
- )
206
-
207
-
208
  def _close_stt_stream(streamer) -> None:
209
  """Signal end-of-stream; streamer type may be Soniox (close_stream) or Deepgram (close)."""
210
  if hasattr(streamer, "close_stream"):
@@ -212,9 +16,9 @@ def _close_stt_stream(streamer) -> None:
212
  else:
213
  streamer.close()
214
 
215
-
216
  def run_offline_pipeline_streaming(
217
- sample_16k: np.ndarray,
 
218
  enhancement_level: float,
219
  sample_id: str,
220
  stt_model: str,
@@ -224,13 +28,13 @@ def run_offline_pipeline_streaming(
224
  via progress_state['noisy'] and progress_state['enhanced']. Enhanced audio is returned
225
  only at the end; the app plays it automatically when processing is complete.
226
  Returns same tuple as run_offline_pipeline_ordered."""
227
- if sample_16k is None:
228
  raise ValueError("No audio to enhance. Please upload a file first.")
229
- sample_16k = np.asarray(sample_16k, dtype=np.float32).flatten()
230
 
231
  sdk = SDKWrapper()
232
  sdk.init_processor(
233
- sample_rate=DEFAULT_SR,
234
  enhancement_level=float(enhancement_level) / 100.0,
235
  )
236
  chunk_size = sdk.num_frames
@@ -259,14 +63,14 @@ def run_offline_pipeline_streaming(
259
  if stt_model not in STREAMER_CLASSES:
260
  raise ValueError(f"Unknown STT model: {stt_model}")
261
  StreamerClass = STREAMER_CLASSES[stt_model]
262
- streamer_noisy = StreamerClass(DEFAULT_SR, f"{sample_id}_noisy", on_update=on_noisy)
263
- streamer_enhanced = StreamerClass(DEFAULT_SR, f"{sample_id}_enhanced", on_update=on_enhanced)
264
 
265
  accumulated_enhanced: list[np.ndarray] = []
266
- n = len(sample_16k)
267
 
268
  for i in range(0, n, chunk_size):
269
- raw_chunk = sample_16k[i : i + chunk_size]
270
  if raw_chunk.size < chunk_size:
271
  raw_chunk = np.pad(
272
  raw_chunk,
@@ -292,7 +96,7 @@ def run_offline_pipeline_streaming(
292
  enhanced_transcript = streamer_enhanced.render_tokens(streamer_enhanced.final_tokens, [])
293
 
294
  enhanced_array = np.concatenate(accumulated_enhanced).astype(np.float32)
295
- gradio_enhanced_audio = to_gradio_audio(enhanced_array, DEFAULT_SR)
296
 
297
  enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
298
  spec_image(enhanced_array).save(enhanced_spec_path)
@@ -318,10 +122,9 @@ def run_offline_pipeline_streaming(
318
  precomputed_noisy,
319
  )
320
 
321
-
322
  def load_local_file(
323
  sample_path: str
324
- ) -> tuple[np.ndarray, str]:
325
  if not sample_path or not os.path.exists(sample_path):
326
  gr.Warning("Please upload a valid audio file.")
327
  raise ValueError("Missing audio sample. Please upload an audio sample or use the microphone input.")
@@ -329,18 +132,18 @@ def load_local_file(
329
  gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
330
  raise ValueError("Uploaded file exceeds the 5 MB size limit.")
331
  new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
332
- y_16k, _ = librosa.load(sample_path, sr=DEFAULT_SR, dtype="float32", mono=True)
333
- return y_16k, new_sample_stem
334
 
335
- def load_file_from_dataset(sample_id: str) -> tuple[tuple | None, np.ndarray | None, str]:
336
  if not sample_id:
337
  gr.Warning("Please select a sample from the dropdown.")
338
- return None, None, ""
339
  new_sample_stem = sample_id
340
  try:
341
- y_16k, sr = get_audio(sample_id, prefix="mix")
342
- y_16k_for_gradio = to_gradio_audio(y_16k, sr)
343
  except Exception as e: # Convert to 16-bit PCM for gradio audio component
344
  gr.Warning(f"{e}")
345
  raise e
346
- return y_16k_for_gradio, y_16k, new_sample_stem
 
1
  import os
 
 
2
 
3
  import gradio as gr
4
+ import soundfile as sf
5
  from sdk import SDKWrapper
6
+ from utils import spec_image, compute_wer, to_gradio_audio
7
  from hf_dataset_utils import get_audio, get_transcript
8
+ from constants import APP_TMP_DIR, STREAMER_CLASSES
9
  import numpy as np
10
 
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def _close_stt_stream(streamer) -> None:
13
  """Signal end-of-stream; streamer type may be Soniox (close_stream) or Deepgram (close)."""
14
  if hasattr(streamer, "close_stream"):
 
16
  else:
17
  streamer.close()
18
 
 
19
  def run_offline_pipeline_streaming(
20
+ sample: np.ndarray,
21
+ sample_rate: int,
22
  enhancement_level: float,
23
  sample_id: str,
24
  stt_model: str,
 
28
  via progress_state['noisy'] and progress_state['enhanced']. Enhanced audio is returned
29
  only at the end; the app plays it automatically when processing is complete.
30
  Returns same tuple as run_offline_pipeline_ordered."""
31
+ if sample is None:
32
  raise ValueError("No audio to enhance. Please upload a file first.")
33
+ sample = np.asarray(sample, dtype=np.float32).flatten()
34
 
35
  sdk = SDKWrapper()
36
  sdk.init_processor(
37
+ sample_rate=sample_rate,
38
  enhancement_level=float(enhancement_level) / 100.0,
39
  )
40
  chunk_size = sdk.num_frames
 
63
  if stt_model not in STREAMER_CLASSES:
64
  raise ValueError(f"Unknown STT model: {stt_model}")
65
  StreamerClass = STREAMER_CLASSES[stt_model]
66
+ streamer_noisy = StreamerClass(sample_rate, f"{sample_id}_noisy", on_update=on_noisy)
67
+ streamer_enhanced = StreamerClass(sample_rate, f"{sample_id}_enhanced", on_update=on_enhanced)
68
 
69
  accumulated_enhanced: list[np.ndarray] = []
70
+ n = len(sample)
71
 
72
  for i in range(0, n, chunk_size):
73
+ raw_chunk = sample[i : i + chunk_size]
74
  if raw_chunk.size < chunk_size:
75
  raw_chunk = np.pad(
76
  raw_chunk,
 
96
  enhanced_transcript = streamer_enhanced.render_tokens(streamer_enhanced.final_tokens, [])
97
 
98
  enhanced_array = np.concatenate(accumulated_enhanced).astype(np.float32)
99
+ gradio_enhanced_audio = to_gradio_audio(enhanced_array, sample_rate)
100
 
101
  enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
102
  spec_image(enhanced_array).save(enhanced_spec_path)
 
122
  precomputed_noisy,
123
  )
124
 
 
125
  def load_local_file(
126
  sample_path: str
127
+ ) -> tuple[np.ndarray, str, int]:
128
  if not sample_path or not os.path.exists(sample_path):
129
  gr.Warning("Please upload a valid audio file.")
130
  raise ValueError("Missing audio sample. Please upload an audio sample or use the microphone input.")
 
132
  gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
133
  raise ValueError("Uploaded file exceeds the 5 MB size limit.")
134
  new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
135
+ y, sample_rate = sf.read(sample_path, dtype="float32", always_2d=False)
136
+ return y, new_sample_stem, sample_rate
137
 
138
+ def load_file_from_dataset(sample_id: str) -> tuple[tuple | None, np.ndarray | None, str, int | None]:
139
  if not sample_id:
140
  gr.Warning("Please select a sample from the dropdown.")
141
+ return None, None, "", None
142
  new_sample_stem = sample_id
143
  try:
144
+ y, sample_rate = get_audio(sample_id, prefix="mix")
145
+ y_for_gradio = to_gradio_audio(y, sample_rate)
146
  except Exception as e: # Convert to 16-bit PCM for gradio audio component
147
  gr.Warning(f"{e}")
148
  raise e
149
+ return y_for_gradio, y, new_sample_stem, sample_rate
stt_streamers/deepgram_streamer.py CHANGED
@@ -78,12 +78,11 @@ class DeepgramStreamer:
78
  """
79
  Returns parameters for the Deepgram V1 URL query string.
80
  """
81
- assert fs_hz == 16000, "Only 16 kHz audio is supported."
82
 
83
  return {
84
  "model": "nova-3", # Recommended general model
85
  "encoding": "linear16", # Corresponds to pcm_s16le
86
- "sample_rate": 16000,
87
  "channels": 1,
88
  "smart_format": "true", # handling punctuation/formatting
89
  "interim_results": "true", # required for non-final updates
 
78
  """
79
  Returns parameters for the Deepgram V1 URL query string.
80
  """
 
81
 
82
  return {
83
  "model": "nova-3", # Recommended general model
84
  "encoding": "linear16", # Corresponds to pcm_s16le
85
+ "sample_rate": fs_hz,
86
  "channels": 1,
87
  "smart_format": "true", # handling punctuation/formatting
88
  "interim_results": "true", # required for non-final updates
stt_streamers/soniox_streamer.py CHANGED
@@ -13,7 +13,6 @@ class SonioxStreamer:
13
  api_key = os.environ.get("SONIOX_API_KEY")
14
  if not api_key:
15
  raise RuntimeError("Missing SONIOX_API_KEY.")
16
- assert fs_hz == 16000, "Only 16 kHz audio is supported."
17
 
18
  self.stream_name = stream_name
19
  self.api_name = "Soniox RT"
@@ -69,7 +68,7 @@ class SonioxStreamer:
69
  "api_key": api_key,
70
  "model": "stt-rt-v3",
71
  "audio_format": "pcm_s16le",
72
- "sample_rate": 16000,
73
  "num_channels": 1,
74
  "language_hints": ["en", "de"],
75
  "language_hints_strict": True,
 
13
  api_key = os.environ.get("SONIOX_API_KEY")
14
  if not api_key:
15
  raise RuntimeError("Missing SONIOX_API_KEY.")
 
16
 
17
  self.stream_name = stream_name
18
  self.api_name = "Soniox RT"
 
68
  "api_key": api_key,
69
  "model": "stt-rt-v3",
70
  "audio_format": "pcm_s16le",
71
+ "sample_rate": fs_hz,
72
  "num_channels": 1,
73
  "language_hints": ["en", "de"],
74
  "language_hints_strict": True,
utils.py CHANGED
@@ -93,43 +93,4 @@ def compute_wer(reference: str, hypothesis: str) -> float:
93
  d[i - 1][j - 1] + cost, # Substitution
94
  )
95
  wer = d[len(ref_words)][len(hyp_words)] / max(len(ref_words), 1)
96
- return wer
97
-
98
-
99
-
100
- def transcribe_audio(
101
- audio_array: np.ndarray,
102
- sr: int,
103
- streamer_type: str = "Deepgram",
104
- stream_name: str = "RAW",
105
- on_update: Optional[Callable[[str], None]] = None,
106
- ):
107
- """
108
- Transcribe an audio array using the specified STT streamer.
109
-
110
- Args:
111
- audio_array (np.ndarray): Audio data array
112
- sr (int): Sample rate of the audio array
113
- streamer_type (str): "Soniox" or "Deepgram"
114
- stream_name (str): Optional label for streamer instance ("RAW", "ENHANCED", etc.)
115
- on_update: Optional callback(text: str) called with partial transcript as results stream in.
116
-
117
- Returns:
118
- str: Final transcript text
119
- """
120
- if sr != DEFAULT_SR:
121
- audio_array = resampy.resample(audio_array, sr, DEFAULT_SR)
122
- sr = DEFAULT_SR
123
-
124
- if streamer_type not in STREAMER_CLASSES:
125
- raise ValueError(
126
- f"Invalid streamer_type '{streamer_type}'. "
127
- f"Choose from: {', '.join(STREAMER_CLASSES.keys())}"
128
- )
129
-
130
- StreamerClass = STREAMER_CLASSES[streamer_type]
131
- streamer = StreamerClass(sr, stream_name, on_update=on_update)
132
-
133
- transcript = streamer.stream_array(audio_array)
134
-
135
- return transcript
 
93
  d[i - 1][j - 1] + cost, # Substitution
94
  )
95
  wer = d[len(ref_words)][len(hyp_words)] / max(len(ref_words), 1)
96
+ return wer