Spaces:

Krish-05
/

fast_rep_voice

Paused

App Files Community

Krish-05 committed on Jul 25, 2025

Commit

a9ca228

verified ·

1 Parent(s): e563ca3

Update stt_module.py

Browse files

Files changed (1) hide show

stt_module.py +37 -82

stt_module.py CHANGED Viewed

@@ -1,85 +1,40 @@
-import os
-import logging
-import asyncio
-from typing import Optional
-from faster_whisper import WhisperModel
-logger = logging.getLogger(__name__)
-# Global model variable for singleton pattern
-_stt_model: Optional[WhisperModel] = None
-_model_initialized = False
-def initialize_stt():
-    """Initializes the Whisper model globally if not already initialized."""
-    global _stt_model, _model_initialized
-    if _model_initialized:
-        logger.info("STT model already initialized.")
-        return True
-    try:
-        logger.info("Loading Whisper model (base) on CPU...")
-        # Explicitly set device to CPU and compute type to int8 for better performance on CPU.
-        # Consider 'tiny' or 'small' for faster inference on limited CPU resources.
-        _stt_model = WhisperModel(
-            "base", # You can try "tiny" or "small" for faster but less accurate results
-            device="cpu",
-            compute_type="int8" # For CPU optimization
-        )
-        _model_initialized = True
-        logger.info("STT model initialized successfully on CPU.")
-        return True
-    except Exception as e:
-        logger.error(f"Failed to initialize STT model: {e}")
-        _model_initialized = False # Mark as failed
-        return False
-def get_stt_model() -> Optional[WhisperModel]:
-    """Returns the initialized STT model, initializing it if necessary."""
-    if not _model_initialized:
-        initialize_stt()
-    return _stt_model
-async def transcribe_audio_file(audio_path: str) -> Optional[str]:
     """
-    Asynchronously transcribes an audio file to text using faster_whisper.
-    Wraps the synchronous faster_whisper transcribe call in an asyncio.to_thread
-    to prevent blocking the FastAPI event loop.
     """
-    model = get_stt_model()
-    if model is None:
-        logger.error("STT model is not loaded. Cannot transcribe audio.")
-        return None
-    if not os.path.exists(audio_path):
-        logger.error(f"Audio file not found for transcription: {audio_path}")
-        return None
-    if os.path.getsize(audio_path) == 0:
-        logger.warning(f"Audio file is empty: {audio_path}")
-        return ""
-    logger.info(f"Starting transcription of {audio_path}...")
-    try:
-        # Run the synchronous transcription in a separate thread
-        segments, info = await asyncio.to_thread(
-            model.transcribe,
-            audio_path,
-            beam_size=5, # Number of beams for beam search, common value
-            vad_filter=True # Use Voice Activity Detection to filter out non-speech segments
-        )
-        text_segments = []
-        for segment in segments:
-            if segment.text.strip():
-                text_segments.append(segment.text.strip())
-        transcribed_text = " ".join(text_segments)
-        logger.info(f"Transcription complete. Detected language: {info.language} with probability {info.language_probability:.4f}. Text: {transcribed_text[:100]}...")
-        return transcribed_text
-    except Exception as e:
-        logger.error(f"Error during audio transcription: {e}", exc_info=True)
-        return None
-def is_model_loaded() -> bool:
-    """Checks if the STT model is loaded and ready."""
-    return _stt_model is not None and _model_initialized

+import threading
+import pydub
+import av
+import streamlit as st # Only imported for st.session_state access in recv method
+from streamlit_webrtc import AudioProcessorBase
+class AudioBufferProcessor(AudioProcessorBase):
     """
+    An audio processor that buffers incoming audio frames.
+    It accumulates audio only when `st.session_state.is_recording` is True.
     """
+    def __init__(self) -> None:
+        self._audio_buffer = pydub.AudioSegment.empty()
+        self._lock = threading.Lock() # Use a lock for thread-safe access to the buffer
+    def recv(self, frame: av.AudioFrame) -> None:
+        """
+        Receives audio frames from the WebRTC stream.
+        If recording is active, appends the frame to the internal buffer.
+        """
+        if st.session_state.is_recording:
+            sound = pydub.AudioSegment(
+                data=frame.to_ndarray().tobytes(),
+                sample_width=frame.format.bytes,
+                frame_rate=frame.sample_rate,
+                channels=len(frame.layout.channels),
+            )
+            sound = sound.set_channels(1).set_frame_rate(16000)
+            with self._lock:
+                self._audio_buffer += sound
+    def get_and_clear_buffered_audio(self) -> pydub.AudioSegment:
+        """
+        Retrieves the accumulated audio and clears the buffer.
+        This method is called when recording stops.
+        """
+        with self._lock:
+            recorded_audio = self._audio_buffer
+            self._audio_buffer = pydub.AudioSegment.empty() # Clear the buffer
+            return recorded_audio