Spaces:

Krish-05
/

fast_rep_voice

Paused

App Files Files Community

Krish-05 committed on Jul 25, 2025

Commit

d4ce0b6

verified ·

1 Parent(s): e1b30b5

Create stt_module.py

Browse files

Files changed (1) hide show

stt_module.py +85 -0

stt_module.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import os
+import logging
+import asyncio
+from typing import Optional
+from faster_whisper import WhisperModel
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+# Process-wide cache for the Whisper model (singleton pattern): initialize_stt()
+# stores the loaded instance here so every caller reuses the same object.
+_stt_model: Optional[WhisperModel] = None
+# Set to True by initialize_stt() only after the model has loaded successfully.
+_model_initialized = False
+def initialize_stt():
+    """Initializes the Whisper model globally if not already initialized."""
+    global _stt_model, _model_initialized
+    if _model_initialized:
+        logger.info("STT model already initialized.")
+        return True
+    try:
+        logger.info("Loading Whisper model (base) on CPU...")
+        # Explicitly set device to CPU and compute type to int8 for better performance on CPU.
+        # Consider 'tiny' or 'small' for faster inference on limited CPU resources.
+        _stt_model = WhisperModel(
+            "base", # You can try "tiny" or "small" for faster but less accurate results
+            device="cpu",
+            compute_type="int8" # For CPU optimization
+        )
+        _model_initialized = True
+        logger.info("STT model initialized successfully on CPU.")
+        return True
+    except Exception as e:
+        logger.error(f"Failed to initialize STT model: {e}")
+        _model_initialized = False # Mark as failed
+        return False
+def get_stt_model() -> Optional[WhisperModel]:
+    """Returns the initialized STT model, initializing it if necessary."""
+    if not _model_initialized:
+        initialize_stt()
+    return _stt_model
+async def transcribe_audio_file(audio_path: str) -> Optional[str]:
+    """
+    Asynchronously transcribes an audio file to text using faster_whisper.
+    Wraps the synchronous faster_whisper transcribe call in an asyncio.to_thread
+    to prevent blocking the FastAPI event loop.
+    """
+    model = get_stt_model()
+    if model is None:
+        logger.error("STT model is not loaded. Cannot transcribe audio.")
+        return None
+    if not os.path.exists(audio_path):
+        logger.error(f"Audio file not found for transcription: {audio_path}")
+        return None
+    if os.path.getsize(audio_path) == 0:
+        logger.warning(f"Audio file is empty: {audio_path}")
+        return ""
+    logger.info(f"Starting transcription of {audio_path}...")
+    try:
+        # Run the synchronous transcription in a separate thread
+        segments, info = await asyncio.to_thread(
+            model.transcribe,
+            audio_path,
+            beam_size=5, # Number of beams for beam search, common value
+            vad_filter=True # Use Voice Activity Detection to filter out non-speech segments
+        )
+        text_segments = []
+        for segment in segments:
+            if segment.text.strip():
+                text_segments.append(segment.text.strip())
+        transcribed_text = " ".join(text_segments)
+        logger.info(f"Transcription complete. Detected language: {info.language} with probability {info.language_probability:.4f}. Text: {transcribed_text[:100]}...")
+        return transcribed_text
+    except Exception as e:
+        logger.error(f"Error during audio transcription: {e}", exc_info=True)
+        return None
+def is_model_loaded() -> bool:
+    """Checks if the STT model is loaded and ready."""
+    return _stt_model is not None and _model_initialized