import torch
from transformers import pipeline
import logging
import tempfile
import os

logger = logging.getLogger(__name__)

# Global STT pipeline, lazily initialized by load_stt_model().
# None means "not loaded yet" or "last load attempt failed".
stt_pipeline = None


def load_stt_model():
    """Load the free Whisper model for speech-to-text.

    Populates the module-level ``stt_pipeline`` global. On failure the
    global is reset to None so callers can detect the failed load and
    retry later; the error is logged rather than raised (best-effort).
    """
    global stt_pipeline
    try:
        logger.info("Loading Whisper-medium STT model...")
        stt_pipeline = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-medium",
            device="cpu",  # CPU-only so the service runs without a GPU
        )
        logger.info("✓ Whisper-medium STT model loaded successfully")
    except Exception:
        # logger.exception records the full traceback, unlike logger.error
        # with a pre-formatted string.
        logger.exception("✗ Failed to load Whisper-medium model")
        stt_pipeline = None


async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
    """
    Convert audio bytes to text using free Whisper model.

    Args:
        audio_bytes: Raw audio file bytes.
        filename: Name of the audio file (currently used for context
            only; the audio is always written to a temporary .wav file).

    Returns:
        Transcribed text, or a fallback apology string when the model
        produced an empty transcription.

    Raises:
        Exception: If the model cannot be loaded or transcription fails.
    """
    global stt_pipeline
    try:
        # Lazy-load the model on first use; retry once if a previous
        # attempt left the global as None.
        if stt_pipeline is None:
            load_stt_model()
            if stt_pipeline is None:
                raise Exception("STT model failed to load")

        logger.info("Converting audio to text using Whisper-medium")

        # The pipeline expects a file path, so persist the bytes to a
        # temporary file. delete=False because the pipeline reopens the
        # path after this context manager closes the handle (required on
        # Windows, where an open file cannot be reopened).
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio:
            temp_audio.write(audio_bytes)
            temp_audio_path = temp_audio.name

        try:
            # Transcribe using Whisper; result is a dict with a "text" key.
            result = stt_pipeline(temp_audio_path)
            transcribed_text = result.get("text", "").strip()

            if not transcribed_text:
                transcribed_text = "Sorry, I couldn't understand the audio."

            logger.info("✓ STT successful: '%s'", transcribed_text)
            return transcribed_text
        finally:
            # Always remove the temporary file, even on transcription error.
            if os.path.exists(temp_audio_path):
                os.unlink(temp_audio_path)

    except Exception as e:
        logger.exception("✗ STT failed")
        # Chain the original exception so the root cause is preserved.
        raise Exception(f"Speech-to-text conversion failed: {str(e)}") from e