import logging
import os
import tempfile

from transformers import pipeline
import librosa
import numpy as np

logger = logging.getLogger(__name__)

# Lazily-initialized Whisper ASR pipeline; populated by load_stt_model().
stt_pipeline = None


def load_stt_model():
    """Load the Whisper-base ASR pipeline into the module-level cache.

    On failure, logs the error and leaves ``stt_pipeline`` as ``None`` so
    callers can detect that the model is unavailable.
    """
    global stt_pipeline
    try:
        logger.info("Loading Whisper-base STT model...")
        stt_pipeline = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-base",
            device="cpu",  # Use "cuda" if GPU available
            chunk_length_s=30,
        )
        logger.info("✓ Whisper STT model loaded successfully")
    except Exception as e:
        # Lazy %-style args instead of f-strings in logging calls.
        logger.error("✗ Failed to load STT model: %s", e)
        stt_pipeline = None


async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
    """
    Convert audio bytes to text using Whisper.
    Handles WAV, MP3, M4A formats automatically.

    Args:
        audio_bytes: Raw bytes of the uploaded audio file.
        filename: Original filename; its extension is reused for the temp
            file so the audio decoder gets a format hint.

    Returns:
        The transcribed text, or a placeholder string for silent /
        unrecognizable audio.

    Raises:
        Exception: If the model cannot be loaded or transcription fails.
    """
    try:
        if stt_pipeline is None:
            load_stt_model()
            if stt_pipeline is None:
                raise Exception("STT model not loaded")

        # BUG FIX: the original logged a literal placeholder — the f-string
        # had no {filename} interpolation. Log the actual filename.
        logger.info("STT: Converting audio file '%s'", filename)

        # BUG FIX: suffix was hard-coded to '.wav' even for MP3/M4A input.
        # Keep the caller's extension as a decoder hint; default to .wav.
        suffix = os.path.splitext(filename)[1] or ".wav"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(audio_bytes)
            tmp_path = tmp.name

        try:
            # Whisper expects 16 kHz mono; librosa resamples on load.
            audio, sr = librosa.load(tmp_path, sr=16000)

            result = stt_pipeline(audio, generate_kwargs={"language": "english"})
            text = result["text"].strip()

            if not text:
                text = "[Silent audio or unrecognizable speech]"

            logger.info("✓ STT Success: '%s'", text)
            return text
        finally:
            # Always clean up the temp file, even when transcription fails.
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)

    except Exception as e:
        logger.error("✗ STT Error: %s", e)
        # Chain the original cause so the traceback stays debuggable.
        raise Exception(f"STT failed: {str(e)}") from e