import logging
import os
import subprocess
import tempfile

import torch
from transformers import pipeline

logger = logging.getLogger(__name__)

# Global STT pipeline. Stays None until load_stt_model() succeeds, and is
# reset to None if loading fails.
stt_pipeline = None


def load_stt_model():
    """Load the free Whisper model for speech-to-text.

    Builds an ``automatic-speech-recognition`` pipeline for
    ``openai/whisper-medium`` on the CPU and stores it in the module-level
    ``stt_pipeline``. A missing ffmpeg only produces a warning; loading is
    still attempted.

    This function never raises: any failure is logged and leaves
    ``stt_pipeline`` set to ``None`` so callers can detect it.
    """
    global stt_pipeline
    try:
        # Check if ffmpeg is available
        if not check_ffmpeg():
            logger.warning("ffmpeg not found. STT may not work properly.")

        logger.info("Loading Whisper-medium STT model...")
        stt_pipeline = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-medium",
            device="cpu",
        )
        logger.info("✓ Whisper-medium STT model loaded successfully")
    except Exception as e:
        # Deliberate best-effort: report the failure and leave the pipeline
        # unset instead of propagating.
        logger.error("✗ Failed to load Whisper-medium model: %s", e)
        stt_pipeline = None


def check_ffmpeg():
    """Check if ffmpeg is available.

    Runs ``ffmpeg -version`` and reports whether it completed successfully.

    Returns:
        True if the command exits with status 0; False if it exits non-zero
        or cannot be launched at all (not on PATH, not executable, ...).
    """
    try:
        subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
    except (subprocess.CalledProcessError, OSError):
        # OSError covers FileNotFoundError (binary missing) as well as
        # PermissionError (binary present but not executable); either way
        # ffmpeg is unusable.
        return False
    return True


async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
    """Convert audio bytes to text using the free Whisper model.

    Loads the model on first use, writes the audio to a temporary ``.wav``
    file, transcribes that file and always removes it afterwards.

    Args:
        audio_bytes: Raw contents of the audio file to transcribe.
        filename: Name of the uploaded file. Currently unused; the temporary
            file always gets a ``.wav`` suffix.

    Returns:
        The stripped transcription, or ``"No speech detected in the audio."``
        when the transcription is empty. If ffmpeg is unavailable, or a
        failure message mentions ffmpeg, a human-readable error string is
        returned instead of raising.

    Raises:
        Exception: For any other failure, with the message prefixed by
            ``"Speech-to-text conversion failed: "`` and the original error
            attached as ``__cause__``.
    """
    try:
        if stt_pipeline is None:
            load_stt_model()
        if stt_pipeline is None:
            raise Exception("STT model failed to load")

        # Check ffmpeg again before processing
        if not check_ffmpeg():
            return "Error: ffmpeg is required for audio processing but is not installed. Please install ffmpeg on the server."

        logger.info("Converting audio to text using Whisper-medium")

        # Capture the path before writing so the cleanup below also runs
        # when the write itself fails (e.g. disk full).
        temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
        temp_audio_path = temp_audio.name
        try:
            # Save audio bytes to the temporary file; leaving the ``with``
            # closes (and flushes) it before the pipeline reads it.
            with temp_audio:
                temp_audio.write(audio_bytes)

            # Transcribe using Whisper.
            # NOTE(review): this call is synchronous, so the event loop is
            # blocked while inference runs — confirm that is acceptable for
            # the hosting server.
            result = stt_pipeline(temp_audio_path)
            transcribed_text = result.get("text", "").strip()

            if not transcribed_text:
                transcribed_text = "No speech detected in the audio."

            logger.info("✓ STT successful: '%s'", transcribed_text)
            return transcribed_text
        finally:
            # Clean up temporary file
            if os.path.exists(temp_audio_path):
                os.unlink(temp_audio_path)

    except Exception as e:
        error_text = str(e)
        logger.error("✗ STT failed: %s", error_text)
        if "ffmpeg" in error_text.lower():
            return "Audio processing failed: ffmpeg is required but not installed. Please install ffmpeg on the server."
        raise Exception(f"Speech-to-text conversion failed: {error_text}") from e