# rentbot/stt_handler.py
"""Speech-to-text handling: non-blocking Whisper transcription for async callers.

Loads the Whisper model once at import time (the app is non-functional
without it) and exposes `transcribe_audio_chunk`, which offloads the
blocking transcription call to a thread pool so the asyncio event loop
stays responsive.
"""

import asyncio
import sys
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import whisper

# --- Model Loading ---
# This is a CPU/memory intensive operation, so it's done once when the server starts.
print("Loading Whisper model...")
try:
    # Use a smaller model for faster loading and lower resource usage, ideal for real-time.
    # 'base.en' is a good starting point.
    model = whisper.load_model("base.en")
    print("Whisper model 'base.en' loaded successfully.")
except Exception as e:
    print(f"Error loading Whisper model: {e}")
    # Exit if the model can't be loaded, as the app is non-functional without it.
    # NOTE: use sys.exit rather than the bare `exit()` builtin — `exit` is a
    # site-module convenience and is not guaranteed to exist (e.g. under
    # `python -S` or in frozen/embedded interpreters).
    sys.exit(1)
# --- End Model Loading ---

# We use a thread pool to run the blocking Whisper transcription
# without blocking the main async event loop.
executor = ThreadPoolExecutor(max_workers=4)


def _transcribe(audio_np: np.ndarray) -> str:
    """Run Whisper synchronously on one audio chunk.

    Internal helper intended to run in a worker thread (via `executor`),
    never directly on the event loop.

    Args:
        audio_np: Raw audio samples, assumed to be 16-bit PCM integers.

    Returns:
        The transcribed text, stripped of surrounding whitespace
        ("" if Whisper produced no text).
    """
    # The audio data is 16-bit PCM. Whisper expects 32-bit float.
    # Normalize the audio from integers to the range [-1.0, 1.0].
    audio_float32 = audio_np.astype(np.float32) / 32768.0
    result = model.transcribe(
        audio_float32,
        language="en",
        fp16=False,  # Set to False for CPU-based inference
    )
    return result.get("text", "").strip()


async def transcribe_audio_chunk(audio_chunk: np.ndarray) -> str:
    """Transcribe an audio chunk with Whisper without blocking the event loop.

    Args:
        audio_chunk: Raw 16-bit PCM samples as a NumPy array.

    Returns:
        The transcribed text ("" for an empty chunk).
    """
    if audio_chunk.size == 0:
        return ""

    # get_running_loop() is the correct call inside a coroutine;
    # get_event_loop() here is deprecated since Python 3.10.
    loop = asyncio.get_running_loop()

    # Run the blocking _transcribe function in the thread pool.
    text = await loop.run_in_executor(
        executor,
        _transcribe,
        audio_chunk
    )
    return text