| # rentbot/stt_handler.py | |
| import whisper | |
| import numpy as np | |
| import asyncio | |
| from concurrent.futures import ThreadPoolExecutor | |
# --- Model Loading ---
# Loading the model is CPU/memory intensive, so it happens exactly once at
# import time (i.e. when the server process starts).
print("Loading Whisper model...")
try:
    # 'base.en' trades a little accuracy for fast loading and a small
    # footprint — a good fit for near-real-time transcription.
    model = whisper.load_model("base.en")
    print("Whisper model 'base.en' loaded successfully.")
except Exception as e:
    print(f"Error loading Whisper model: {e}")
    # The app is non-functional without the model, so abort startup.
    # `raise SystemExit(1)` is preferred over the bare `exit()` builtin:
    # it does not depend on the `site` module and reports a NON-ZERO exit
    # status, so process supervisors can tell startup failed.
    raise SystemExit(1)
# --- End Model Loading ---

# Thread pool used to run the blocking Whisper transcription without
# stalling the main asyncio event loop.
executor = ThreadPoolExecutor(max_workers=4)
def _transcribe(audio_np: np.ndarray) -> str:
    """Run the blocking Whisper transcription; executes in a worker thread.

    Args:
        audio_np: Mono audio samples. Integer arrays (e.g. 16-bit PCM) are
            normalized to [-1.0, 1.0]; float arrays are assumed to already
            be normalized and are only cast to float32.

    Returns:
        The transcribed text with surrounding whitespace stripped
        ("" if nothing was recognized).
    """
    # Whisper expects float32 samples in [-1.0, 1.0]. Only rescale when the
    # input is integer PCM — dividing already-normalized float audio by
    # 32768 would silently flatten the signal to near-silence.
    if np.issubdtype(audio_np.dtype, np.integer):
        audio_float32 = audio_np.astype(np.float32) / 32768.0
    else:
        audio_float32 = audio_np.astype(np.float32)
    result = model.transcribe(
        audio_float32,
        language="en",
        fp16=False,  # fp16 is only meaningful on GPU; force fp32 on CPU
    )
    return result.get("text", "").strip()
async def transcribe_audio_chunk(audio_chunk: np.ndarray) -> str:
    """Transcribe an audio chunk with Whisper without blocking the event loop.

    Args:
        audio_chunk: Audio samples to transcribe (see ``_transcribe`` for
            the accepted dtypes).

    Returns:
        The transcribed text, or "" for an empty chunk.
    """
    if audio_chunk.size == 0:
        return ""
    # get_running_loop() is the correct call from inside a coroutine; the
    # deprecated get_event_loop() emits a DeprecationWarning on 3.10+ and
    # could create a brand-new loop when none was running.
    loop = asyncio.get_running_loop()
    # Off-load the blocking Whisper call to the shared thread pool so the
    # event loop stays responsive.
    text = await loop.run_in_executor(executor, _transcribe, audio_chunk)
    return text