# Provenance: rentbot/stt_handler.py — renamed from stt_handler.pyc by mgbam (commit 4587acc, verified)
# rentbot/stt_handler.py
import whisper
import numpy as np
import asyncio
from concurrent.futures import ThreadPoolExecutor
# --- Model Loading ---
# This is a CPU/memory intensive operation, so it's done once when the server starts.
print("Loading Whisper model...")
try:
    # Use a smaller model for faster loading and lower resource usage, ideal for real-time.
    # 'base.en' is a good starting point.
    model = whisper.load_model("base.en")
    print("Whisper model 'base.en' loaded successfully.")
except Exception as e:
    print(f"Error loading Whisper model: {e}")
    # Abort startup: the app is non-functional without the model.
    # `raise SystemExit` is preferred over the bare `exit()` builtin, which is a
    # site-module convenience and not guaranteed to exist (e.g. `python -S`,
    # embedded interpreters); `from e` keeps the causal traceback.
    raise SystemExit(1) from e
# --- End Model Loading ---

# We use a thread pool to run the blocking Whisper transcription
# without blocking the main async event loop.
executor = ThreadPoolExecutor(max_workers=4)
def _transcribe(audio_np: np.ndarray) -> str:
    """
    Synchronously transcribe one audio buffer with the module-level Whisper model.

    Runs inside the thread pool (see `transcribe_audio_chunk`) because
    `model.transcribe` is blocking and CPU-heavy.

    Args:
        audio_np: Mono audio samples. Either 16-bit PCM integers, or float
            samples already normalized to [-1.0, 1.0].

    Returns:
        The transcribed text with surrounding whitespace stripped
        (empty string if Whisper produced no text).
    """
    if np.issubdtype(audio_np.dtype, np.floating):
        # Already float samples — only coerce to the float32 dtype Whisper
        # expects; dividing floats by 32768 would silence the signal.
        audio_float32 = audio_np.astype(np.float32)
    else:
        # 16-bit PCM: normalize integers to the range [-1.0, 1.0] as float32.
        audio_float32 = audio_np.astype(np.float32) / 32768.0
    result = model.transcribe(
        audio_float32,
        language="en",
        fp16=False,  # fp16 only helps on GPU; keep False for CPU inference
    )
    return result.get("text", "").strip()
async def transcribe_audio_chunk(audio_chunk: np.ndarray) -> str:
    """
    Transcribe an audio chunk using Whisper in a non-blocking way.

    Args:
        audio_chunk: Mono 16-bit PCM samples as a numpy array (see `_transcribe`).

    Returns:
        The transcribed text, or "" for an empty chunk.
    """
    # Nothing to transcribe — skip dispatching an empty job to the pool.
    if audio_chunk.size == 0:
        return ""
    # get_running_loop() is the correct call from inside a coroutine;
    # get_event_loop() is deprecated for this use since Python 3.10.
    loop = asyncio.get_running_loop()
    # Run the blocking _transcribe function in the thread pool so the
    # event loop stays responsive during transcription.
    text = await loop.run_in_executor(
        executor,
        _transcribe,
        audio_chunk,
    )
    return text