# rentbot/stt_handler.py
import asyncio
import sys
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import whisper

# --- Model Loading ---
# Loading the Whisper checkpoint is CPU/memory intensive, so it is done once
# at import time (i.e. when the server starts) rather than per request.
print("Loading Whisper model...")
try:
    # 'base.en' is a small English-only model: fast to load and cheap to run,
    # which suits near-real-time use.
    model = whisper.load_model("base.en")
    print("Whisper model 'base.en' loaded successfully.")
except Exception as e:
    print(f"Error loading Whisper model: {e}")
    # The app is non-functional without the model. Use sys.exit(1) rather than
    # exit(): exit() is a site-module convenience that may be absent under
    # `python -S`, and calling it with no argument reports success (status 0)
    # to supervisors/orchestrators even though startup failed.
    sys.exit(1)
# --- End Model Loading ---

# A thread pool lets the blocking Whisper transcription run off the main
# async event loop.
executor = ThreadPoolExecutor(max_workers=4)
def _transcribe(audio_np: np.ndarray):
    """Blocking transcription helper; meant to run on a worker thread.

    Converts 16-bit PCM integer samples into the float32 range
    [-1.0, 1.0] that Whisper expects, then returns the stripped
    transcript text.
    """
    # Scale int16 PCM into normalized float32 samples.
    samples = audio_np.astype(np.float32) / 32768.0
    # fp16=False: half precision is unsupported for CPU-based inference.
    output = model.transcribe(samples, language="en", fp16=False)
    return output.get("text", "").strip()
async def transcribe_audio_chunk(audio_chunk: np.ndarray) -> str:
    """Transcribe an audio chunk with Whisper without blocking the event loop.

    Args:
        audio_chunk: Raw audio samples (16-bit PCM ints, per _transcribe's
            normalization — confirm against the audio source).

    Returns:
        The transcribed text, or "" for an empty chunk.
    """
    # Nothing to transcribe; skip dispatching a no-op to the thread pool.
    if audio_chunk.size == 0:
        return ""
    # get_running_loop() is the correct call from inside a coroutine;
    # get_event_loop() is deprecated for this use and can create a fresh
    # (wrong) loop when none is running.
    loop = asyncio.get_running_loop()
    # Run the blocking _transcribe function in the thread pool so the
    # event loop stays responsive.
    text = await loop.run_in_executor(
        executor,
        _transcribe,
        audio_chunk,
    )
    return text