malek-messaoudii
Update files
674469e
raw
history blame
2.04 kB
import logging
import tempfile
import os
from transformers import pipeline
import librosa
import numpy as np
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Cached speech-to-text pipeline shared by the functions below. It stays None
# until load_stt_model() succeeds, and is reset to None if loading fails.
stt_pipeline = None
def load_stt_model():
    """Load the Whisper-base speech-recognition pipeline into ``stt_pipeline``.

    Builds an ``automatic-speech-recognition`` pipeline for the
    ``openai/whisper-base`` model on the CPU with 30-second chunking and
    stores it in the module-level ``stt_pipeline`` variable.

    Any failure is logged (with traceback) and leaves ``stt_pipeline`` set to
    ``None`` instead of propagating, so callers must check the global after
    calling this function.
    """
    global stt_pipeline
    try:
        logger.info("Loading Whisper-base STT model...")
        stt_pipeline = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-base",
            device="cpu",  # Use "cuda" if GPU available
            chunk_length_s=30,
        )
        logger.info("✓ Whisper STT model loaded successfully")
    except Exception as e:
        # Deliberately best-effort: the error is recorded together with its
        # traceback and the cache is cleared so a later call can retry.
        logger.exception("✗ Failed to load STT model: %s", e)
        stt_pipeline = None
async def speech_to_text(audio_bytes: bytes, filename: str, language: str = "english") -> str:
    """
    Convert audio bytes to text using Whisper.

    The bytes are written to a temporary file, decoded and resampled to
    16 kHz with librosa, and passed to the cached ``stt_pipeline`` (loaded on
    first use via ``load_stt_model``). The temporary file is removed before
    returning, whether or not transcription succeeds.

    Which container formats (WAV, MP3, M4A, ...) can be decoded depends on
    the audio backends available to librosa in the running environment.

    Args:
        audio_bytes: Raw contents of the audio file.
        filename: Original file name; used for logging and to pick the
            temporary file's extension.
        language: Language passed to Whisper through ``generate_kwargs``.
            Defaults to ``"english"``.

    Returns:
        The stripped transcription, or the placeholder
        ``"[Silent audio or unrecognizable speech]"`` when it is empty.

    Raises:
        Exception: With an ``"STT failed: ..."`` message if the input is
            empty, the model cannot be loaded, or decoding/transcription
            fails. The original error is attached as ``__cause__``.
    """
    # NOTE(review): decoding and inference below run synchronously inside this
    # coroutine (there is no await), so the event loop is blocked meanwhile.
    try:
        if not audio_bytes:
            # Fail fast with a clear message instead of an opaque decoder error.
            raise ValueError("Empty audio data")

        if stt_pipeline is None:
            load_stt_model()
        if stt_pipeline is None:
            raise Exception("STT model not loaded")

        logger.info("STT: Converting audio file '%s'", filename)

        # Keep the upload's real extension so the decoder is not given a
        # misleading ".wav" name; fall back to ".wav" when the extension is
        # missing or contains anything other than ASCII letters and digits.
        ext = os.path.splitext(filename or "")[1]
        suffix = ext if ext.isascii() and ext[1:].isalnum() else ".wav"

        # Save to temporary file (closed before reading so it can be reopened
        # by path on every platform).
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        tmp_path = tmp.name
        try:
            with tmp:
                tmp.write(audio_bytes)

            # Load and resample audio to 16kHz
            audio, _sample_rate = librosa.load(tmp_path, sr=16000)

            # Transcribe
            result = stt_pipeline(audio, generate_kwargs={"language": language})
            text = result["text"].strip()
            if not text:
                text = "[Silent audio or unrecognizable speech]"

            logger.info("✓ STT Success: '%s'", text)
            return text
        finally:
            # Cleanup must never mask the transcription result or error.
            try:
                os.unlink(tmp_path)
            except FileNotFoundError:
                pass
            except OSError as cleanup_error:
                logger.warning(
                    "Could not remove temporary file '%s': %s", tmp_path, cleanup_error
                )
    except Exception as e:
        logger.exception("✗ STT Error: %s", e)
        raise Exception(f"STT failed: {e}") from e