|
|
import logging |
|
|
import tempfile |
|
|
import os |
|
|
from transformers import pipeline |
|
|
import librosa |
|
|
import numpy as np |
|
|
|
|
|
# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)


# Lazily-initialized ASR pipeline cache; populated by load_stt_model()
# on first use and reset to None if loading fails.
stt_pipeline = None
|
|
|
|
|
def load_stt_model():
    """Load the Whisper-base ASR pipeline into the module-level cache.

    On success, ``stt_pipeline`` holds a ready-to-use transformers
    automatic-speech-recognition pipeline; on any failure it is reset to
    ``None`` so callers (see ``speech_to_text``) can detect that the
    model is unavailable. Never raises.
    """
    global stt_pipeline
    try:
        logger.info("Loading Whisper-base STT model...")
        stt_pipeline = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-base",
            device="cpu",  # CPU keeps the service deployable without a GPU
            chunk_length_s=30,  # split long audio into 30-second chunks
        )
        # BUG FIX: original log messages contained mis-encoded emoji
        # characters ("β"); replaced with plain ASCII text.
        logger.info("Whisper STT model loaded successfully")
    except Exception as e:
        # Best-effort: log and leave the pipeline unset rather than crash
        # at startup; lazy %-formatting avoids building the message eagerly.
        logger.error("Failed to load STT model: %s", e)
        stt_pipeline = None
|
|
|
|
|
async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
    """Convert audio bytes to text using Whisper.

    Handles WAV, MP3, M4A formats automatically — librosa decodes by
    content, so the ``.wav`` temp-file suffix does not restrict formats.

    Args:
        audio_bytes: Raw contents of the uploaded audio file.
        filename: Original upload filename, used only for logging.

    Returns:
        The transcribed text, or a placeholder string when the audio is
        silent or unrecognizable.

    Raises:
        Exception: If the model cannot be loaded or transcription fails.

    NOTE(review): the decode + inference below is blocking CPU work inside
    an async function; consider offloading to a thread pool if this runs
    on an event loop serving other requests.
    """
    global stt_pipeline

    try:
        # Lazy-load the model on first call.
        if stt_pipeline is None:
            load_stt_model()
            if stt_pipeline is None:
                raise Exception("STT model not loaded")

        # BUG FIX: original logged the literal text "(unknown)" and never
        # used the filename parameter at all.
        logger.info("STT: Converting audio file '%s'", filename)

        # librosa wants a file path, so spill the bytes to a temp file.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
            tmp.write(audio_bytes)
            tmp_path = tmp.name

        try:
            # Whisper expects 16 kHz audio; sample rate return is unused.
            audio, _ = librosa.load(tmp_path, sr=16000)

            result = stt_pipeline(audio, generate_kwargs={"language": "english"})
            text = result["text"].strip()

            if not text:
                text = "[Silent audio or unrecognizable speech]"

            logger.info("STT Success: '%s'", text)
            return text

        finally:
            # Always delete the temp file, even when transcription fails.
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)

    except Exception as e:
        logger.error("STT Error: %s", e)
        # BUG FIX: chain the cause so the original traceback is preserved
        # for upstream error handlers.
        raise Exception(f"STT failed: {str(e)}") from e