File size: 2,231 Bytes
a8c8142 4a13628 95cb26e 4a13628 a8c8142 d4b6133 a8c8142 4a13628 95cb26e 4a13628 d4b6133 a8c8142 4a13628 a8c8142 4a13628 95cb26e 4a13628 95cb26e a8c8142 95cb26e 4a13628 a8c8142 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import torch
from transformers import pipeline
import logging
import tempfile
import os
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Global STT pipeline: stays None until load_stt_model() assigns the loaded
# pipeline, and is reset to None when loading fails.
stt_pipeline = None
def load_stt_model():
    """Populate the global ``stt_pipeline`` with the Whisper-medium model.

    Builds an "automatic-speech-recognition" pipeline for
    ``openai/whisper-medium`` on the CPU and stores it in the module-level
    ``stt_pipeline``. Any exception raised while loading is logged and
    swallowed; in that case ``stt_pipeline`` is left as ``None`` so callers
    can detect the failure by checking the global.
    """
    global stt_pipeline
    try:
        logger.info("Loading Whisper-medium STT model...")
        whisper = pipeline(
            task="automatic-speech-recognition",
            model="openai/whisper-medium",
            device="cpu",
        )
        stt_pipeline = whisper
        logger.info("✓ Whisper-medium STT model loaded successfully")
    except Exception as load_error:
        # Best-effort load: report the problem and mark the model as absent.
        logger.error(f"✗ Failed to load Whisper-medium model: {str(load_error)}")
        stt_pipeline = None
async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
    """Transcribe an in-memory audio file with the Whisper-medium pipeline.

    The module-level ``stt_pipeline`` is loaded on first use through
    ``load_stt_model()``. The raw bytes are passed straight to the pipeline
    (the transformers ASR pipeline accepts the content of an audio file as
    ``bytes``), so no temporary file is written or left behind.

    Args:
        audio_bytes: Raw contents of the audio file.
        filename: Name of the audio file. Not needed for decoding; kept so
            the signature stays compatible with existing callers.

    Returns:
        The transcription with surrounding whitespace removed, or the fixed
        sentence "Sorry, I couldn't understand the audio." when the model
        returns no text.

    Raises:
        ValueError: If ``audio_bytes`` is empty.
        RuntimeError: If the model cannot be loaded or transcription fails.
            The message starts with "Speech-to-text conversion failed:" and
            the underlying exception is chained as ``__cause__``.
    """
    global stt_pipeline

    # Reject empty input up front instead of loading the model and failing
    # later inside audio decoding with a less helpful message.
    if not audio_bytes:
        raise ValueError("Speech-to-text conversion failed: empty audio input")

    try:
        if stt_pipeline is None:
            load_stt_model()
        # load_stt_model() swallows its own errors, so re-check the global.
        if stt_pipeline is None:
            raise RuntimeError("STT model failed to load")

        logger.info("Converting audio to text using Whisper-medium")

        # NOTE(review): this is a synchronous, CPU-bound call inside a
        # coroutine, so the event loop is blocked while it runs. Consider
        # offloading it (e.g. loop.run_in_executor) once it is confirmed
        # that the pipeline may be used from a worker thread.
        # bytes(...) is a no-op for real bytes and normalises bytearray /
        # memoryview inputs to the type the pipeline checks for.
        result = stt_pipeline(bytes(audio_bytes))

        transcribed_text = result.get("text", "").strip()
        if not transcribed_text:
            transcribed_text = "Sorry, I couldn't understand the audio."

        logger.info("✓ STT successful: '%s'", transcribed_text)
        return transcribed_text
    except Exception as e:
        # Keep the traceback in the log and chain the cause for callers.
        logger.exception("✗ STT failed: %s", e)
        raise RuntimeError(f"Speech-to-text conversion failed: {e}") from e