|
|
import torch |
|
|
from transformers import pipeline |
|
|
import logging |
|
|
import tempfile |
|
|
import os |
|
|
|
|
|
# Module-level logger, named after this module per the logging convention.
logger = logging.getLogger(__name__)








# Lazily-initialized Hugging Face ASR pipeline; populated on first use by
# load_stt_model() and reset to None when loading fails.
stt_pipeline = None
|
|
|
|
|
def load_stt_model():
    """Load the free Whisper model for speech-to-text.

    Populates the module-level global ``stt_pipeline`` with a Hugging Face
    "automatic-speech-recognition" pipeline backed by ``openai/whisper-medium``
    running on CPU.  On failure the global is reset to ``None`` and the error
    is logged with its traceback; no exception propagates to the caller —
    callers are expected to check ``stt_pipeline is None``.
    """
    global stt_pipeline
    try:
        logger.info("Loading Whisper-medium STT model...")
        stt_pipeline = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-medium",
            device="cpu",
        )
        logger.info("β Whisper-medium STT model loaded successfully")
    except Exception as e:
        # Broad catch is deliberate: model download/load can fail in many
        # ways (network, disk, missing weights).  logger.exception records
        # the full traceback; lazy %-args avoid eager string formatting.
        logger.exception("β Failed to load Whisper-medium model: %s", e)
        stt_pipeline = None
|
|
|
|
|
async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
    """
    Convert audio bytes to text using free Whisper model.

    Args:
        audio_bytes: Raw audio file bytes
        filename: Name of the audio file; its extension (when present) is
            used for the temporary file so the decoder can identify the
            container format (falls back to ``.wav``)

    Returns:
        Transcribed text, or a fixed apology message when the model
        produces no text

    Raises:
        Exception: If the model cannot be loaded or transcription fails;
            the original error is chained as the cause.
    """
    global stt_pipeline

    try:
        # Lazily load the model on first use; load_stt_model() swallows
        # its own errors, so re-check the global afterwards.
        if stt_pipeline is None:
            load_stt_model()
            if stt_pipeline is None:
                raise Exception("STT model failed to load")

        logger.info("Converting audio to text using Whisper-medium")

        # Preserve the caller's file extension so the pipeline's audio
        # decoder can detect the format; previously this was hard-coded
        # to '.wav' even for non-WAV uploads.
        suffix = os.path.splitext(filename)[1] or '.wav'

        # The pipeline expects a file path, so spill the bytes to disk.
        # delete=False because the path is reused after the 'with' closes
        # the handle (required on Windows); cleanup happens in 'finally'.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_audio:
            temp_audio.write(audio_bytes)
            temp_audio_path = temp_audio.name

        try:
            # NOTE(review): blocking CPU-bound call inside an async function;
            # consider loop.run_in_executor if this stalls the event loop.
            result = stt_pipeline(temp_audio_path)
            transcribed_text = result.get("text", "").strip()

            if not transcribed_text:
                transcribed_text = "Sorry, I couldn't understand the audio."

            logger.info("β STT successful: '%s'", transcribed_text)
            return transcribed_text

        finally:
            # Always remove the temp file, even when transcription raises.
            if os.path.exists(temp_audio_path):
                os.unlink(temp_audio_path)

    except Exception as e:
        logger.error("β STT failed: %s", e)
        # Chain the original cause so the traceback stays diagnosable.
        raise Exception(f"Speech-to-text conversion failed: {str(e)}") from e