|
|
import requests |
|
|
import logging |
|
|
import tempfile |
|
|
import os |
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
    """
    Convert audio bytes to text using Hugging Face Inference API (free).

    The audio is POSTed as the raw request body to the hosted
    ``openai/whisper-medium`` model. The bearer token is read from the
    ``HF_TOKEN`` environment variable; when that is unset the original
    ``YOUR_HF_TOKEN`` placeholder is sent, as before.

    Args:
        audio_bytes: Raw audio file bytes
        filename: Name of the audio file (only forwarded to ``fallback_stt``)

    Returns:
        Transcribed text on success. On a non-200 response the result of
        ``fallback_stt`` is returned; if no text is obtained, or any
        exception occurs, a fixed apology message is returned instead.
        This function never raises.
    """
    # Function-scope imports keep this block self-contained without
    # touching the module's import section.
    import asyncio
    import functools

    api_url = "https://api-inference.huggingface.co/models/openai/whisper-medium"
    token = os.environ.get("HF_TOKEN", "YOUR_HF_TOKEN")
    headers = {"Authorization": f"Bearer {token}"}
    # Without a timeout, requests waits forever on a stalled connection.
    request_timeout_seconds = 60
    not_understood = "Sorry, I couldn't understand the audio."

    try:
        logger.info("Converting audio to text using Hugging Face API")

        # requests is synchronous: run it in the default thread pool so the
        # event loop stays responsive while the upload/inference is pending.
        # The bytes are sent directly; no temporary file is needed.
        send_request = functools.partial(
            requests.post,
            api_url,
            headers=headers,
            data=audio_bytes,
            timeout=request_timeout_seconds,
        )
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(None, send_request)

        if response.status_code == 200:
            result = response.json()
            # "text" may be missing or null; treat both as "nothing heard".
            transcribed_text = (result.get("text") or "").strip()
            if not transcribed_text:
                return not_understood
            logger.info("✓ STT successful: '%s'", transcribed_text)
            return transcribed_text

        # Any non-200 status (auth failure, model loading, rate limit, ...)
        # is delegated to the fallback path.
        logger.warning(
            "Hugging Face API returned HTTP %s; using fallback STT",
            response.status_code,
        )
        fallback_text = await fallback_stt(audio_bytes, filename)
        return fallback_text or not_understood

    except Exception as e:
        # Deliberate best-effort boundary: callers always get a string.
        logger.exception("✗ STT failed: %s", e)
        return "Sorry, there was an error processing your audio."
|
|
|
|
|
|
|
|
async def fallback_stt(audio_bytes: bytes, filename: str) -> str:
    """Fallback STT used when the primary transcription request fails.

    This is currently a stub: no transcription is attempted and both
    arguments are ignored. The parameters are kept so that a real fallback
    engine can be added later without changing callers.

    Args:
        audio_bytes: Raw audio file bytes (unused).
        filename: Name of the audio file (unused).

    Returns:
        A fixed message saying the transcription service is unavailable.
    """
    # The previous try/except wrapped only this constant return, so its
    # error branch could never run; it has been removed as dead code.
    return "Audio received but transcription service is temporarily unavailable."