malek-messaoudii
Enhance configuration validation and audio processing limits
918acab
raw
history blame
1.93 kB
from services.gemini_client import get_gemini_client
import base64
import mimetypes
import logging
logger = logging.getLogger(__name__)
async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
    """Transcribe audio bytes to text using the Gemini client.

    The audio is base64-encoded and sent inline, together with a short
    transcription prompt, through ``client.models.generate_content``.

    Args:
        audio_bytes: Raw audio file bytes. Must not be empty.
        filename: Name of the audio file; its extension is used to guess the
            MIME type. Falls back to ``audio/wav`` when it cannot be guessed.

    Returns:
        The transcribed text with surrounding whitespace stripped.

    Raises:
        ValueError: If ``audio_bytes`` is empty.
        Exception: If the client call fails or the response carries no text.
            The original error is attached as ``__cause__``.
    """
    # Imported locally so this function does not depend on a file-level import.
    import asyncio

    # Fail fast instead of spending an API call on a payload with no audio.
    if not audio_bytes:
        raise ValueError("Speech-to-text conversion failed: audio data is empty")

    try:
        client = get_gemini_client()

        # Detect MIME type from the filename; tolerate a missing filename.
        mime_type, _ = mimetypes.guess_type(filename or "")
        if mime_type is None:
            mime_type = "audio/wav"  # fallback

        logger.info(f"Converting audio to text (format: {mime_type})")

        # Inline payload is sent as base64 text.
        audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")

        # Content structure: one message with the audio part and the prompt.
        contents = [
            {
                "parts": [
                    {
                        "inline_data": {
                            "mime_type": mime_type,
                            "data": audio_b64,
                        }
                    },
                    {
                        "text": "Transcribe this audio to text."
                    },
                ]
            }
        ]

        # generate_content is a blocking call; run it in a worker thread so
        # the event loop stays free to serve other coroutines meanwhile.
        response = await asyncio.to_thread(
            client.models.generate_content,
            model="gemini-2.0-flash-exp",  # model chosen for multimodal input
            contents=contents,
        )

        # Guard against a response without text rather than crashing with an
        # opaque AttributeError on ``None.strip()``.
        raw_text = response.text
        if raw_text is None:
            raise ValueError("Gemini returned no text for the supplied audio")

        transcribed_text = raw_text.strip()
        logger.info(f"✓ STT successful: '{transcribed_text}'")
        return transcribed_text

    except Exception as e:
        logger.error(f"✗ STT failed: {str(e)}")
        # Chain the cause so the original traceback is preserved for callers.
        raise Exception(f"Speech-to-text conversion failed: {str(e)}") from e