from services.gemini_client import get_gemini_client
import base64
import mimetypes
import logging

logger = logging.getLogger(__name__)


async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
    """
    Convert audio bytes to text using Gemini API.

    The MIME type is guessed from ``filename`` (falling back to
    ``audio/wav`` when it cannot be guessed), the audio is base64-encoded
    and sent inline together with a transcription prompt.

    Args:
        audio_bytes: Raw audio file bytes
        filename: Name of the audio file (used to detect format)

    Returns:
        Transcribed text, stripped of leading/trailing whitespace

    Raises:
        Exception: If transcription fails for any reason (empty input,
            API error, or a response without text). The original error
            is attached as ``__cause__``.
    """
    try:
        # Fail fast with a clear message instead of sending an empty payload.
        if not audio_bytes:
            raise ValueError("audio_bytes is empty")

        client = get_gemini_client()

        # Detect MIME type from filename
        mime_type, _ = mimetypes.guess_type(filename)
        if mime_type is None:
            mime_type = "audio/wav"  # fallback

        logger.info("Converting audio to text (format: %s)", mime_type)

        # Convert audio to base64
        # NOTE(review): the payload is sent as base64 text; confirm the client
        # returned by get_gemini_client() expects base64 here rather than raw
        # bytes.
        audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")

        # Create proper content structure for Gemini
        contents = [
            {
                "parts": [
                    {
                        "inline_data": {
                            "mime_type": mime_type,
                            "data": audio_b64,
                        }
                    },
                    {"text": "Transcribe this audio to text."},
                ]
            }
        ]

        # Call Gemini API
        # NOTE(review): this call is not awaited; if it blocks on network I/O
        # it will stall the event loop for the duration of the request.
        # Consider the client's async API or a worker thread — confirm
        # against the client implementation.
        response = client.models.generate_content(
            model="gemini-2.0-flash-exp",  # Using a model that supports multimodal
            contents=contents,
        )

        # Guard against a missing text payload so the failure message is
        # explicit rather than an AttributeError on None.
        raw_text = response.text
        if raw_text is None:
            raise ValueError("response contained no text")

        transcribed_text = raw_text.strip()
        logger.info("✓ STT successful: '%s'", transcribed_text)

        return transcribed_text

    except Exception as e:
        # Log with traceback, then re-raise with the cause chained so callers
        # and logs can still see the underlying error.
        logger.exception("✗ STT failed: %s", e)
        raise Exception(f"Speech-to-text conversion failed: {e}") from e