import asyncio
import base64
import logging
import mimetypes

from services.gemini_client import get_gemini_client
| logger = logging.getLogger(__name__) | |
async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
    """Transcribe audio to text using the Gemini API.

    Args:
        audio_bytes: Raw audio file bytes; must not be empty.
        filename: Name of the audio file. It is only used to guess the
            MIME type that is sent along with the audio.

    Returns:
        The text returned by the model, stripped of surrounding whitespace.

    Raises:
        ValueError: If ``audio_bytes`` is empty.
        Exception: If the request fails or the response carries no text.
            The underlying error is chained as ``__cause__``.
    """
    # Reject empty payloads up front instead of spending an API call on them.
    if not audio_bytes:
        raise ValueError(
            "Speech-to-text conversion failed: audio input is empty"
        )

    try:
        client = get_gemini_client()

        # Guess the MIME type from the filename; fall back to WAV when the
        # extension is missing or unknown.
        mime_type, _ = mimetypes.guess_type(filename)
        if mime_type is None:
            mime_type = "audio/wav"
        logger.info("Converting audio to text (format: %s)", mime_type)

        # The audio is sent inline as base64 text, next to the instruction.
        audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
        contents = [
            {
                "parts": [
                    {
                        "inline_data": {
                            "mime_type": mime_type,
                            "data": audio_b64,
                        }
                    },
                    {"text": "Transcribe this audio to text."},
                ]
            }
        ]

        # generate_content is called synchronously here; running it in the
        # default executor keeps the event loop free while the request is
        # in flight.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            lambda: client.models.generate_content(
                model="gemini-2.0-flash-exp",  # multimodal-capable model
                contents=contents,
            ),
        )

        # Guard against a response without text: calling .strip() on None
        # would otherwise surface as an opaque AttributeError.
        raw_text = response.text
        if raw_text is None:
            raise ValueError("Gemini response contained no text")

        transcribed_text = raw_text.strip()
        logger.info("✓ STT successful: '%s'", transcribed_text)
        return transcribed_text
    except Exception as e:
        logger.error("✗ STT failed: %s", e)
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Speech-to-text conversion failed: {str(e)}") from e