Spaces:

NLP-Debater-Project
/

FastAPI-Backend-Models

Sleeping

File size: 1,929 Bytes

import asyncio
import base64
import logging
import mimetypes

from services.gemini_client import get_gemini_client

logger = logging.getLogger(__name__)


async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
    """
    Convert audio bytes to text using Gemini API.

    The Gemini client call used here is synchronous, so it is executed in the
    event loop's default executor; other coroutines keep running while the
    transcription request is in flight.

    Args:
        audio_bytes: Raw audio file bytes. Must not be empty.
        filename: Name of the audio file (used to detect format). If it is
            empty or its extension is not recognised, "audio/wav" is assumed.

    Returns:
        Transcribed text with leading/trailing whitespace removed.

    Raises:
        Exception: If transcription fails for any reason (empty input, client
            or API error, or a response that carries no text). The underlying
            error is attached as ``__cause__``.
    """
    try:
        # Fail fast instead of spending an API round-trip on nothing.
        if not audio_bytes:
            raise ValueError("audio_bytes is empty")

        client = get_gemini_client()

        # Detect MIME type from filename; tolerate a missing/empty name
        # (guess_type raises TypeError on None).
        mime_type, _ = mimetypes.guess_type(filename or "")
        if mime_type is None:
            mime_type = "audio/wav"  # fallback

        logger.info("Converting audio to text (format: %s)", mime_type)

        # Convert audio to base64
        # NOTE(review): some Gemini SDK versions expect raw bytes in
        # inline_data.data -- confirm base64 text is what this client wants.
        audio_b64 = base64.b64encode(audio_bytes).decode('utf-8')

        # Create proper content structure for Gemini
        contents = [
            {
                "parts": [
                    {
                        "inline_data": {
                            "mime_type": mime_type,
                            "data": audio_b64
                        }
                    },
                    {
                        "text": "Transcribe this audio to text."
                    }
                ]
            }
        ]

        # Call Gemini API off the event loop: generate_content is a blocking
        # call and would otherwise stall every other request being served.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            lambda: client.models.generate_content(
                model="gemini-2.0-flash-exp",  # Using a model that supports multimodal
                contents=contents,
            ),
        )

        # Surface a clear error rather than an AttributeError on None.strip()
        # when the response contains no text.
        raw_text = response.text
        if raw_text is None:
            raise ValueError("Gemini response contained no text")

        transcribed_text = raw_text.strip()
        logger.info("✓ STT successful: '%s'", transcribed_text)

        return transcribed_text

    except Exception as e:
        logger.error("✗ STT failed: %s", e)
        # Chain explicitly so the root cause stays visible in tracebacks.
        raise Exception(f"Speech-to-text conversion failed: {str(e)}") from e