File size: 1,929 Bytes
73d4f3c
918acab
4a13628
 
 
 
 
c7fc3b6
d4b6133
 
4a13628
 
 
 
 
 
 
 
 
 
 
d4b6133
4a13628
 
 
 
 
 
 
 
 
 
918acab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a13628
 
 
918acab
 
4a13628
 
918acab
4a13628
 
 
 
 
 
918acab
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from services.gemini_client import get_gemini_client
import base64
import mimetypes
import logging

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
    """
    Convert audio bytes to text using Gemini API.

    The audio is base64-encoded and sent as inline data together with a
    fixed transcription prompt. The synchronous Gemini call is executed in
    the event loop's default executor so this coroutine does not block
    other tasks while waiting on the network.

    Args:
        audio_bytes: Raw audio file bytes. Must not be empty.
        filename: Name of the audio file (used to detect format). When the
            MIME type cannot be guessed from it, "audio/wav" is assumed.

    Returns:
        Transcribed text with leading/trailing whitespace stripped.

    Raises:
        ValueError: If ``audio_bytes`` is empty (raised before any API call).
        Exception: If transcription fails for any other reason; the original
            error is attached as ``__cause__``.
    """
    # Function-local imports keep this block self-contained; the module's
    # top-level import list is not modified.
    import asyncio
    import functools

    # Guard clause: there is nothing to transcribe, so fail fast instead of
    # spending an API round-trip on an empty payload.
    if not audio_bytes:
        raise ValueError(
            "Speech-to-text conversion failed: audio_bytes is empty"
        )

    try:
        client = get_gemini_client()

        # Detect MIME type from filename
        mime_type, _ = mimetypes.guess_type(filename)
        if mime_type is None:
            mime_type = "audio/wav"  # fallback

        logger.info(f"Converting audio to text (format: {mime_type})")

        # Convert audio to base64
        # NOTE(review): the payload is passed as base64 *text*; confirm the
        # client returned by get_gemini_client() expects base64 text here
        # rather than raw bytes.
        audio_b64 = base64.b64encode(audio_bytes).decode('utf-8')

        # Create proper content structure for Gemini
        contents = [
            {
                "parts": [
                    {
                        "inline_data": {
                            "mime_type": mime_type,
                            "data": audio_b64,
                        }
                    },
                    {
                        "text": "Transcribe this audio to text."
                    },
                ]
            }
        ]

        # Call Gemini API. generate_content is a blocking call (its result
        # is used without await), so run it in the default executor rather
        # than stalling the event loop.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            functools.partial(
                client.models.generate_content,
                model="gemini-2.0-flash-exp",  # Using a model that supports multimodal
                contents=contents,
            ),
        )

        # response.text may be absent; surface that explicitly instead of
        # an opaque AttributeError from calling .strip() on None.
        raw_text = response.text
        if raw_text is None:
            raise RuntimeError("Gemini response contained no text")

        transcribed_text = raw_text.strip()
        logger.info(f"✓ STT successful: '{transcribed_text}'")

        return transcribed_text

    except Exception as e:
        logger.error(f"✗ STT failed: {str(e)}")
        # Chain the original exception so the root cause stays in tracebacks.
        raise Exception(f"Speech-to-text conversion failed: {str(e)}") from e