File size: 2,440 Bytes
47a3efb
4a13628
95cb26e
 
4a13628
 
 
d4b6133
 
47a3efb
4a13628
 
 
95cb26e
4a13628
 
 
d4b6133
4a13628
47a3efb
4a13628
95cb26e
 
 
 
4a13628
95cb26e
47a3efb
 
 
95cb26e
47a3efb
 
 
 
 
 
 
 
 
 
95cb26e
 
 
 
 
 
 
 
 
 
 
4a13628
 
47a3efb
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import requests
import logging
import tempfile
import os

logger = logging.getLogger(__name__)

async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
    """
    Convert audio bytes to text using the Hugging Face Inference API.

    The audio is posted as the raw request body to the hosted
    ``openai/whisper-medium`` model. The blocking HTTP call is executed in
    the event loop's default executor so other coroutines keep running
    while the request is in flight.

    Args:
        audio_bytes: Raw audio file bytes
        filename: Name of the audio file (only forwarded to ``fallback_stt``)

    Returns:
        The transcribed text on an HTTP 200 response; the result of
        ``fallback_stt`` on any other status code; a fixed "couldn't
        understand" message when the resulting text is empty; a fixed
        error message when any exception occurs. This function does not
        raise.
    """
    # Local import so this block does not depend on the file's import section.
    import asyncio

    api_url = "https://api-inference.huggingface.co/models/openai/whisper-medium"
    # Without a timeout a stalled connection would hang this call forever.
    request_timeout_seconds = 60

    # Read the token from the environment instead of sending a hard-coded
    # placeholder. When HF_TOKEN is unset the request goes out without an
    # Authorization header (the original comment noted the token is optional
    # on the free tier).
    headers = {}
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        headers["Authorization"] = f"Bearer {hf_token}"

    def _post_audio():
        # The bytes are sent directly as the body; no temporary file needed.
        return requests.post(
            api_url,
            headers=headers,
            data=audio_bytes,
            timeout=request_timeout_seconds,
        )

    try:
        logger.info("Converting audio to text using Hugging Face API")

        # requests is synchronous: run it off the event loop thread.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(None, _post_audio)

        if response.status_code != 200:
            logger.warning(
                "Hugging Face API returned HTTP %s; using fallback STT",
                response.status_code,
            )
            fallback_text = await fallback_stt(audio_bytes, filename)
            return fallback_text or "Sorry, I couldn't understand the audio."

        result = response.json()
        # Guard against a payload that is not a JSON object or has a null text.
        raw_text = result.get("text") if isinstance(result, dict) else None
        transcribed_text = (raw_text or "").strip()

        if not transcribed_text:
            logger.warning("STT returned no text")
            return "Sorry, I couldn't understand the audio."

        logger.info("✓ STT successful: '%s'", transcribed_text)
        return transcribed_text

    except Exception as e:
        # Top-level boundary: callers always get a user-facing string back.
        logger.error("✗ STT failed: %s", e)
        return "Sorry, there was an error processing your audio."


async def fallback_stt(audio_bytes: bytes, filename: str) -> str:
    """
    Fallback STT used when the primary transcription request fails.

    No local speech recognition is implemented yet, so the arguments are
    currently unused and a fixed placeholder message is returned.

    Args:
        audio_bytes: Raw audio file bytes (unused for now)
        filename: Name of the audio file (unused for now)

    Returns:
        A placeholder message saying transcription is unavailable.
    """
    # TODO: implement a real local speech recognition fallback here.
    return "Audio received but transcription service is temporarily unavailable."