File size: 2,226 Bytes
e8aa76b
4a13628
95cb26e
 
4a13628
 
 
d4b6133
 
e8aa76b
 
d4b6133
4a13628
e8aa76b
 
 
 
a8c8142
e8aa76b
 
4a13628
e8aa76b
 
4a13628
e8aa76b
 
a8c8142
95cb26e
 
e8aa76b
95cb26e
 
 
 
e8aa76b
 
 
 
 
95cb26e
4a13628
 
e8aa76b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import requests
import logging
import tempfile
import os

logger = logging.getLogger(__name__)

async def speech_to_text(audio_bytes: bytes, filename: str, *, timeout: float = 60.0) -> str:
    """
    Convert audio bytes to text using the Hugging Face Inference API.

    The raw bytes are POSTed as the request body to the hosted
    ``openai/whisper-medium`` model endpoint, so no local ffmpeg is needed.

    Args:
        audio_bytes: Raw audio content, sent unmodified as the request body.
        filename: Original file name; only forwarded to ``fallback_stt``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The stripped ``"text"`` field of the JSON response, or
        "No speech detected in the audio." when that field is empty.
        On a non-200 status or any exception, the result of
        ``fallback_stt(audio_bytes, filename)`` is returned instead.
    """
    # Imported locally so this function brings its own dependencies into scope.
    import asyncio
    import functools

    try:
        logger.info("Converting audio to text using Hugging Face API")

        # Use Hugging Face Inference API (free, no ffmpeg needed)
        API_URL = "https://api-inference.huggingface.co/models/openai/whisper-medium"

        # For Hugging Face Spaces, you might not need an API key for public models
        headers = {}

        # requests.post is a blocking call; running it in the default executor
        # keeps the event loop free to serve other coroutines meanwhile. The
        # explicit timeout prevents a stalled connection from hanging forever
        # (requests applies no timeout unless one is given).
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            functools.partial(
                requests.post,
                API_URL,
                headers=headers,
                data=audio_bytes,
                timeout=timeout,
            ),
        )

        if response.status_code != 200:
            # If API fails, use fallback
            error_msg = f"Hugging Face API error: {response.status_code}"
            logger.error(error_msg)
            return await fallback_stt(audio_bytes, filename)

        result = response.json()
        transcribed_text = result.get("text", "").strip()

        if not transcribed_text:
            transcribed_text = "No speech detected in the audio."

        logger.info(f"✓ STT successful: '{transcribed_text}'")
        return transcribed_text

    except Exception as e:
        # Deliberate best-effort boundary: any failure (network error, timeout,
        # unexpected response shape) degrades to the fallback message.
        logger.error(f"✗ STT failed: {str(e)}")
        return await fallback_stt(audio_bytes, filename)


async def fallback_stt(audio_bytes: bytes, filename: str) -> str:
    """
    Produce a placeholder response when the primary STT path is unavailable.

    No transcription is performed here: the returned text only reports the
    file name, its extension (the part after the last '.', or 'unknown' when
    the name contains no '.'), and the payload size in bytes. If building
    that message raises, the error is logged and a generic failure message
    is returned instead.
    """
    try:
        byte_count = len(audio_bytes)

        # Extension is whatever follows the final dot, if there is one.
        if '.' in filename:
            extension = filename.rsplit('.', 1)[-1]
        else:
            extension = 'unknown'

        summary = f"Audio file '{filename}' ({extension}, {byte_count} bytes) received successfully. For full STT, please ensure ffmpeg is installed or use the Hugging Face API directly."
        return summary

    except Exception as e:
        logger.error(f"Fallback STT also failed: {str(e)}")
        return "Audio processing failed. Please try a different audio format or install ffmpeg."