Spaces:

nihun
/

Translator

Sleeping

File size: 3,365 Bytes

85c18a5

"""

Speech-to-Text Module

=====================

Converts voice/audio input to text using speech recognition.



Functions:

    - transcribe_audio: Convert audio file to text

    - transcribe_from_microphone: Real-time microphone transcription

    - get_language_code: Map a short language code ('en', 'hi') to a full recognition code

"""

import speech_recognition as sr
from pathlib import Path
from typing import Optional, Tuple
import tempfile
from pydub import AudioSegment


def transcribe_audio(
    audio_path: str,
    language: str = "en-US"
) -> Tuple[str, bool]:
    """
    Transcribe an audio file to text using Google Speech Recognition.

    Files whose extension is not ``.wav`` are first converted to a temporary
    WAV file with pydub; that temporary file is removed before returning,
    whether or not recognition succeeded.

    Args:
        audio_path: Path to the audio file.
        language: Language code passed to the recognizer, e.g.
            'en-US' for English (US) or 'hi-IN' for Hindi (India).

    Returns:
        Tuple of (transcribed_text, success_flag)
        - If successful: (text, True)
        - If failed: (error_message, False)

        All errors are reported through the tuple; no exception is
        propagated to the caller.

    Example:
        >>> text, success = transcribe_audio("recording.wav", "en-US")
        >>> if success:
        ...     print(f"You said: {text}")
        ... else:
        ...     print(f"Error: {text}")
    """
    recognizer = sr.Recognizer()
    # Set only when a conversion produced a temp file that we must clean up.
    temp_wav: Optional[Path] = None

    try:
        source_path = Path(audio_path)

        if source_path.suffix.lower() != '.wav':
            # Decode first so that a bad input file never creates a temp file.
            audio = AudioSegment.from_file(str(source_path))

            # Reserve a unique file name, then close our handle *before*
            # exporting: some platforms refuse a second open of the same file.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
                temp_wav = Path(tmp.name)

            # export() hands back the open output file; close it right away so
            # the temp file can be deleted afterwards.
            audio.export(str(temp_wav), format='wav').close()
            wav_path = str(temp_wav)
        else:
            wav_path = str(source_path)

        # Read the whole file. No ambient-noise calibration is done here:
        # on a file source it would consume the start of the recording (and
        # drop any speech in it) while record() does not use the calibrated
        # threshold anyway.
        with sr.AudioFile(wav_path) as source:
            audio_data = recognizer.record(source)

        text = recognizer.recognize_google(audio_data, language=language)
        return text, True

    except sr.UnknownValueError:
        return "Could not understand the audio. Please speak clearly.", False

    except sr.RequestError as e:
        return f"Speech recognition service error: {str(e)}", False

    except Exception as e:
        # Top-level boundary: callers expect an (error_message, False) tuple
        # rather than an exception for any processing failure.
        return f"Error processing audio: {str(e)}", False

    finally:
        # Best-effort cleanup of the converted file; a failure to delete must
        # not mask the transcription result.
        if temp_wav is not None:
            try:
                temp_wav.unlink()
            except OSError:
                pass


def get_language_code(lang: str) -> str:
    """
    Map a short language code to the full code used for speech recognition.

    Args:
        lang: Short language code ('en', 'hi' or 'auto').

    Returns:
        The matching full language code. 'auto' and any code that is not
        in the table resolve to 'en-US'.

    Example:
        >>> get_language_code('en')
        'en-US'
        >>> get_language_code('hi')
        'hi-IN'
    """
    fallback = 'en-US'
    full_codes = dict(
        en='en-US',
        hi='hi-IN',
        auto=fallback,  # "auto" is treated as English
    )
    try:
        return full_codes[lang]
    except KeyError:
        # Unknown short code: fall back to English.
        return fallback