""" Speech-to-Text Module ===================== Converts voice/audio input to text using speech recognition. Functions: - transcribe_audio: Convert audio file to text - transcribe_from_microphone: Real-time microphone transcription """ import speech_recognition as sr from pathlib import Path from typing import Optional, Tuple import tempfile from pydub import AudioSegment def transcribe_audio( audio_path: str, language: str = "en-US" ) -> Tuple[str, bool]: """ Transcribe audio file to text using Google Speech Recognition. Supports various audio formats (WAV, MP3, etc.) and converts them automatically for processing. Args: audio_path: Path to the audio file language: Language code for recognition - 'en-US' for English (US) - 'hi-IN' for Hindi (India) Returns: Tuple of (transcribed_text, success_flag) - If successful: (text, True) - If failed: (error_message, False) Example: >>> text, success = transcribe_audio("recording.wav", "en-US") >>> if success: ... print(f"You said: {text}") ... else: ... print(f"Error: {text}") Supported Formats: - WAV (recommended) - MP3 - FLAC - OGG """ recognizer = sr.Recognizer() try: # Convert audio to WAV format if needed audio_path = Path(audio_path) if audio_path.suffix.lower() != '.wav': # Convert to WAV using pydub audio = AudioSegment.from_file(str(audio_path)) # Create temporary WAV file with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp: wav_path = tmp.name audio.export(wav_path, format='wav') else: wav_path = str(audio_path) # Load audio file for recognition with sr.AudioFile(wav_path) as source: # Adjust for ambient noise recognizer.adjust_for_ambient_noise(source, duration=0.5) # Record the audio audio_data = recognizer.record(source) # Perform speech recognition text = recognizer.recognize_google(audio_data, language=language) return text, True except sr.UnknownValueError: return "Could not understand the audio. Please speak clearly.", False except sr.RequestError as e: return f"Speech recognition service error: {str(e)}", False except Exception as e: return f"Error processing audio: {str(e)}", False def get_language_code(lang: str) -> str: """ Convert short language code to full speech recognition code. Args: lang: Short language code ('en' or 'hi') Returns: Full language code for speech recognition Example: >>> get_language_code('en') 'en-US' >>> get_language_code('hi') 'hi-IN' """ language_map = { 'en': 'en-US', 'hi': 'hi-IN', 'auto': 'en-US' # Default to English for auto } return language_map.get(lang, 'en-US')