Spaces:
Sleeping
Sleeping
| """Legacy compatibility functions for STT functionality.""" | |
| import logging | |
| from pathlib import Path | |
| from typing import Union | |
| from .provider_factory import STTProviderFactory | |
| from ...domain.models.audio_content import AudioContent | |
| from ...domain.exceptions import SpeechRecognitionException | |
| logger = logging.getLogger(__name__) | |
def transcribe_audio(audio_path: Union[str, Path], model_name: str = "whisper") -> str:
    """
    Transcribe an audio file to text through an STT provider (legacy interface).

    Kept so that callers of the original utils/stt.py interface continue to work.
    The file is read as raw bytes and wrapped in an AudioContent carrying
    placeholder metadata (16000 Hz sample rate, size-based duration estimate);
    the provider's default model is then used for the transcription.

    Args:
        audio_path: Path to input audio file
        model_name: Name of the STT model/provider to use (whisper)

    Returns:
        str: The ``text`` attribute of the provider's transcription result

    Raises:
        SpeechRecognitionException: If transcription fails for any reason,
            including a missing input file
    """
    logger.info(f"Starting transcription for: {audio_path} using {model_name} model")

    try:
        source = Path(audio_path)
        if not source.exists():
            raise SpeechRecognitionException(f"Audio file not found: {source}")

        raw_bytes = source.read_bytes()

        # Unrecognised extensions are labelled 'wav' rather than rejected.
        extension = source.suffix.lower().lstrip('.')
        detected_format = extension if extension in ['wav', 'mp3', 'flac', 'ogg'] else 'wav'

        # Fields common to both construction attempts below.
        shared_fields = {
            'data': raw_bytes,
            'format': detected_format,
            'sample_rate': 16000,  # Standard rate for STT
            'filename': source.name,
        }

        # Rough estimate: byte count over (16000 * 2) bytes per second,
        # never below one second.
        estimated_duration = max(1.0, len(raw_bytes) / (16000 * 2))
        try:
            content = AudioContent(duration=estimated_duration, **shared_fields)
        except ValueError:
            # Validation rejected the estimate; retry with a fixed 1.0 s duration.
            content = AudioContent(duration=1.0, **shared_fields)

        try:
            stt_provider = STTProviderFactory.create_provider(model_name)
        except SpeechRecognitionException:
            logger.warning(f"Requested provider {model_name} not available, using fallback")
            stt_provider = STTProviderFactory.create_provider_with_fallback(model_name)

        default_model = stt_provider.get_default_model()
        transcription = stt_provider.transcribe(content, default_model)
        transcript = transcription.text

        logger.info(f"Transcription completed: {transcript}")
        return transcript

    except Exception as e:
        # Every failure (including the missing-file error raised above) is
        # logged with its traceback and re-raised as the domain exception.
        reason = str(e)
        logger.error(f"Transcription failed: {reason}", exc_info=True)
        raise SpeechRecognitionException(f"Transcription failed: {reason}") from e
def create_audio_content_from_file(audio_path: Union[str, Path]) -> AudioContent:
    """
    Create AudioContent from an audio file, detecting metadata when possible.

    When pydub can be imported, the file is loaded with it to obtain the real
    sample rate and duration. When pydub is unavailable, a warning is logged
    and placeholder metadata (16000 Hz, 1.0 s) is used instead.

    Args:
        audio_path: Path to the audio file

    Returns:
        AudioContent: The audio content object

    Raises:
        SpeechRecognitionException: If file cannot be processed (on either
            the pydub path or the placeholder fallback path)
    """
    # Resolve the optional dependency in its own try block. Doing the fallback
    # work inside an ``except ImportError`` handler would let its errors bypass
    # the sibling ``except Exception`` clause and escape unwrapped.
    try:
        from pydub import AudioSegment
    except ImportError:
        AudioSegment = None
        logger.warning("pydub not available, using placeholder metadata")

    try:
        audio_path = Path(audio_path)

        if AudioSegment is not None:
            # Load audio file to get metadata
            audio_segment = AudioSegment.from_file(audio_path)
            sample_rate = audio_segment.frame_rate
            duration = len(audio_segment) / 1000.0  # Convert ms to seconds
        else:
            sample_rate = 16000  # Default
            duration = 1.0  # Placeholder

        # Read raw audio data
        with open(audio_path, 'rb') as f:
            audio_data = f.read()

        # Determine format from the extension; unknown extensions fall back to 'wav'
        audio_format = audio_path.suffix.lower().lstrip('.')
        if audio_format not in ['wav', 'mp3', 'flac', 'ogg']:
            audio_format = 'wav'

        return AudioContent(
            data=audio_data,
            format=audio_format,
            sample_rate=sample_rate,
            duration=duration,
            filename=audio_path.name
        )
    except Exception as e:
        raise SpeechRecognitionException(f"Failed to create AudioContent from file: {str(e)}") from e