Spaces:

DroolingPanda
/

teachingAssistant

Sleeping

teachingAssistant/src/infrastructure/stt/legacy_compatibility.py

Michael Hu

refactor: remove parakeet ASR provider and update all references to Whisper only

619b266 4 months ago

5.1 kB

	"""Legacy compatibility functions for STT functionality."""

	import logging
	from pathlib import Path
	from typing import Union

	from .provider_factory import STTProviderFactory
	from ...domain.models.audio_content import AudioContent
	from ...domain.exceptions import SpeechRecognitionException

	# Module-level logger named after this module, shared by the functions below.
	logger = logging.getLogger(__name__)


	def transcribe_audio(audio_path: Union[str, Path], model_name: str = "whisper") -> str:
	    """
	    Transcribe an audio file to text through an STT provider (legacy interface).

	    Kept so that callers of the original utils/stt.py function keep working.
	    The file is read fully into memory and wrapped in an AudioContent with
	    placeholder metadata: the sample rate is fixed at 16000 and the duration
	    is estimated as ``byte_count / (16000 * 2)``, never below 1.0. If
	    AudioContent rejects that with a ValueError, a duration of 1.0 is used.

	    The provider comes from ``STTProviderFactory.create_provider``; when that
	    raises SpeechRecognitionException, ``create_provider_with_fallback`` is
	    tried instead. Transcription uses the provider's default model.

	    Args:
	        audio_path: Path to input audio file
	        model_name: Name of the STT model/provider to use (whisper)

	    Returns:
	        str: The ``text`` attribute of the provider's transcription result

	    Raises:
	        SpeechRecognitionException: On any failure, including a missing file.
	            The original error is logged with its traceback and chained as
	            the cause; the message is prefixed with "Transcription failed: ".
	    """
	    logger.info(f"Starting transcription for: {audio_path} using {model_name} model")

	    try:
	        # Normalise to a Path so existence, suffix and name are available.
	        audio_path = Path(audio_path)
	        if not audio_path.exists():
	            raise SpeechRecognitionException(f"Audio file not found: {audio_path}")

	        # Load the raw bytes that will be handed to the provider.
	        with open(audio_path, 'rb') as stream:
	            payload = stream.read()

	        # Unrecognised extensions are treated as wav.
	        extension = audio_path.suffix.lower().lstrip('.')
	        audio_format = extension if extension in ('wav', 'mp3', 'flac', 'ogg') else 'wav'

	        def _build_content(duration: float) -> AudioContent:
	            # Only the duration differs between the first attempt and the retry.
	            return AudioContent(
	                data=payload,
	                format=audio_format,
	                sample_rate=16000,  # Standard rate for STT
	                duration=duration,
	                filename=audio_path.name,
	            )

	        # Placeholder metadata; the provider is expected to analyse the real
	        # audio during its own preprocessing.
	        estimated_duration = max(1.0, len(payload) / (16000 * 2))  # Rough estimate
	        try:
	            audio_content = _build_content(estimated_duration)
	        except ValueError:
	            # Validation rejected the estimate: retry with the minimum duration.
	            audio_content = _build_content(1.0)

	        # Prefer the requested provider, otherwise let the factory pick one.
	        try:
	            stt_provider = STTProviderFactory.create_provider(model_name)
	        except SpeechRecognitionException:
	            logger.warning(f"Requested provider {model_name} not available, using fallback")
	            stt_provider = STTProviderFactory.create_provider_with_fallback(model_name)

	        default_model = stt_provider.get_default_model()
	        result = stt_provider.transcribe(audio_content, default_model).text

	        logger.info(f"Transcription completed: {result}")
	        return result

	    except Exception as e:
	        logger.error(f"Transcription failed: {str(e)}", exc_info=True)
	        raise SpeechRecognitionException(f"Transcription failed: {str(e)}") from e


	def create_audio_content_from_file(audio_path: Union[str, Path]) -> AudioContent:
	    """
	    Create AudioContent from an audio file, detecting metadata when possible.

	    The file's bytes are read unchanged. When pydub can be imported, the
	    sample rate and duration come from the loaded AudioSegment. Otherwise a
	    warning is logged and placeholder values are used (sample rate 16000,
	    duration 1.0 second). Extensions other than wav/mp3/flac/ogg are
	    reported as 'wav'.

	    Args:
	        audio_path: Path to the audio file

	    Returns:
	        AudioContent: The audio content object

	    Raises:
	        SpeechRecognitionException: If the file cannot be read, decoded, or
	            wrapped in an AudioContent, on both the pydub path and the
	            placeholder fallback path. The original error is chained as
	            the cause.
	    """
	    try:
	        audio_path = Path(audio_path)

	        # Only metadata detection depends on pydub, so the ImportError handler
	        # is scoped to it. Everything else stays under the outer handler;
	        # errors raised inside an `except` clause are not caught by sibling
	        # clauses of the same `try`, which is why the fallback must not do
	        # its I/O from within the ImportError handler.
	        try:
	            # Imported lazily so pydub remains an optional dependency.
	            from pydub import AudioSegment

	            # Load audio file to get metadata
	            audio_segment = AudioSegment.from_file(audio_path)
	            sample_rate = audio_segment.frame_rate
	            duration = len(audio_segment) / 1000.0  # Convert ms to seconds
	        except ImportError:
	            # Fallback without pydub
	            logger.warning("pydub not available, using placeholder metadata")
	            sample_rate = 16000  # Default
	            duration = 1.0  # Placeholder

	        # Read raw audio data
	        with open(audio_path, 'rb') as f:
	            audio_data = f.read()

	        # Determine format from the extension, defaulting to wav
	        audio_format = audio_path.suffix.lower().lstrip('.')
	        if audio_format not in ['wav', 'mp3', 'flac', 'ogg']:
	            audio_format = 'wav'

	        return AudioContent(
	            data=audio_data,
	            format=audio_format,
	            sample_rate=sample_rate,
	            duration=duration,
	            filename=audio_path.name
	        )

	    except Exception as e:
	        raise SpeechRecognitionException(f"Failed to create AudioContent from file: {str(e)}") from e