HF_Agents_Final_Project

Runtime error

HF_Agents_Final_Project / src /speech_to_text.py

Yago Bolivar

Refactor speech_to_text.py to implement a singleton ASR pipeline, enhance error handling, and introduce SpeechToTextTool for better integration. Update spreadsheet_tool.py to support querying and improve parsing functionality, including CSV support. Enhance video_processing_tool.py with new tasks for metadata extraction and frame extraction, while improving object detection capabilities and initialization checks.

87aa741 10 months ago

raw

history blame contribute delete

5.85 kB

	from transformers import pipeline
	import librosa # Or soundfile
	import os
	from smolagents.tools import Tool # Added import
	from typing import Optional # Added for type hinting

	# Module-level cache for the single shared ASR pipeline; filled lazily by
	# get_asr_pipeline() and left as None until a build succeeds.
	_asr_pipeline_instance = None


	def get_asr_pipeline():
	    """
	    Return the shared ASR pipeline, building it on first use.

	    The pipeline is stored in the module-level ``_asr_pipeline_instance`` so
	    later calls reuse it. If construction fails, the error is printed and
	    ``None`` is returned; because nothing is cached in that case, the next
	    call attempts the construction again.

	    Returns:
	        The cached pipeline object, or None when it could not be created.
	    """
	    global _asr_pipeline_instance

	    # Fast path: a pipeline was already built by an earlier call.
	    if _asr_pipeline_instance is not None:
	        return _asr_pipeline_instance

	    # A small Whisper checkpoint keeps setup quick; larger models are more
	    # accurate. (Consider making the model configurable.)
	    model_id = "openai/whisper-tiny.en"
	    try:
	        _asr_pipeline_instance = pipeline(
	            "automatic-speech-recognition",
	            model=model_id,
	        )
	        print("ASR pipeline initialized.")  # For feedback
	    except Exception as init_error:
	        # Best-effort: report the failure and fall through with the cache
	        # still unset so callers receive None.
	        print(f"Error initializing ASR pipeline: {init_error}")
	    return _asr_pipeline_instance


	# Original transcription function, renamed to be internal
	def _transcribe_audio_file(audio_filepath: str, asr_pipeline_instance) -> str:
	"""
	Converts speech in an audio file to text using the provided ASR pipeline.
	Args:
	audio_filepath (str): Path to the audio file.
	asr_pipeline_instance: The initialized ASR pipeline.
	Returns:
	str: Transcribed text from the audio or an error message.
	"""
	if not asr_pipeline_instance:
	return "Error: ASR pipeline is not available."
	if not os.path.exists(audio_filepath):
	return f"Error: Audio file not found at {audio_filepath}"
	try:
	# Ensure the file can be loaded by librosa (or your chosen audio library)
	# This step can help catch corrupted or unsupported audio formats early.
	y, sr = librosa.load(audio_filepath, sr=None) # Load with original sample rate
	if sr != 16000: # Whisper models expect 16kHz
	y = librosa.resample(y, orig_sr=sr, target_sr=16000)

	# Pass the numpy array to the pipeline
	transcription_result = asr_pipeline_instance(
	{"raw": y, "sampling_rate": 16000}, return_timestamps=False
	) # Changed to False for simplicity
	return transcription_result["text"]
	except Exception as e:
	return f"Error during transcription of {audio_filepath}: {e}"


	class SpeechToTextTool(Tool):
	    """
	    Transcribes audio from a given audio file path to text.

	    On construction the tool obtains the shared ASR pipeline from
	    ``get_asr_pipeline()``. ``forward`` hands the path to
	    ``_transcribe_audio_file`` and always returns a string: either the
	    transcription or an error message.
	    """

	    name = "speech_to_text_transcriber"
	    description = "Converts speech in an audio file (e.g., .mp3, .wav) to text using speech recognition."
	    inputs = {
	        "audio_filepath": {"type": "string", "description": "Path to the audio file to transcribe."}
	    }
	    # NOTE(review): presumably descriptive only; the declared return type is
	    # carried by `output_type` below - confirm whether the framework reads
	    # `outputs` at all.
	    outputs = {
	        "transcribed_text": {
	            "type": "string",
	            "description": "The transcribed text from the audio, or an error message.",
	        }
	    }
	    output_type = "string"

	    def __init__(self, *args, **kwargs):
	        """
	        Create the tool and attach the shared ASR pipeline.

	        Positional and keyword arguments are forwarded unchanged to the base
	        ``Tool`` initializer, so the tool can be built with no arguments.
	        """
	        super().__init__(*args, **kwargs)
	        self.asr_pipeline = get_asr_pipeline()  # Initialize or get the shared pipeline
	        # NOTE(review): the base Tool may maintain its own `is_initialized`
	        # flag - confirm the two uses do not conflict.
	        self.is_initialized = bool(self.asr_pipeline)

	    def forward(self, audio_filepath: str) -> str:
	        """
	        Transcribe the audio file at ``audio_filepath``.

	        Args:
	            audio_filepath (str): Path to the audio file to transcribe.
	        Returns:
	            str: The transcription, or an error message when the pipeline is
	            missing or the transcription fails.
	        """
	        if not self.is_initialized or not self.asr_pipeline:
	            return "Error: SpeechToTextTool was not initialized properly (ASR pipeline missing)."
	        return _transcribe_audio_file(audio_filepath, self.asr_pipeline)


	# Expose the original function name if needed by other parts of the system (optional)
	# transcribe_audio = _transcribe_audio_file # This would need adjustment if it expects the pipeline passed in

	# Example usage:
	if __name__ == "__main__":
	    stt_tool = SpeechToTextTool()

	    # Manual smoke test. The sample path points at a GAIA example download;
	    # replace it with any local .mp3 or .wav file to try the tool out.
	    # A second GAIA sample that can be swapped in the same way:
	    #   ./data/downloaded_files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3
	    sample_audio = "./data/downloaded_files/1f975693-876d-457b-a649-393859e79bf3.mp3"

	    if not stt_tool.is_initialized:
	        print(
	            "SpeechToTextTool could not be initialized (ASR pipeline missing). Transcription test skipped."
	        )
	    elif os.path.exists(sample_audio):
	        print(f"Attempting to transcribe: {sample_audio}")
	        sample_text = stt_tool.forward(sample_audio)
	        print(f"Transcription:\n{sample_text}")
	    else:
	        print(f"Test audio file not found: {sample_audio}. Skipping transcription test.")
	        print("Please place a sample .mp3 or .wav file at that location for testing.")

	    # A path that does not exist must come back as an error message, whether
	    # or not the pipeline was initialized.
	    missing_path = "./non_existent_audio.mp3"
	    print(f"\nAttempting to transcribe non-existent file: {missing_path}")
	    missing_result = stt_tool.forward(missing_path)
	    print(f"Result for non-existent file:\n{missing_result}")
	    assert "Error:" in missing_result  # Expect an error message