Spaces:
Runtime error
Runtime error
Yago Bolivar
Refactor speech_to_text.py to implement a singleton ASR pipeline, enhance error handling, and introduce SpeechToTextTool for better integration. Update spreadsheet_tool.py to support querying and improve parsing functionality, including CSV support. Enhance video_processing_tool.py with new tasks for metadata extraction and frame extraction, while improving object detection capabilities and initialization checks.
87aa741 | from transformers import pipeline | |
| import librosa # Or soundfile | |
| import os | |
| from smolagents.tools import Tool # Added import | |
| from typing import Optional # Added for type hinting | |
| # Initialize the ASR pipeline once | |
| _asr_pipeline_instance = None | |
def get_asr_pipeline():
    """Return the module-wide ASR pipeline, building it on first use.

    The pipeline is cached in the module global ``_asr_pipeline_instance`` so
    that later calls reuse the already-built object. Construction is
    best-effort: if it fails, the error is printed, the cache stays empty,
    and ``None`` is returned, so the next call will try again.

    Returns:
        The cached ``automatic-speech-recognition`` pipeline, or ``None`` when
        it could not be created.
    """
    global _asr_pipeline_instance

    # Fast path: a pipeline was already built by an earlier call.
    if _asr_pipeline_instance is not None:
        return _asr_pipeline_instance

    try:
        # A small Whisper checkpoint keeps start-up quick; larger checkpoints
        # trade speed for accuracy. The model name is currently hard-coded.
        _asr_pipeline_instance = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-tiny.en",
        )
        print("ASR pipeline initialized.")  # For feedback
    except Exception as exc:
        # Deliberately non-fatal: callers check for None instead of catching.
        print(f"Error initializing ASR pipeline: {exc}")

    return _asr_pipeline_instance
| # Original transcription function, renamed to be internal | |
def _transcribe_audio_file(audio_filepath: str, asr_pipeline_instance) -> str:
    """Convert speech in an audio file to text using the provided ASR pipeline.

    Failures are reported through the return value rather than raised, so the
    result is always a string.

    Args:
        audio_filepath (str): Path to the audio file.
        asr_pipeline_instance: The initialized ASR pipeline (a callable that
            accepts ``{"raw": samples, "sampling_rate": rate}`` and returns a
            mapping with a ``"text"`` entry).

    Returns:
        str: Transcribed text from the audio, or a message starting with
        ``"Error"`` when the pipeline is missing, the path is not an existing
        regular file, or loading/transcription fails.
    """
    if not asr_pipeline_instance:
        return "Error: ASR pipeline is not available."

    # isfile (not exists) so a directory is reported as a missing audio file
    # instead of failing later inside the audio loader; the truthiness check
    # keeps None / "" from raising out of a function that returns error text.
    if not audio_filepath or not os.path.isfile(audio_filepath):
        return f"Error: Audio file not found at {audio_filepath}"

    target_rate = 16000  # Whisper models expect 16 kHz input

    try:
        # Decoding through librosa surfaces corrupted or unsupported audio
        # early; asking for the target rate lets it resample while loading.
        samples, _ = librosa.load(audio_filepath, sr=target_rate)

        # Whisper works on 30-second windows. Chunking lets the pipeline cover
        # recordings longer than a single window instead of cutting them off.
        transcription_result = asr_pipeline_instance(
            {"raw": samples, "sampling_rate": target_rate},
            return_timestamps=False,
            chunk_length_s=30,
        )
        return transcription_result["text"]
    except Exception as e:
        # Best-effort contract: report the failure as text for the caller.
        return f"Error during transcription of {audio_filepath}: {e}"
class SpeechToTextTool(Tool):
    """
    Tool that turns the speech in an audio file into text.

    The tool holds a reference to the shared ASR pipeline returned by
    ``get_asr_pipeline()`` and delegates the actual work to
    ``_transcribe_audio_file``. Problems are reported as error strings.
    """

    name = "speech_to_text_transcriber"
    description = "Converts speech in an audio file (e.g., .mp3, .wav) to text using speech recognition."
    inputs = {
        "audio_filepath": {
            "type": "string",
            "description": "Path to the audio file to transcribe.",
        }
    }
    # Descriptive metadata for the single string result of ``forward``.
    outputs = {
        "transcribed_text": {
            "type": "string",
            "description": "The transcribed text from the audio, or an error message.",
        }
    }
    output_type = "string"

    def __init__(self, *args, **kwargs):
        """Set up the base tool, then attach the shared ASR pipeline."""
        super().__init__(*args, **kwargs)
        # Reuses the module-level pipeline; this is None if it failed to build.
        self.asr_pipeline = get_asr_pipeline()
        self.is_initialized = bool(self.asr_pipeline)

    def forward(self, audio_filepath: str) -> str:
        """
        Transcribe ``audio_filepath`` with the attached pipeline.

        Args:
            audio_filepath (str): Path to the audio file to transcribe.

        Returns:
            str: The transcription, or an error message when the pipeline is
            unavailable or transcription fails.
        """
        if self.is_initialized and self.asr_pipeline:
            return _transcribe_audio_file(audio_filepath, self.asr_pipeline)
        return "Error: SpeechToTextTool was not initialized properly (ASR pipeline missing)."
| # Expose the original function name if needed by other parts of the system (optional) | |
| # transcribe_audio = _transcribe_audio_file # This would need adjustment if it expects the pipeline passed in | |
| # Example usage: | |
if __name__ == "__main__":
    # Manual smoke test: transcribe a sample file if one is available, then
    # confirm that a missing file is reported as an error string.
    tool_instance = SpeechToTextTool()

    # Path to a test audio file (replace with an actual .mp3 or .wav file).
    # No audio is generated here, to avoid extra dependencies; download a
    # short sample and place it at this location before running.
    test_audio_file = "./data/downloaded_files/1f975693-876d-457b-a649-393859e79bf3.mp3"  # GAIA example
    # A second GAIA sample can be tried the same way:
    # test_audio_file_2 = "./data/downloaded_files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3"

    if not tool_instance.is_initialized:
        print(
            "SpeechToTextTool could not be initialized (ASR pipeline missing). Transcription test skipped."
        )
    elif os.path.exists(test_audio_file):
        print(f"Attempting to transcribe: {test_audio_file}")
        transcribed_text = tool_instance.forward(test_audio_file)
        print(f"Transcription:\n{transcribed_text}")
    else:
        print(
            f"Test audio file not found: {test_audio_file}. Skipping transcription test."
        )
        print("Please place a sample .mp3 or .wav file at that location for testing.")

    # Test with a non-existent file; this runs whether or not the pipeline
    # was initialized, since both cases return an error string.
    non_existent_file = "./non_existent_audio.mp3"
    print(f"\nAttempting to transcribe non-existent file: {non_existent_file}")
    error_text = tool_instance.forward(non_existent_file)
    print(f"Result for non-existent file:\n{error_text}")
    assert "Error:" in error_text  # Expect an error message