Spaces:
Sleeping
Sleeping
| from smolagents import SpeechToTextTool, Tool | |
class EnglishSpeechToTextTool(SpeechToTextTool):
    """Speech-to-text tool that forces English transcription.

    Overrides the encode / forward / decode stages of ``SpeechToTextTool``:
    the pre-processor is called with an explicit 16 kHz sampling rate,
    generation is called with ``language="en"``, and the decoded text is
    returned under a markdown "Transcription" heading.
    """

    def encode(self, audio):
        """Convert ``audio`` to its raw form and run the pre-processor on it."""
        # Imported lazily, as in the rest of this file.
        from smolagents.agent_types import AgentAudio

        raw_audio = AgentAudio(audio).to_raw()
        return self.pre_processor(
            raw_audio,
            return_tensors="pt",
            sampling_rate=16_000,
        )

    def forward(self, inputs):
        """Generate output tokens from the encoded features, in English."""
        features = inputs["input_features"]
        return self.model.generate(features, language="en")

    def decode(self, outputs):
        """Decode the first generated sequence and prefix it with a heading."""
        decoded = self.pre_processor.batch_decode(outputs, skip_special_tokens=True)
        return "## Transcription\n\n" + decoded[0]
class GoogleSTTTool(Tool):
    """Transcribe a local audio file with the Google Cloud Speech-to-Text v2 API.

    The file is read fully into memory and sent in a single synchronous
    ``recognize`` request using the ``"long"`` model and ``en-US``.
    """

    description = "This is a tool that transcribes an audio into text. It returns the transcribed text."
    name = "transcriber"
    inputs = {
        "audio": {
            "type": "audio",
            "description": "The audio to transcribe. Can be a local path, an url, or a tensor.",
        }
    }
    output_type = "string"

    def forward(self, audio):
        """Return the transcript of ``audio`` as a single string.

        Args:
            audio: Path of a local audio file. For backward compatibility a
                dict of the form ``{"audio": path}`` is also accepted.
                NOTE(review): only local paths are handled here, although the
                tool description also mentions URLs and tensors.

        Returns:
            The top transcript of every recognition result, joined with
            single spaces. Empty string when nothing was recognized.

        Raises:
            ValueError: If no Google Cloud project id can be determined.
            OSError: If the audio file cannot be opened or read.
        """
        import os

        from google.cloud.speech_v2 import SpeechClient
        from google.cloud.speech_v2.types import cloud_speech

        # The parameter is named after the key declared in ``inputs``; the
        # previous signature took a dict, so keep accepting that shape.
        audio_file = audio["audio"] if isinstance(audio, dict) else audio

        # Prefer a module-level PROJECT_ID when the file defines one; fall
        # back to the environment instead of failing with a NameError.
        project_id = globals().get("PROJECT_ID") or os.environ.get("GOOGLE_CLOUD_PROJECT")
        if not project_id:
            raise ValueError(
                "Google Cloud project id is not configured: define PROJECT_ID "
                "or set the GOOGLE_CLOUD_PROJECT environment variable."
            )

        with open(audio_file, "rb") as f:
            audio_content = f.read()

        # Instantiates a client
        client = SpeechClient()
        config = cloud_speech.RecognitionConfig(
            auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
            language_codes=["en-US"],
            model="long",
        )
        request = cloud_speech.RecognizeRequest(
            recognizer=f"projects/{project_id}/locations/global/recognizers/_",
            config=config,
            content=audio_content,
        )

        # Transcribes the audio into text
        response = client.recognize(request=request)

        # ``output_type`` is "string": return the text, not the response
        # object. Results without any alternative are skipped.
        transcripts = [
            result.alternatives[0].transcript.strip()
            for result in response.results
            if result.alternatives
        ]
        return " ".join(transcripts)