su3su2u1's picture
Add Google STT
b679b6b unverified
from smolagents import SpeechToTextTool, Tool
class EnglishSpeechToTextTool(SpeechToTextTool):
    """Variant of ``SpeechToTextTool`` that requests English transcription.

    The three stage methods are overridden so that the pre-processor is
    called with ``sampling_rate=16_000``, generation is called with
    ``language="en"``, and the decoded text is returned under a Markdown
    ``## Transcription`` heading.
    """

    def encode(self, audio):
        """Turn ``audio`` into model inputs.

        ``audio`` is wrapped in ``AgentAudio`` and converted to its raw form
        before being handed to ``self.pre_processor``, which is asked for
        PyTorch tensors (``return_tensors="pt"``).
        """
        # Imported lazily, as in the original, so the dependency is only
        # touched when the tool actually runs.
        from smolagents.agent_types import AgentAudio

        raw_audio = AgentAudio(audio).to_raw()
        features = self.pre_processor(
            raw_audio,
            return_tensors="pt",
            sampling_rate=16_000,
        )
        return features

    def forward(self, inputs):
        """Run generation on ``inputs["input_features"]`` with ``language="en"``."""
        input_features = inputs["input_features"]
        return self.model.generate(input_features, language="en")

    def decode(self, outputs):
        """Decode ``outputs`` and return the first entry under a Markdown heading."""
        decoded_batch = self.pre_processor.batch_decode(
            outputs,
            skip_special_tokens=True,
        )
        first_transcript = decoded_batch[0]
        return "## Transcription\n\n" + first_transcript
class GoogleSTTTool(Tool):
    """Transcribe audio to text with the Google Cloud Speech-to-Text v2 API.

    The audio is sent in a single synchronous ``recognize`` request using
    automatic decoding detection, the ``"long"`` model and ``en-US`` as the
    language. The transcript is returned as a plain string, matching the
    declared ``output_type``.
    """

    description = "This is a tool that transcribes an audio into text. It returns the transcribed text."
    name = "transcriber"
    # NOTE: the input description below mentions URLs and tensors, but
    # ``forward`` only handles a local file path or raw bytes.
    inputs = {
        "audio": {
            "type": "audio",
            "description": "The audio to transcribe. Can be a local path, an url, or a tensor.",
        }
    }
    output_type = "string"

    def forward(self, audio):
        """Transcribe ``audio`` and return the recognized text.

        Args:
            audio: Path to a local audio file, or the audio content as
                ``bytes``/``bytearray``. For backward compatibility with the
                previous ``forward(inputs)`` form, a dict carrying the value
                under the ``"audio"`` key is also accepted.

        Returns:
            The best transcript of every recognized segment, joined with
            single spaces. Empty string when nothing was recognized.

        Raises:
            ValueError: If no Google Cloud project ID can be determined.
        """
        # Imported lazily so the Google client library is only required
        # when the tool is actually executed.
        from google.cloud.speech_v2 import SpeechClient
        from google.cloud.speech_v2.types import cloud_speech

        # Legacy call shape: forward({"audio": path}).
        if isinstance(audio, dict):
            audio = audio["audio"]

        # Resolve configuration before doing any I/O so misconfiguration
        # fails fast with a clear message.
        project_id = self._resolve_project_id()

        if isinstance(audio, (bytes, bytearray)):
            audio_content = bytes(audio)
        else:
            with open(audio, "rb") as audio_file:
                audio_content = audio_file.read()

        client = SpeechClient()
        config = cloud_speech.RecognitionConfig(
            auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
            language_codes=["en-US"],
            model="long",
        )
        request = cloud_speech.RecognizeRequest(
            recognizer=f"projects/{project_id}/locations/global/recognizers/_",
            config=config,
            content=audio_content,
        )

        response = client.recognize(request=request)
        return self._join_transcripts(response.results)

    def _resolve_project_id(self):
        """Return the Google Cloud project ID used to build the recognizer path.

        A module-level ``PROJECT_ID`` constant is honoured when one exists;
        otherwise the ``GOOGLE_CLOUD_PROJECT`` environment variable is used.
        """
        import os

        project_id = globals().get("PROJECT_ID") or os.environ.get("GOOGLE_CLOUD_PROJECT")
        if not project_id:
            raise ValueError(
                "No Google Cloud project configured: define PROJECT_ID or set "
                "the GOOGLE_CLOUD_PROJECT environment variable."
            )
        return project_id

    def _join_transcripts(self, results):
        """Join the top alternative of each result into one string."""
        # A result may carry no alternatives; indexing [0] on it would raise.
        segments = (
            result.alternatives[0].transcript.strip()
            for result in results
            if result.alternatives
        )
        return " ".join(segment for segment in segments if segment)