# Exploration for Speech Recognition

In [None]:
# Install the notebook's dependencies into the kernel's own environment
# (`%pip` targets the running kernel, unlike a plain shell `pip`).
# NOTE(review): versions are unpinned, so results may change with future releases.
%pip install --upgrade pip
# `devtools` provides the `pprint` helper used to display the transcription chunks.
%pip install devtools
# Model runtime (torch, transformers), the agent framework (smolagents) and the OpenAI client.
%pip install torch transformers smolagents openai

In [3]:
import sys
from pathlib import Path

from dotenv import load_dotenv

# Put the parent of the current working directory at the front of `sys.path`
# (presumably the project root, so local modules such as `tools` can be imported).
# The guard keeps the cell idempotent: re-running it does not stack duplicate entries.
parent_dir = str(Path.cwd().parent)
if not sys.path or sys.path[0] != parent_dir:
    sys.path.insert(0, parent_dir)

# Load environment variables from a `.env` file; left as the last expression so
# the cell displays whether a file was found and loaded.
load_dotenv()

True

We are going to use the `openai/whisper-large-v3-turbo` model from the HuggingFace Hub for speech recognition.

First, we are going to create a valid configuration.

Make sure `ffmpeg` is installed on the system (e.g. with `brew install ffmpeg`), or install it with `pip install static-ffmpeg`.

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from devtools import pprint

# Use the first CUDA GPU with half precision when one is available;
# otherwise fall back to the CPU with full precision.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
if torch.cuda.is_available():
    torch_dtype = torch.float16
else:
    torch_dtype = torch.float32

# Load the Whisper checkpoint and move it to the selected device.
model_id = "openai/whisper-large-v3-turbo"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)
processor = AutoProcessor.from_pretrained(model_id)

# Assemble the speech-recognition pipeline from the model and the processor's
# tokenizer / feature extractor; timestamps are requested for every chunk.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    return_timestamps=True,
)

# Transcribe the sample recording stored next to the notebook.
audio_path = "data/tasks/1f975693-876d-457b-a649-393859e79bf3/1f975693-876d-457b-a649-393859e79bf3.mp3"
result = pipe(audio_path)

# Show the full transcript first, then the per-chunk text with timestamps.
transcript = result["text"]
print(f"Text:\n{transcript}")
print("Chunks:")
pprint(result["chunks"])

Device set to use cpu
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


Text:
 Before you all go, I want to remind you that the midterm is next week. Here's a little hint. You should be familiar with the differential equations on page 245. Problems that are very similar to problems 32, 33 and 44 from that page might be on the test. And also some of you might want to brush up on the last page in the integration section, page 197. I know some of you struggled on last week's quiz. I foresee problem 22 from page 197 being on your midterm. Oh, and don't forget to brush up on the section on related rates on pages 132, 133 and 134
Chunks:
[
    {
        'timestamp': (
            0.0,
            3.62,
        ),
        'text': ' Before you all go, I want to remind you that the midterm is next week.',
    },
    {
        'timestamp': (
            4.2,
            4.98,
        ),
        'text': " Here's a little hint.",
    },
    {
        'timestamp': (
            5.48,
            9.0,
        ),
        'text': ' You should be familiar with the differen

Working with a URL directly:

In [None]:
# The pipeline is given a URL here instead of a local file path.
audio_url = "https://agents-course-unit4-scoring.hf.space/files/1f975693-876d-457b-a649-393859e79bf3"
result = pipe(audio_url)

transcript = result["text"]
print(f"Text:\n{transcript}")



Text:
 Before you all go, I want to remind you that the midterm is next week. Here's a little hint. You should be familiar with the differential equations on page 245. Problems that are very similar to problems 32, 33 and 44 from that page might be on the test. And also some of you might want to brush up on the last page in the integration section, page 197. I know some of you struggled on last week's quiz. I foresee problem 22 from page 197 being on your midterm. Oh, and don't forget to brush up on the section on related rates on pages 132, 133 and 134


We are going to wrap this code as a smolagents tool. Because chunk timestamps are relative to their chunk, we are going to fix the chunk length and recalculate absolute timestamps so that they can be matched with other data (e.g. video frames). Also, to clean up the output, we are going to suppress undesired messages and warnings.

In [26]:
from smolagents import Tool
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, logging
import warnings


class SpeechRecognitionTool(Tool):
    """smolagents tool that transcribes audio with `openai/whisper-large-v3-turbo`.

    The speech-recognition pipeline is built the first time the class is
    instantiated and stored on the class as `pipe`, so later instances reuse the
    already-loaded model instead of loading it again.
    """

    name = "speech_to_text"
    description = """Transcribes speech from audio."""

    inputs = {
        "audio": {
            "type": "string",
            "description": "Path to the audio file to transcribe.",
        },
        "with_time_markers": {
            "type": "boolean",
            # Keep this in sync with the markers emitted by `forward`: one
            # `[seconds]` marker per line, before and after each chunk of text.
            "description": "Whether to include timestamps in the transcription output. Each timestamp appears on its own line in the format [float], indicating the number of seconds elapsed from the start of the audio.",
            "nullable": True,
            "default": False,
        },
    }
    output_type = "string"

    # Length (in seconds) of the audio windows the pipeline processes; `forward`
    # uses the same value to turn window-relative timestamps into absolute ones.
    chunk_length_s = 30

    def __new__(cls, *args, **kwargs):
        """Create the tool instance, building the ASR pipeline once per class."""
        # Loading the model is expensive, so only do it when this class does not
        # have its own pipeline yet. Checking `cls.__dict__` (rather than using
        # getattr) ensures a subclass with a different `chunk_length_s` builds
        # its own pipeline instead of inheriting the parent's.
        if "pipe" not in cls.__dict__:
            # Use the first CUDA GPU with half precision when available,
            # otherwise the CPU with full precision.
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
            torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

            model_id = "openai/whisper-large-v3-turbo"
            model = AutoModelForSpeechSeq2Seq.from_pretrained(
                model_id,
                torch_dtype=torch_dtype,
                low_cpu_mem_usage=True,
                use_safetensors=True,
            )
            model.to(device)
            processor = AutoProcessor.from_pretrained(model_id)

            # Keep the tool output clean: silence transformers' informational
            # logging and the deprecation warning about the `inputs` argument.
            logging.set_verbosity_error()
            warnings.filterwarnings(
                "ignore",
                category=FutureWarning,
                message=r".*The input name `inputs` is deprecated.*",
            )
            cls.pipe = pipeline(
                "automatic-speech-recognition",
                model=model,
                tokenizer=processor.tokenizer,
                feature_extractor=processor.feature_extractor,
                torch_dtype=torch_dtype,
                device=device,
                chunk_length_s=cls.chunk_length_s,
                return_timestamps=True,
            )

        return super().__new__(cls, *args, **kwargs)

    def forward(self, audio: str, with_time_markers: bool = False) -> str:
        """
        Transcribes speech from audio.

        Args:
            audio (str): Path to the audio file to transcribe.
            with_time_markers (bool): Whether to include timestamps in the transcription output. Each timestamp appears on its own line in the format [float], indicating the number of seconds elapsed from the start of the audio.

        Returns:
            str: The transcribed text. With `with_time_markers`, every non-empty
            chunk is rendered as three lines: `[start]`, the chunk text, `[end]`.
        """
        result = self.pipe(audio)
        if not with_time_markers:
            return result["text"].strip()

        lines = []
        window_s = self.chunk_length_s
        # Start time of the current window, accumulated as timestamps wrap around.
        absolute_offset = 0.0
        # End timestamp of the previous chunk, used to detect a wrap-around.
        previous_end = 0.0
        for chunk in result["chunks"]:
            timestamp_start, timestamp_end = chunk["timestamp"]
            # The pipeline can report `None` for a timestamp the model did not
            # predict (typically the end of a final segment that is cut off).
            # Fall back to the nearest known time instead of failing on the
            # comparisons and formatting below.
            if timestamp_start is None:
                timestamp_start = previous_end
            if timestamp_end is None:
                timestamp_end = timestamp_start

            # A start earlier than the previous end means the timestamps have
            # restarted, i.e. this chunk belongs to the next window.
            if timestamp_start < previous_end:
                absolute_offset += window_s
            absolute_start = absolute_offset + timestamp_start

            # An end earlier than its own start means the chunk crosses a window
            # boundary, so its end is measured from the next window.
            if timestamp_end < timestamp_start:
                absolute_offset += window_s
            absolute_end = absolute_offset + timestamp_end
            previous_end = timestamp_end

            chunk_text = chunk["text"].strip()
            if chunk_text:
                lines.append(
                    f"[{absolute_start:.2f}]\n{chunk_text}\n[{absolute_end:.2f}]"
                )
        return "\n".join(lines)


# Create the tool instance used by the cells below; instantiating the class
# loads the Whisper model and builds the speech-recognition pipeline.
speech_to_text = SpeechRecognitionTool()

Verify tool implementation:

In [23]:
# Run the tool on the sample recording and print the transcript with time markers.
sample_audio = "data/tasks/1f975693-876d-457b-a649-393859e79bf3/1f975693-876d-457b-a649-393859e79bf3.mp3"
transcription = speech_to_text(audio=sample_audio, with_time_markers=True)

print(transcription)

[0.00]
Before you all go, I want to remind you that the midterm is next week. Here's a little hint. You should be familiar with the differential equations on page 245. Problems that are very similar to problems 32, 33 and 44 from that page might be on the test. And also some of you might want to brush up on the last page in the integration section, page 197. I know some of you struggled on last week's quiz. I foresee problem 22 from page 197 being on your midterm.
[27.52]
[28.20]
Oh, and don't forget to brush up on the section on related rates
[30.70]
[30.70]
on pages 132, 133 and 134.
[34.78]


Now it is time to verify that the agent can use our tools in the GAIA challenge environment.

In [None]:
from smolagents import ToolCallingAgent, OpenAIServerModel
from tools import GetAttachmentTool

# Tools handed to the agent: the attachment tool plus the transcription tool.
get_attachment = GetAttachmentTool()
model = OpenAIServerModel(model_id="gpt-4.1")
agent = ToolCallingAgent(tools=[get_attachment, speech_to_text], model=model)

# GAIA task ids whose attachment is an audio file.
audio_task_ids = (
    "1f975693-876d-457b-a649-393859e79bf3",
    "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
)

for task_with_audio_attachment in audio_task_ids:
    # Presumably selects which task's attachment the tool serves - confirm in `tools`.
    get_attachment.attachment_for(task_with_audio_attachment)
    agent.run("Transcribe attached audio")

And now let's verify full GAIA task execution.

In [None]:
# Original GAIA questions, keyed by the id of the task that owns the audio attachment.
gaia_questions = {
    "1f975693-876d-457b-a649-393859e79bf3": """\
Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(
Could you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.
""",
    "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3": """\
Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.
In your response, please only list the ingredients, not any measurements. So if the recipe calls for "a pinch of salt" or "two cups of ripe strawberries" the ingredients on the list would be "salt" and "ripe strawberries".
Please format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.
""",
}

# Point the attachment tool at each task, then let the agent answer the full question.
for task_with_audio_attachment, question in gaia_questions.items():
    get_attachment.attachment_for(task_with_audio_attachment)
    agent.run(question)