Gabandino committed on
Commit
9de0414
·
verified ·
1 Parent(s): 3a62410

Upload all tools for final submission and __init__.py file

Browse files

Upload all tools:
Chess tools
Classifier tool
Content retriever tool
Get attachments tool
Google search tools
YouTube video tool

Also, upload the __init__.py file to ensure all tools can be referenced by other files within the space.

tools/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from .get_attachments_tool import GetAttachmentTool
from .google_search_tools import GoogleSearchTool, GoogleSiteSearchTool
from .content_retriever_tool import ContentRetrieverTool
from .speech_recognition_tool import SpeechRecognitionTool
from .youtube_video_tool import YouTubeVideoTool
from .classifier_tool import ClassifierTool
from .chess_tools import ImageToChessBoardFENTool, chess_engine_locator

# Public API of the tools package.
# BUG FIX: __all__ previously listed "YoutubeVideoTool" (lowercase "t"), which
# does not match the imported name YouTubeVideoTool and broke
# `from tools import *` with an AttributeError.
__all__ = [
    "GetAttachmentTool",
    "GoogleSearchTool",
    "GoogleSiteSearchTool",
    "ContentRetrieverTool",
    "SpeechRecognitionTool",
    "YouTubeVideoTool",
    "ClassifierTool",
    "ImageToChessBoardFENTool",
    "chess_engine_locator",
]
tools/chess_tools.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from smolagents import Tool, tool
2
+ from openai import OpenAI
3
+ import shutil
4
+
5
+
6
@tool
def chess_engine_locator() -> str | None:
    """
    Locate the Stockfish chess engine binary on PATH.

    The returned path can be passed to chess.engine.SimpleEngine.popen_uci
    from the `chess` Python module.

    Returns:
        str | None: Path to the chess engine, or None when Stockfish is not installed.
    """
    # shutil.which already returns None when the binary is absent,
    # so no extra fallback is needed.
    return shutil.which('stockfish')
16
class ImageToChessBoardFENTool(Tool):
    """Convert a chessboard image into the piece-placement (board) field of a FEN string."""

    name = 'image_to_chess_board_fen'
    description = '''Convert a chessboard image to board part of the FEN.'''
    inputs = {
        'image_url': {
            'type': 'string',
            'description': 'Public URL of the image (preferred) or base64 encoded image in data URL format',
        }
    }
    output_type = 'string'

    def __init__(self, client: OpenAI | None = None, **kwargs):
        # Reuse an injected client (useful for testing); otherwise create one
        # from the default environment configuration.
        self.client = client if client is not None else OpenAI()
        super().__init__(**kwargs)

    def attachment_for(self, task_id: str | None):
        """Remember the task id of the current attachment (API symmetry with other tools)."""
        self.task_id = task_id

    def forward(self, image_url: str) -> str:
        """
        Convert a chessboard image to board part of the FEN.
        Args:
            image_url (str): Public URL of the image (preferred) or base64 encoded image in data URL format.
        Returns:
            str: Board part of the FEN.
        """
        client = self.client

        # First pass: have the vision model describe the board in free text.
        # BUG FIX: was `client.response.create` (AttributeError) — the OpenAI
        # Responses API lives at `client.responses.create`.
        response = client.responses.create(
            model='gpt-4.1',
            input=[
                {
                    'role': 'user',
                    'content': [
                        {
                            'type': 'input_text',
                            'text': 'Describe the position of the pieces on the chessboard from the image. Please, nothing else but description.',
                        },
                        {'type': 'input_image', 'image_url': image_url},
                    ],
                },
            ],
        )

        # Second pass: feed the description back and ask for one
        # "<piece><square>" entry per line.
        response = client.responses.create(
            model='gpt-4.1',
            input=[
                {
                    'role': 'user',
                    'content': [
                        {
                            'type': 'input_text',
                            'text': 'Describe the position of the pieces on the chessboard from the image. Please, nothing else but description.',
                        },
                    ],
                },
            ]
            + response.output
            + [
                {
                    'role': 'user',
                    'content': [
                        {
                            'type': 'input_text',
                            'text': """\
Write down all positions with known pieces.
Use a standard one-letter code to name pieces.
It is important to use the correct case for piece code. Use upper case for white and lower case for black.
It is important to include information about all the mentioned positions.
Describe each position in a new line.
Follow format: <piece><position> (piece first, then position, no spaces)
Return nothing but lines with positions.
""",
                        },
                    ],
                }
            ],
        )
        board_pos = response.output_text

        # Parse "<piece><square>" lines, e.g. "Ke1" or "pe5".
        # BUG FIX: the original loop iterated `post_str` but stripped/indexed
        # `pos_str` (and vice versa), raising NameError on the first line.
        pos_dict = {}
        for line in board_pos.splitlines():
            pos_str = line.strip()
            if len(pos_str) != 3:
                # Skip anything that is not exactly piece + file + rank.
                continue
            piece = pos_str[0]
            pos = pos_str[1:3]
            pos_dict[pos] = piece

        # Assemble the board field: ranks 8 -> 1 separated by '/', files a -> h,
        # with run-length encoded empty squares, as FEN requires.
        board_fen = ''
        for rank in range(8, 0, -1):
            empty = 0
            for file_c in range(ord('a'), ord('h') + 1):
                square = chr(file_c) + str(rank)
                if square in pos_dict:
                    if empty > 0:
                        board_fen += str(empty)
                        empty = 0
                    board_fen += pos_dict[square]
                else:
                    empty += 1
            if empty > 0:
                board_fen += str(empty)
            if rank != 1:
                board_fen += '/'

        return board_fen
tools/classifier_tool.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from smolagents import Tool
2
+ from openai import OpenAI
3
+
4
class ClassifierTool(Tool):
    """Classify a comma-separated list of items into categories using an OpenAI model."""

    # smolagents tool metadata; these strings form the agent-facing schema.
    name = 'open_classifier'
    description = """Classifies given items into given categories from perspective of specific knowledge area."""
    inputs = {
        'knowledge_area': {
            'type': 'string',
            'description': 'The knowledge area that should be used for classification',
        },
        'environment': { # context makes models too verbose
            'type': 'string',
            'description': 'Couple words that describe the environment or location in which items should be classified in case of plural meaning or if only part of item relevant for classification.'
        },
        'categories': {
            'type': 'string',
            'description': 'Comma separated list of categories to distribute objects.',
        },
        'items': {
            'type': 'string',
            'description': 'Comma separated list of items to be classified. Please include adjectives if available.',
        },
    }
    output_type = 'string'

    def __init__(
        self,
        client: OpenAI | None = None,
        model_id: str = 'gpt-4.1-mini',
        **kwargs,
    ):
        """Create the tool with an optional injected OpenAI client and model id."""
        # Reuse a caller-provided client (useful for testing); otherwise build
        # one from the default environment configuration.
        self.client = client or OpenAI()
        self.model_id = model_id

        super().__init__(**kwargs)

    def forward(
        self, knowledge_area: str, environment: str, categories: str, items: str
    ) -> str:
        """Run one classification request and return the model's raw text answer."""
        response = self.client.responses.create(
            model=self.model_id,
            input=[
                {
                    'role': 'user',
                    'content': [
                        {
                            'type': 'input_text',
                            # Note: the 'environment' input is passed to the
                            # prompt under the name 'context'.
                            'text': self._prompt(
                                knowledge_area=knowledge_area,
                                context=environment,
                                categories=categories,
                                items=items,
                            ),
                        },
                    ],
                },
            ],
        )
        answer = response.output_text
        return answer

    def _prompt(
        self, knowledge_area: str, context: str, categories: str, items: str
    ) -> str:
        """Build the classification prompt; the text is part of observable behavior."""
        return f"""\
You are {knowledge_area} classifier located in {context} context.
I will provide you a list of items and a list of categories and context in which items should be considered.
Your task is to classify the items into the categories.
Use context to determine the meaning of the items and decide if you need to classify entire item or only part of it.
Do not miss any item and do not add any item to the list of categories.
Use highest probability category for each item.
You can add category "Other" if you are not sure about the classification.
Use only considerations from the {knowledge_area} perspective.
Explain your reasoning from {knowledge_area} perspective in {context} context and then provide final answer.
Important: Do not allow {context} influence your judgment for classification.
ITEMS: {items}
CATEGORIES: {categories}
Now provide your reasoning and finalize it with the classification in the following format:
Category 1: items list
Category 2: items list
Other (if needed): items list
"""
tools/content_retriever_tool.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from smolagents import Tool
2
+ from docling.document_converter import DocumentConverter
3
+ from docling.chunking import HierarchicalChunker
4
+ from sentence_transformers import SentenceTransformer, util
5
+ import torch
6
+
7
class ContentRetrieverTool(Tool):
    """Retrieve a document/webpage, chunk it, and return the chunks most relevant to a query."""

    name = 'retrieve_content'
    description = """Retrieve content of a webpage or document in markdown format. Supports PDF, DOCX, XLSX, HTML, images, and more."""
    inputs = {
        "url": {
            "type": "string",
            "description": "The URL or local path of the webpage or document to retrieve.",
        },
        "query": {
            "type": "string",
            "description": 'The subject on the page you are looking for. The shorter, the more relevant content is returned.',
        },
    }
    output_type = "string"

    def __init__(
        self,
        model_name: str | None = None,
        threshold: float = 0.2,
        **kwargs,
    ):
        """
        Args:
            model_name: SentenceTransformer model id; defaults to 'all-MiniLM-L6-v2'.
            threshold: Cumulative softmax-probability mass to accumulate per
                query/embedding pair before stopping chunk selection.
        """
        self.threshold = threshold
        self._document_converter = DocumentConverter()
        self._model = SentenceTransformer(
            model_name if model_name is not None else 'all-MiniLM-L6-v2'
        )
        self._chunker = HierarchicalChunker()

        super().__init__(**kwargs)

    def forward(self, url: str, query: str) -> str:
        """Convert the document at `url`, rank chunks against `query`, and join the selected chunks."""
        document = self._document_converter.convert(url).document

        chunks = list(self._chunker.chunk(dl_doc=document))
        if len(chunks) == 0:
            return 'No content found.'

        # Plain chunk text vs. chunk text with headings/context; the "context"
        # variant minus the chunk text leaves just the surrounding headings.
        chunks_text = [chunk.text for chunk in chunks]
        chunks_with_context = [self._chunker.contextualize(chunk) for chunk in chunks]
        chunks_context = [
            chunks_with_context[i].replace(chunks_text[i], "").strip()
            for i in range(len(chunks))
        ]
        chunk_embeddings = self._model.encode(chunks_text, convert_to_tensor=True)
        context_embeddings = self._model.encode(chunks_context, convert_to_tensor=True)
        # The query is treated as a comma-separated list of search terms,
        # each embedded separately.
        query_embedding = self._model.encode(
            [term.strip() for term in query.split(",") if term.strip()],
            convert_to_tensor=True,
        )

        selected_indices = [] # aggregate indexes across chunks and context matches and for all queries
        for embeddings in [
            context_embeddings,
            chunk_embeddings,
        ]:
            # Compute cosine similarities (returns 1D tensor)
            for cos_scores in util.pytorch_cos_sim(query_embedding, embeddings):
                # Convert to softmax probabilities
                probabilities = torch.nn.functional.softmax(cos_scores, dim=0)
                # Sort by probability descending
                sorted_indices = torch.argsort(probabilities, descending=True)
                # Accumulate until total probability reaches threshold

                cumulative = 0.0
                for i in sorted_indices:
                    cumulative += probabilities[i].item()
                    selected_indices.append(i.item())
                    if cumulative >= self.threshold:
                        break

        selected_indices = list(
            dict.fromkeys(selected_indices)
        )  # remove duplicates and preserve order
        selected_indices = selected_indices[::-1]  # make most relevant items last for better focus

        if len(selected_indices) == 0:
            return "No content found."
        return "\n\n".join([chunks_with_context[idx] for idx in selected_indices])
85
+
tools/get_attachments_tool.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from smolagents import Tool
2
+ import requests
3
+ from urllib.parse import urljoin
4
+ import base64
5
+ import tempfile
6
+
7
class GetAttachmentTool(Tool):
    """Fetch the file attached to the current evaluation task in one of several formats."""

    name = "get_attachment"
    # BUG FIX: the tool description was a copy-paste of the 'fmt' parameter
    # description; it now describes the tool itself.
    description = """
    Retrieve the attachment file for the current task. The attachment can be returned as a URL, a base64 encoded data URL, a local file path to a downloaded copy, or the raw text content, depending on the requested format.
    """
    inputs = {
        "fmt": {
            "type": "string",
            "description": """Format to retrieve attachment. Options are: URL, DATA_URL, LOCAL_FILE_PATH (preferred for current testing environment), TEXT. URL returns the URL of the file, DATA_URL returns a base64 encoded data URL, LOCAL_FILE_PATH returns a local file path to the downloaded file, and TEXT returns the content of the file as text.""",
            "nullable": True,
            "default": "URL",
        }
    }
    output_type = "string"

    def __init__(
        self,
        agent_evaluation_api: str | None = None,
        task_id: str | None = None,
        **kwargs,
    ):
        # Default to Hugging Face GAIA testing space
        self.agent_evaluation_api = (
            agent_evaluation_api
            if agent_evaluation_api is not None
            else "https://agents-course-unit4-scoring.hf.space/"
        )
        self.task_id = task_id
        super().__init__(**kwargs)

    def attachment_for(self, task_id: str | None):
        """Point the tool at a (new) task; pass None to clear the association."""
        self.task_id = task_id

    def forward(self, fmt: str = "URL") -> str:
        """
        Retrieve the current task's attachment.

        Args:
            fmt: One of URL, DATA_URL, LOCAL_FILE_PATH, TEXT (case-insensitive).
        Returns:
            str: The attachment in the requested representation.
        Raises:
            ValueError: On an unsupported format, a 4xx response, or a TEXT
                request for a non-text MIME type.
        """
        # Normalize the format for comparison.
        fmt = fmt.upper()
        # BUG FIX: input validation used `assert`, which is stripped under
        # `python -O`; raise explicitly instead (and spell LOCAL_FILE_PATH
        # correctly in the message).
        if fmt not in ("URL", "DATA_URL", "LOCAL_FILE_PATH", "TEXT"):
            raise ValueError(
                f"Unsupported format: {fmt}. Supported formats are URL, DATA_URL, LOCAL_FILE_PATH, and TEXT."
            )

        if not self.task_id:
            return "No task_id provided to retrieve attachment."

        file_url = urljoin(self.agent_evaluation_api, f"files/{self.task_id}")
        if fmt == "URL":
            # No download needed; hand back the direct link.
            return file_url

        response = requests.get(
            file_url,
            headers={
                "Content-Type": "application/json",
                "Accept": "application/json",
            },
        )
        # 4xx means the task has no file (or a bad id) — surface a clear error.
        if 400 <= response.status_code < 500:
            raise ValueError(f"Error fetching file: {response.status_code} {response.reason}")

        response.raise_for_status()
        mime = response.headers.get("content-type", "text/plain")
        if fmt == "TEXT":
            if mime.startswith("text/"):
                return response.text
            raise ValueError(f"Content of file type {mime} cannot be retrieved as TEXT")
        if fmt == "DATA_URL":
            # BUG FIX: the original nested double quotes inside a double-quoted
            # f-string, which is a SyntaxError on Python < 3.12.
            encoded = base64.b64encode(response.content).decode('utf-8')
            return f"data:{mime};base64,{encoded}"
        # fmt == "LOCAL_FILE_PATH": persist to a temp file the caller can read.
        with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
            tmp_file.write(response.content)
        return tmp_file.name
tools/google_search_tools.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from smolagents import Tool
2
+ from googleapiclient.discovery import build
3
+ import os
4
class GoogleSearchTool(Tool):
    """Google Programmable Search wrapper that returns the top results as text."""

    name = "web_search"
    description = """Performs a google web search for a query then returns top search results in markdown format."""

    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform a web search for"
        }
    }
    output_type = "string"

    # Subclasses extend forward()'s signature (extra args are routed to
    # _collect_params), so smolagents' signature validation must be skipped.
    skip_forward_signature_validation = True

    def __init__(
        self,
        api_key: str | None = None,
        search_engine_id: str | None = None,
        num_results: int = 10,
        **kwargs,
    ):
        """
        Args:
            api_key: Google API key; falls back to the GOOGLE_API_KEY env var.
            search_engine_id: Custom Search engine id; falls back to
                GOOGLE_SEARCH_ENGINE_ID.
            num_results: Maximum number of results to return.
        Raises:
            ValueError: If no API key or search engine id can be resolved.
        """
        from dotenv import load_dotenv
        load_dotenv()
        # BUG FIX: explicitly passed constructor arguments were ignored and
        # unconditionally overwritten from the environment; the environment is
        # now only a fallback.
        api_key = api_key or os.getenv("GOOGLE_API_KEY")
        search_engine_id = search_engine_id or os.getenv("GOOGLE_SEARCH_ENGINE_ID")
        if not api_key:
            raise ValueError("GOOGLE_API_KEY is not set")
        if not search_engine_id:
            raise ValueError("GOOGLE_SEARCH_ENGINE_ID is not set")

        self.cse = build(
            "customsearch",
            "v1",
            developerKey=api_key
        ).cse()
        self.cx = search_engine_id
        self.num = num_results
        super().__init__(**kwargs)

    def _collect_params(self) -> dict:
        """Extra Custom Search parameters; subclasses override to add filters."""
        return {}

    def forward(self, query: str, *args, **kwargs) -> str:
        """Execute the search and format title/link/snippet per result."""
        params = {
            "q": query,
            "cx": self.cx,
            # Restrict the response payload to the fields we render.
            "fields": "items(link, title, snippet)",
            "num": self.num,
        }

        # Merge in subclass-specific parameters (e.g. siteSearch).
        params = params | self._collect_params(*args, **kwargs)
        res = self.cse.list(**params).execute()
        if "items" not in res:
            return "No results found"

        return "\n\n".join(f"{item['title']}\n{item['link']}\n{item['snippet']}" for item in res["items"])
61
+ class GoogleSiteSearchTool(GoogleSearchTool):
62
+ name = "site_search"
63
+ description = """Searches a specific website for a given query and returns the site contents in markdown format. Use when information is likely to be found on a particular domain, such as reddit.com, wikipedia.org, ieee.org, or arxiv.org."""
64
+ inputs = {
65
+ "query": {
66
+ "type": "string",
67
+ "description": "The query to perform search."
68
+ },
69
+ "site": {
70
+ "type": "string",
71
+ "description": "The domain of the site on which to search",
72
+ },
73
+ }
74
+
75
+ def _collect_params(self, site: str) -> dict:
76
+ return {
77
+ "siteSearch": site,
78
+ "siteSearchFilter": "i",
79
+ }
tools/speech_recognition_tool.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from smolagents import Tool
2
+ import torch
3
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, logging
4
+ import warnings
5
+
6
class SpeechRecognitionTool(Tool):
    """Transcribe speech from an audio file using Whisper (large-v3-turbo)."""

    name = 'speech_to_text'
    description = '''Transcribes speech from audio'''

    inputs = {
        'audio': {
            'type': 'string',
            'description': 'Path to the audio file to transcribe.',
        },
        'with_time_markers': {
            'type': 'boolean',
            'description': 'Whether to include timestamps in the transcription output. Each timestamp appears on its own line in the format [float, float], indicating the number of seconds elapsed from the start of the audio.',
            'nullable': True,
            'default': False,
        },
    }
    output_type = 'string'

    # Whisper processes audio in windows of this many seconds; also used by
    # _normalize_chunks to unwrap per-window timestamps into absolute ones.
    chunk_length_s = 30

    def __new__(cls, *args, **kwargs):
        # Model setup happens in __new__ so the pipeline exists before
        # __init__ runs. NOTE: the pipeline is stored on the class, so each
        # instantiation reloads and replaces the shared pipeline.
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        model_id = 'openai/whisper-large-v3-turbo'
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        )
        model.to(device)
        processor = AutoProcessor.from_pretrained(model_id)

        # Silence transformers' deprecation chatter for the pipeline call.
        logging.set_verbosity_error()
        warnings.filterwarnings(
            'ignore',
            category=FutureWarning,
            message=r'.*The input name "inputs" is deprecated.*',
        )
        cls.pipe = pipeline(
            'automatic-speech-recognition',
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype,
            device=device,
            chunk_length_s=cls.chunk_length_s,
            return_timestamps=True,
        )

        # BUG FIX: the original forwarded *args/**kwargs to object.__new__,
        # which raises TypeError for any constructor argument when __new__ is
        # overridden. Python passes the original arguments to __init__ anyway.
        return super().__new__(cls)

    def forward(self, audio: str, with_time_markers: bool = False) -> str:
        '''
        Transcribes speech from audio.

        Args:
            audio (str): Path to the audio file to transcribe.
            with_time_markers (bool): Whether to include timestamps in the transcription output. Each timestamp appears on its own line in the format [float], indicating the number of seconds elapsed from the start of the audio
        Returns:
            str: The transcribed text.
        '''
        result = self.pipe(audio)
        if not with_time_markers:
            return result['text'].strip()

        # Interleave [start] / [end] second markers around each chunk's text.
        txt = ""
        for chunk in self._normalize_chunks(result["chunks"]):
            txt += f"[{chunk['start']:0.2f}]\n{chunk['text']}\n[{chunk['end']:0.2f}]\n"
        return txt.strip()

    def _normalize_chunks(self, chunks):
        """Convert per-window Whisper timestamps into absolute seconds.

        Whisper timestamps reset every `chunk_length_s` window; a timestamp
        smaller than its predecessor signals a wrap, so a window-sized offset
        is added back.
        """
        chunk_length_s = self.chunk_length_s
        absolute_offset = 0.0
        chunk_offset = 0.0
        normalized = []

        for chunk in chunks:
            timestamp_start = chunk['timestamp'][0]
            timestamp_end = chunk['timestamp'][1]
            # Start went backwards -> we crossed into the next window.
            if timestamp_start < chunk_offset:
                absolute_offset += chunk_length_s
            chunk_offset = timestamp_start
            absolute_start = absolute_offset + timestamp_start

            # End can also wrap within a single chunk.
            if timestamp_end < timestamp_start:
                absolute_offset += chunk_length_s
            absolute_end = absolute_offset + timestamp_end
            chunk_offset = timestamp_end

            chunk_text = chunk['text'].strip()
            if chunk_text:
                normalized.append(
                    {
                        'start': absolute_start,
                        'end': absolute_end,
                        'text': chunk_text,
                    }
                )
        return normalized
106
+
107
+ # TEST THE SCRIPT (UNCOMMENT LINES BELOW)
108
+ # speech_to_text = SpeechRecognitionTool()
109
+
110
+ # transcription = speech_to_text(
111
+ # audio="https://agents-course-unit4-scoring.hf.space/files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
112
+ # with_time_markers=True,
113
+ # )
114
+
115
+ # print(transcription)
tools/youtube_video_tool.py ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from smolagents import Tool
2
+ from openai import OpenAI
3
+ from tools.speech_recognition_tool import SpeechRecognitionTool
4
+ from io import BytesIO
5
+ import yt_dlp
6
+ import av
7
+ import torchaudio
8
+ import subprocess
9
+ import requests
10
+ import base64
11
+ import tempfile
12
+ import re
13
+ import os
14
+
15
class YouTubeVideoTool(Tool):
    """Answer a question about a YouTube video by splitting it into chunks of
    captions/frames and iteratively refining an answer with an OpenAI model."""

    name = 'youtube_video'
    description = '''Process the video and return the requested information from it.'''
    inputs = {
        'url': {
            'type': 'string',
            'description': 'The URL of the YouTube video.',
        },
        'query': {
            'type': 'string',
            'description': 'The question to answer',
        },
    }
    output_type = 'string'

    def __init__(
        self,
        video_quality: int = 360,
        frames_interval: int | float | None = 2,
        chunk_duration: int | float | None = 2,
        speech_recognition_tool: SpeechRecognitionTool | None = None,
        client: OpenAI | None = None,
        model_id: str = 'gpt-4.1-mini',
        debug: bool = False,
        **kwargs,
    ):
        """
        Args:
            video_quality: Target stream height in pixels used to pick a format.
            frames_interval: Seconds between captured frames (None = every frame).
            chunk_duration: Seconds of video per processed chunk (None = whole video).
            speech_recognition_tool: Optional transcriber used as fallback when
                the video has no captions.
            client: OpenAI client; one is created from the environment if omitted.
            model_id: Model used to answer the query.
            debug: When True, print each chunk's prompt and answer.
        """
        self.video_quality = video_quality
        self.speech_recognition_tool = speech_recognition_tool
        self.frames_interval = frames_interval
        self.chunk_duration = chunk_duration

        self.client = client or OpenAI()
        self.model_id = model_id

        self.debug = debug

        super().__init__(**kwargs)

    def forward(self, url: str, query: str):
        '''
        Process the video and return the requested information.
        Args:
            url(str): The URL of the YouTube video.
            query(str): The question to answer.
        Returns:
            str: Answer to the query
        '''
        answer = ''
        for chunk in self._split_video_into_chunks(url):
            prompt = self._prompt(
                chunk,
                query,
                answer,
            )
            # BUG FIX: the model id was hard-coded to 'gpt-4.1-mini' here,
            # silently ignoring the configured self.model_id.
            response = self.client.responses.create(
                model=self.model_id,
                input=[
                    {
                        'role': 'user',
                        'content': [
                            {
                                'type': 'input_text',
                                'text': prompt,
                            },
                        ],
                    },
                ],
            )
            answer = response.output_text
            if self.debug:
                print(
                    f"CHUNK {chunk['start']} - {chunk['end']}:\n\n{prompt}\n\nANSWER:\n{answer}"
                )
            # The sentinel means "no evidence in this chunk yet" — reset so the
            # next chunk starts from a clean slate.
            if answer.strip() == 'I need to keep watching.':
                answer = ''
        return answer

    def _prompt(self, chunk, query, aggregated_answer):
        """Build the per-chunk prompt, threading through the answer so far."""
        prompt = [
            f"""\
These are some frames of a YouTube video.
I will ask a question about the entire video, but you'll only see part of it at a time.
Aggregate answer about the entire video, use information about previous parts but do not reference the previous parts in the answer directly.

Ground your answer based on video title, description, captions, video frames or answer from previous parts.
If no evidences presented just say "I need to keep watching".

VIDEO TITLE:
{chunk["title"]}

VIDEO DESCRIPTION:
{chunk["description"]}

FRAMES SUBTITLES:
{chunk["captions"]}"""
        ]

        if aggregated_answer:
            prompt.append(f"""\
Here is the answer to the same question based on the previous video parts:

BASED ON PREVIOUS PARTS:
{aggregated_answer}""")
        prompt.append(f"""\
QUESTION:
{query}""")

        return "\n\n".join(prompt)

    def _split_video_into_chunks(
        self, url: str, with_captions: bool = True, with_frames: bool = True
    ):
        """Yield consecutive chunk dicts covering the whole video duration."""
        video = self._process_video(
            url, with_captions=with_captions, with_frames=with_frames
        )
        video_duration = video['duration']
        # chunk_duration=None means "one chunk for the entire video".
        chunk_duration = self.chunk_duration or video_duration

        chunk_start = 0.0
        while chunk_start < video_duration:
            chunk_end = min(chunk_start + chunk_duration, video_duration)
            yield self._get_video_chunk(video, chunk_start, chunk_end)
            chunk_start += chunk_duration

    def _get_video_chunk(self, video, start, end):
        """Slice captions and frames that overlap the [start, end] window."""
        chunk_captions = [
            c for c in video['captions'] if c['start'] <= end and c['end'] >= start
        ]
        chunk_frames = [
            f
            for f in video['frames']
            if f['timestamp'] >= start and f['timestamp'] <= end
        ]

        return {
            'title': video['title'],
            'description': video['description'],
            'start': start,
            'end': end,
            'captions': '\n'.join([c['text'] for c in chunk_captions]),
            'frames': chunk_frames,
        }

    def _process_video(
        self, url: str, with_captions: bool = True, with_frames: bool = True
    ):
        """Download video metadata, captions (with ASR fallback) and frames."""
        lang = 'en'
        info = self._get_video_info(url, lang)

        if with_captions:
            captions = self._extract_captions(
                lang, info.get('subtitles', {}), info.get('automatic_captions', {})
            )
            if not captions and self.speech_recognition_tool:
                # No published captions: transcribe the audio track ourselves.
                audio_url = self._select_audio_format(info['formats'])
                audio = self._capture_audio(audio_url)
                with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
                    tmp.write(audio.read())
                    tmp.flush()
                    tmp_path = tmp.name  # Save the path before closing

                # Now the file is closed, safe to use
                try:
                    transcription = self.speech_recognition_tool(audio=tmp_path, with_time_markers=True)
                    captions = []
                    # Parse the "[start]\ntext\n[end]" markers produced by
                    # SpeechRecognitionTool back into structured captions.
                    pattern = re.compile(r'\[(\d+\.\d+)\]\n(.+?)\n\[(\d+\.\d+)\]', re.DOTALL)
                    for match in pattern.finditer(transcription):
                        start, text, end = match.groups()
                        captions.append({
                            'start': float(start),
                            'end': float(end),
                            'text': text.strip(),
                        })
                finally:
                    os.remove(tmp_path)  # Clean up the temp file
        else:
            captions = []

        if with_frames:
            # BUG FIX: the stream height was hard-coded to 360, ignoring the
            # configured self.video_quality.
            video_url = self._select_video_format(info['formats'], self.video_quality)['url']
            frames = self._capture_video_frames(video_url, self.frames_interval)
        else:
            frames = []

        return {
            'id': info['id'],
            'title': info['title'],
            'description': info['description'],
            'duration': info['duration'],
            'captions': captions,
            'frames': frames,
        }

    def _get_video_info(self, url: str, lang: str):
        """Fetch yt-dlp metadata (formats, subtitles) without downloading media."""
        ydl_opts = {
            'quiet': True,
            'skip_download': True,
            'format': 'bestvideo[ext=mp4][height<=360]+bestaudio[ext=m4a]/best[height<=360]',
            "forceurl": True,
            "noplaylist": True,
            "writesubtitles": True,
            "writeautomaticsub": True,
            "subtitlesformat": "vtt",
            "subtitleslangs": [lang],
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)

        return info

    def _extract_captions(self, lang, subtitles, auto_captions):
        """Download and parse SRT or VTT captions into {start, end, text} dicts."""
        # Prefer author-provided subtitles over automatic captions.
        caption_tracks = subtitles.get(lang) or auto_captions.get(lang) or []

        structured_captions = []

        srt_track = next(
            (track for track in caption_tracks if track['ext'] == 'srt'), None
        )
        vtt_track = next(
            (track for track in caption_tracks if track['ext'] == 'vtt'), None
        )

        if srt_track:
            import pysrt
            response = requests.get(srt_track['url'])
            response.raise_for_status()
            srt_data = response.content.decode('utf-8')

            def to_sec(t):
                return (
                    t.hours * 3600 + t.minutes * 60 + t.seconds + t.milliseconds / 1000
                )

            structured_captions = [
                {
                    "start": to_sec(sub.start),
                    "end": to_sec(sub.end),
                    "text": sub.text.strip(),
                }
                for sub in pysrt.from_string(srt_data)
            ]
        if vtt_track:
            import webvtt
            from io import StringIO

            response = requests.get(vtt_track['url'])
            response.raise_for_status()
            vtt_data = response.text

            vtt_file = StringIO(vtt_data)

            def to_sec(t):
                """Convert 'HH:MM:SS.mmm' to float seconds"""
                h, m, s = t.split(":")
                s, ms = s.split(".")
                return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000

            for caption in webvtt.read_buffer(vtt_file):
                structured_captions.append(
                    {
                        'start': to_sec(caption.start),
                        "end": to_sec(caption.end),
                        "text": caption.text.strip(),
                    }
                )
        return structured_captions

    def _select_video_format(self, formats, video_quality):
        """Pick the first video stream whose height matches `video_quality`.

        Raises StopIteration if no such format exists.
        """
        video_format = next(
            f
            for f in formats
            if f.get('vcodec') != 'none' and f.get('height') == video_quality
        )
        return video_format

    def _capture_video_frames(self, video_url, capture_interval_sec=None):
        """Remux the stream locally with ffmpeg and decode frames with PyAV.

        Returns a list of {'timestamp': float, 'image': PIL.Image} dicts,
        sampled every `capture_interval_sec` seconds (every frame when None).
        """
        with tempfile.NamedTemporaryFile(suffix='.mkv', delete=False) as tmp:
            tmp_path = tmp.name

        try:
            ffmpeg_cmd = [
                "ffmpeg",
                "-y",  # Overwrite output file if needed
                "-i", video_url,
                "-f", "matroska",
                tmp_path,
            ]
            result = subprocess.run(ffmpeg_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            if result.returncode != 0 or not os.path.exists(tmp_path) or os.path.getsize(tmp_path) == 0:
                raise RuntimeError(f"ffmpeg failed to create a valid video file for {video_url}")

            container = av.open(tmp_path)
            stream = container.streams.video[0]
            time_base = stream.time_base

            frames = []
            next_capture_time = 0
            for frame in container.decode(stream):
                if frame.pts is None:
                    continue
                timestamp = float(frame.pts * time_base)
                if capture_interval_sec is None or timestamp >= next_capture_time:
                    frames.append(
                        {
                            'timestamp': timestamp,
                            'image': frame.to_image(),  # PIL image
                        }
                    )
                    if capture_interval_sec is not None:
                        next_capture_time += capture_interval_sec
            container.close()
            return frames
        finally:
            os.remove(tmp_path)

    def _base64_frames(self, frames):
        """Encode captured frame images as base64 JPEG strings."""
        base64_frames = []
        for f in frames:
            buffered = BytesIO()
            f['image'].save(buffered, format='JPEG')
            encoded = base64.b64encode(buffered.getvalue()).decode('utf-8')
            base64_frames.append(encoded)
        return base64_frames

    def _select_audio_format(self, formats):
        """Return the URL of the best audio-only stream (m4a > webm, highest abr)."""
        audio_formats = [
            f
            for f in formats
            if f.get('vcodec') == 'none'
            and f.get('acodec')
            and f.get('acodec') != 'none'
        ]

        if not audio_formats:
            raise ValueError('No valid audio-only formats found')

        # Prefer m4a > webm, highest abr first
        preferred_exts = ['m4a', 'webm']

        def sort_key(f):
            ext_score = (
                preferred_exts.index(f['ext']) if f['ext'] in preferred_exts else 99
            )
            abr = f.get('abr') or 0
            return (ext_score, -abr)

        audio_formats.sort(key=sort_key)
        return audio_formats[0]['url']

    def _capture_audio(self, audio_url) -> BytesIO:
        """Download/transcode the audio stream to 16kHz mono PCM WAV in memory."""
        ffmpeg_audio_cmd = [
            "ffmpeg",
            "-i",
            audio_url,
            "-f",
            "wav",
            "-acodec",
            "pcm_s16le",  # Whisper prefers PCM
            "-ac",
            "1",  # Mono
            "-ar",
            "16000",  # 16kHz for Whisper
            "-",
        ]

        result = subprocess.run(
            ffmpeg_audio_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        if result.returncode != 0:
            raise RuntimeError('ffmpeg failed:\n' + result.stderr.decode())

        audio_buffer = BytesIO(result.stdout)
        audio_buffer.seek(0)
        return audio_buffer
+ return audio_buffer