import os
import tempfile
from typing import List, Tuple

import cv2
import yt_dlp
from PIL import Image
from smolagents import tool
from smolagents.models import OpenAIServerModel

from src.utils.utils import image_to_base64


@tool
def download_video(url: str) -> str:
    """
    Tool to download a video from a given URL using yt-dlp.

    Args:
        url (str): The URL of the video to download.

    Returns:
        str: The path to the downloaded video file within the temp local folder.
    """
    temp_dir = tempfile.mkdtemp()
    output_path = os.path.join(temp_dir, 'video.mp4')
    ydl_opts = {
        # Prefer separate mp4 video + m4a audio streams, falling back to a
        # pre-merged mp4; ffmpeg merges them into a single mp4 container.
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/mp4',
        'outtmpl': output_path,
        'quiet': True,
        'merge_output_format': 'mp4',
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return output_path


@tool
def extract_frames(video_path: str, frame_interval: int = 5) -> List[str]:
    """Tool to extract frames from a video at a specified interval.

    Args:
        video_path (str): The path to the video file in local storage.
        frame_interval (int): The interval at which to extract frames (in seconds).

    Returns:
        list: A list of paths to the extracted frames. Empty if the video
            cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    # Guard against fps == 0 (OpenCV reports 0 for some corrupt/unreadable
    # streams), which would otherwise raise ZeroDivisionError in the modulo.
    step = max(int(fps * frame_interval), 1)
    images: List[str] = []
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            # POS_FRAMES is the index of the *next* frame, so this samples
            # one frame every `step` decoded frames (~every frame_interval s).
            if int(cap.get(cv2.CAP_PROP_POS_FRAMES)) % step == 0:
                # NamedTemporaryFile(delete=False) replaces the deprecated,
                # race-prone tempfile.mktemp; the file persists for the caller.
                with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
                    img_path = tmp.name
                cv2.imwrite(img_path, frame)
                images.append(img_path)
    finally:
        # Release the capture even if decoding/writing raises.
        cap.release()
    return images


@tool
def analyze_frame_with_vision_model(img_path: str, query_to_image: str) -> str:
    """
    Tool to analyze a frame using a vision model. This returns a text that
    describes the analysis result of the image.

    Args:
        img_path (str): The path to the image file.
        query_to_image (str): The question to ask about the image.

    Returns:
        str: The text description of the analysis result of the image. This result
            is always text, you need to analyze it to extract the answer to the
            question.
    """
    sys_prompt = """You will receive a question about an image. Answer it very briefly, return only the answer, and nothing else. If the question is related to count something within the image, then you have to respond with just a number. """
    model_id = "gpt-4o"
    vision_model = OpenAIServerModel(model_id=model_id, temperature=0.0)
    # NOTE(review): the fixed 200x200 resize distorts aspect ratio and loses
    # detail — TODO confirm this downscaling is intended.
    image_inb64 = image_to_base64(Image.open(img_path).resize((200, 200)))
    messages_i = [
        # Bug fix: sys_prompt was previously built but never sent to the model.
        {
            "role": "system",
            "content": [{"type": "text", "text": sys_prompt}],
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": query_to_image},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{image_inb64}",
                    },
                },
            ],
        },
    ]
    response = vision_model.generate(messages_i)
    return response.content