| | import tempfile |
| | import os |
| | from typing import List, Tuple |
| | import cv2 |
| | import yt_dlp |
| | from PIL import Image |
| | from smolagents import tool |
| | from src.utils.utils import image_to_base64 |
| | from smolagents.models import OpenAIServerModel |
| |
|
| |
|
@tool
def download_video(url: str) -> str:
    """
    Tool to download a video from a given URL using yt-dlp.

    Args:
        url (str): The URL of the video to download.

    Returns:
        str: The path to the downloaded video file within the temp local folder.
    """
    # Fresh temp dir per call so concurrent downloads never collide.
    temp_dir = tempfile.mkdtemp()
    output_path = os.path.join(temp_dir, 'video.mp4')

    ydl_opts = {
        # Best mp4 video + m4a audio, falling back to a single mp4 stream.
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/mp4',
        'outtmpl': output_path,
        'quiet': True,
        # Force the merged container to mp4 so output_path's extension is truthful.
        'merge_output_format': 'mp4',
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    # NOTE: return annotation fixed from Tuple[str, str] — the function has
    # always returned a single path string, as the docstring states.
    return output_path
| |
|
| |
|
@tool
def extract_frames(video_path: str, frame_interval: int = 5) -> List[str]:
    """Tool to extract frames from a video at a specified interval.

    Args:
        video_path (str): The path to the video file in local storage.
        frame_interval (int): The interval at which to extract frames (in seconds).

    Returns:
        list: A list of paths to the extracted frames (JPEG files in the
        temp folder). Empty if the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        # Unreadable/missing file: return no frames rather than looping forever.
        return []

    fps = cap.get(cv2.CAP_PROP_FPS)
    # Guard against fps == 0 (metadata missing) which would make the
    # modulo step zero and raise ZeroDivisionError.
    step = max(1, int(fps * frame_interval)) if fps > 0 else 1

    images: List[str] = []
    frame_number = 0
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Sample every `step` frames, including the very first frame
            # (the original POS_FRAMES check skipped frame 0).
            if frame_number % step == 0:
                # mkstemp instead of the deprecated, race-prone mktemp;
                # close the fd since cv2.imwrite opens the path itself.
                fd, img_path = tempfile.mkstemp(suffix=".jpg")
                os.close(fd)
                cv2.imwrite(img_path, frame)
                images.append(img_path)

            frame_number += 1
    finally:
        # Always release the capture handle, even on mid-loop errors.
        cap.release()

    return images
| |
|
| |
|
@tool
def analyze_frame_with_vision_model(img_path: str, query_to_image: str) -> str:
    """
    Tool to analyze a frame using a vision model. This returns a text that
    describes the analysis result of the image.

    Args:
        img_path (str): The path to the image file.
        query_to_image (str): The question to ask about the image.

    Returns:
        str: The text description of the analysis result of the image. This
        result is always text, you need to analyze it to extract the answer
        to the question.
    """
    sys_prompt = """You will receive a question about an image. Answer it very briefly,
                    return only the answer, and nothing else. If the question
                    is related to count something within the image, then you
                    have to respond with just a number.
                """
    model_id = "gpt-4o"
    # temperature=0.0 for deterministic, terse answers.
    vision_model = OpenAIServerModel(model_id=model_id, temperature=0.0)

    # Downscale before encoding to keep the base64 payload small.
    # thumbnail() preserves aspect ratio (resize((200, 200)) distorted it).
    image = Image.open(img_path)
    image.thumbnail((200, 200))
    image_inb64 = image_to_base64(image)

    messages_i = [
        {
            # BUG FIX: sys_prompt was previously built but never sent to the
            # model; it is now passed as the system message.
            "role": "system",
            "content": [{"type": "text", "text": sys_prompt}],
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": query_to_image},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{image_inb64}",
                    },
                },
            ],
        },
    ]
    response = vision_model.generate(messages_i)

    return response.content
| |
|