| | import tempfile |
| | import os |
| | from typing import List, Tuple |
| | import cv2 |
| | import yt_dlp |
| | from PIL import Image |
| | from smolagents import tool |
| | from src.utils.utils import image_to_base64 |
| | from smolagents.models import OpenAIServerModel |
| |
|
| |
|
@tool
def download_video(url: str) -> str:
    """
    Tool to download a video from a given URL using yt-dlp.

    Args:
        url (str): The URL of the video to download.

    Returns:
        str: The path to the downloaded video file within the temp local folder.
    """
    # Fresh temp dir per call so concurrent downloads never collide.
    temp_dir = tempfile.mkdtemp()
    output_path = os.path.join(temp_dir, 'video.mp4')

    ydl_opts = {
        # Best mp4 video + m4a audio, falling back to a single mp4 stream.
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/mp4',
        'outtmpl': output_path,
        'quiet': True,
        # Force the merged container to mp4 so output_path's extension is truthful.
        'merge_output_format': 'mp4',
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    # NOTE: return annotation fixed from Tuple[str, str] — the function has
    # always returned a single path string, as the docstring states.
    return output_path
| |
|
| |
|
@tool
def extract_frames(video_path: str, frame_interval: int = 5) -> List[str]:
    """Tool to extract frames from a video at a specified interval.

    Args:
        video_path (str): The path to the video file in local storage.
        frame_interval (int): The interval at which to extract frames (in seconds).

    Returns:
        list: A list of paths to the extracted frames (JPEG files in the
        temp folder). Empty if the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        # Unreadable/missing file: return no frames rather than looping forever.
        return []

    fps = cap.get(cv2.CAP_PROP_FPS)
    # Guard against fps == 0 (metadata missing) which would make the
    # modulo step zero and raise ZeroDivisionError.
    step = max(1, int(fps * frame_interval)) if fps > 0 else 1

    images: List[str] = []
    frame_number = 0
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Sample every `step` frames, including the very first frame
            # (the original POS_FRAMES check skipped frame 0).
            if frame_number % step == 0:
                # mkstemp instead of the deprecated, race-prone mktemp;
                # close the fd since cv2.imwrite opens the path itself.
                fd, img_path = tempfile.mkstemp(suffix=".jpg")
                os.close(fd)
                cv2.imwrite(img_path, frame)
                images.append(img_path)

            frame_number += 1
    finally:
        # Always release the capture handle, even on mid-loop errors.
        cap.release()

    return images
| |
|
| |
|
@tool
def analyze_frame_with_vision_model(img_path: str, query_to_image: str) -> str:
    """
    Tool to analyze a frame using a vision model. This returns a text that
    describes the analysis result of the image.

    Args:
        img_path (str): The path to the image file.
        query_to_image (str): The question to ask about the image.

    Returns:
        str: The text description of the analysis result of the image. This
        result is always text, you need to analyze it to extract the answer
        to the question.
    """
    sys_prompt = """You will receive a question about an image. Answer it very briefly,
                    return only the answer, and nothing else. If the question
                    is related to count something within the image, then you
                    have to respond with just a number.
                """
    model_id = "gpt-4o"
    # temperature=0.0 for deterministic, terse answers.
    vision_model = OpenAIServerModel(model_id=model_id, temperature=0.0)

    # Downscale before encoding to keep the base64 payload small.
    # thumbnail() preserves aspect ratio (resize((200, 200)) distorted it).
    image = Image.open(img_path)
    image.thumbnail((200, 200))
    image_inb64 = image_to_base64(image)

    messages_i = [
        {
            # BUG FIX: sys_prompt was previously built but never sent to the
            # model; it is now passed as the system message.
            "role": "system",
            "content": [{"type": "text", "text": sys_prompt}],
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": query_to_image},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{image_inb64}",
                    },
                },
            ],
        },
    ]
    response = vision_model.generate(messages_i)

    return response.content
| |
|