# ccgalindog's picture
# Upload 14 files
# c741672 verified
import tempfile
import os
from typing import List, Tuple
import cv2
import yt_dlp
from PIL import Image
from smolagents import tool
from src.utils.utils import image_to_base64
from smolagents.models import OpenAIServerModel
@tool
def download_video(url: str) -> str:
    """
    Tool to download a video from a given URL using yt-dlp.

    The video is saved as ``video.mp4`` inside a newly created temporary
    directory. The directory is not removed by this tool.

    Args:
        url (str): The URL of the video to download.

    Returns:
        str: The path to the downloaded video file within the temp local folder.
    """
    # A fresh directory per call keeps concurrent downloads from overwriting
    # each other's ``video.mp4``.
    temp_dir = tempfile.mkdtemp()
    output_path = os.path.join(temp_dir, 'video.mp4')

    ydl_opts = {
        # Prefer separate mp4 video + m4a audio streams, else any mp4.
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/mp4',
        'outtmpl': output_path,
        'quiet': True,
        'merge_output_format': 'mp4',
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    return output_path
@tool
def extract_frames(video_path: str, frame_interval: int = 5) -> List[str]:
    """Tool to extract frames from a video at a specified interval.

    One frame is saved as a JPEG in a new temporary file for every
    ``frame_interval`` seconds of video. The temporary files are not removed
    by this tool. If the video cannot be opened, an empty list is returned.

    Args:
        video_path (str): The path to the video file in local storage.
        frame_interval (int): The interval at which to extract frames (in seconds).

    Returns:
        list: A list of paths to the extracted frames.

    Raises:
        ValueError: If ``frame_interval`` is not positive, or if the video
            does not report a usable frame rate.
    """
    if frame_interval <= 0:
        raise ValueError("frame_interval must be a positive number of seconds")

    images = []
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return images

        fps = cap.get(cv2.CAP_PROP_FPS)
        # ``not fps > 0`` also rejects NaN, which some backends may report.
        if not fps > 0:
            raise ValueError(
                f"Could not determine a valid frame rate for {video_path!r}"
            )

        # Number of frames between two saved frames; never below 1 so the
        # modulo below cannot divide by zero.
        step = max(1, int(fps * frame_interval))

        # 1-based count of frames read so far. This is intended to match the
        # position the capture reports right after a read, without relying
        # on the backend's position property.
        frames_read = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frames_read += 1
            if frames_read % step != 0:
                continue

            # mkstemp creates the file atomically (mktemp is deprecated and
            # race-prone); close the descriptor so OpenCV can write to it.
            fd, img_path = tempfile.mkstemp(suffix=".jpg")
            os.close(fd)
            if cv2.imwrite(img_path, frame):
                images.append(img_path)
            else:
                # Do not hand back a path to an empty/unwritten file.
                os.remove(img_path)
    finally:
        # Release the capture even if reading or writing raised.
        cap.release()

    return images
@tool
def analyze_frame_with_vision_model(img_path: str, query_to_image: str) -> str:
    """
    Tool to analyze a frame using a vision model. This returns a text that
    describes the analysis result of the image.

    Args:
        img_path (str): The path to the image file.
        query_to_image (str): The question to ask about the image.

    Returns:
        str: The text description of the analysis result of the image. This
            result is always text, you need to analyze it to extract the answer
            to the question.
    """
    sys_prompt = (
        "You will receive a question about an image. Answer it very briefly, "
        "return only the answer, and nothing else. If the question is related "
        "to count something within the image, then you have to respond with "
        "just a number."
    )

    model_id = "gpt-4o"
    vision_model = OpenAIServerModel(model_id=model_id, temperature=0.0)

    # Open inside a context manager so the file handle is closed; the image
    # is downscaled to 200x200 before being base64-encoded for the request.
    with Image.open(img_path) as img:
        image_inb64 = image_to_base64(img.resize((200, 200)))

    messages_i = [
        # The system prompt must be part of the conversation for the
        # "answer briefly / just a number" instruction to take effect.
        {
            "role": "system",
            "content": [
                {"type": "text", "text": sys_prompt},
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": query_to_image},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{image_inb64}",
                    },
                },
            ],
        },
    ]

    response = vision_model.generate(messages_i)
    return response.content