File size: 2,247 Bytes
697ab39
5fbd0a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os
from smolagents import tool
from google import genai
from google.genai import types

# Shared Gemini client: built a single time at import and reused by the tools in this module.
# os.getenv returns None when GOOGLE_API_KEY is unset; how the client reacts to that is not shown here.
client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))

@tool
def analyze_video(video_source: str, question: str) -> str:
    """
    Analyzes a video (YouTube URL or local file path) to answer a specific question.
    Local files are uploaded first and the call waits until the upload has finished processing.
    If processing fails or takes too long, a RuntimeError is raised.

    Args:
        video_source: The YouTube URL or the local path to the video file.
        question: The question you want to ask about the video content.

    Returns:
        The model's text answer, or a short notice when the model produced no text.
    """
    # Function-scope import so the tool does not depend on a module-level `time` import.
    import time

    # 1. Handle YouTube vs Local
    if "youtube.com" in video_source or "youtu.be" in video_source:
        # YouTube links are handed to the model by reference; nothing is uploaded.
        video_part = types.Part(file_data=types.FileData(file_uri=video_source))
    else:
        # Upload local file to Gemini's File API (stored for 48h)
        uploaded_file = client.files.upload(file=video_source)

        # Uploaded videos are processed asynchronously; requesting content while the
        # file is still PROCESSING fails, so poll until the state changes.
        poll_seconds = 2
        deadline = time.monotonic() + 600  # give up after 10 minutes
        state = getattr(uploaded_file.state, "name", uploaded_file.state)
        while state == "PROCESSING":
            if time.monotonic() >= deadline:
                raise RuntimeError(
                    f"Timed out waiting for uploaded video to be processed: {video_source}"
                )
            time.sleep(poll_seconds)
            uploaded_file = client.files.get(name=uploaded_file.name)
            state = getattr(uploaded_file.state, "name", uploaded_file.state)
        if state == "FAILED":
            raise RuntimeError(f"Video processing failed for: {video_source}")

        # Pass the File handle itself (as the image/audio tools do) so the SDK
        # forwards both the URI and the MIME type of the upload.
        video_part = uploaded_file

    # 2. Generate content
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=[video_part, question],
    )

    # response.text may be None when the reply contains no text part; return a
    # string so the declared `-> str` contract holds for the calling agent.
    answer = response.text
    if answer is None:
        return "The model returned no text for this video."
    return answer

@tool
def analyze_image(image_path: str, question: str) -> str:
    """
    Uses native vision to analyze an image file and answer questions about it.
    If the model produces no text, a short notice is returned instead.

    Args:
        image_path: Path to the image file (jpg, png, webp).
        question: What you want to know about the image.

    Returns:
        The model's text answer about the image.
    """
    # Upload through the File API; the returned handle is passed to the model as-is.
    uploaded_file = client.files.upload(file=image_path)
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=[uploaded_file, question],
    )

    # response.text may be None when the reply contains no text part; return a
    # string so the declared `-> str` contract holds for the calling agent.
    answer = response.text
    if answer is None:
        return "The model returned no text for this image."
    return answer

@tool
def analyze_audio(audio_path: str, question: str) -> str:
    """
    Analyzes audio files (mp3, wav) to transcribe or answer questions about content and tone.
    If the model produces no text, a short notice is returned instead.

    Args:
        audio_path: Path to the audio file.
        question: The question or instruction (e.g., 'Summarize the mood' or 'Transcribe this').

    Returns:
        The model's text answer or transcription for the audio.
    """
    # Upload through the File API; the returned handle is passed to the model as-is.
    uploaded_file = client.files.upload(file=audio_path)
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=[uploaded_file, question],
    )

    # response.text may be None when the reply contains no text part; return a
    # string so the declared `-> str` contract holds for the calling agent.
    answer = response.text
    if answer is None:
        return "The model returned no text for this audio."
    return answer


# approach inspired by: https://huggingface.co/spaces/DeekshithN05/Final_Assignment_Template/blob/main/agent.py