Spaces:
Sleeping
Sleeping
add youtube tool
Browse files- .DS_Store +0 -0
- .gitignore +2 -1
- __pycache__/agents.cpython-312.pyc +0 -0
- __pycache__/tools.cpython-312.pyc +0 -0
- agents.py +36 -3
- app.py +10 -4
- qa_graph.py +81 -28
- requirements.txt +12 -1
- test.ipynb +0 -0
- tools.py +191 -22
.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
.gitignore
CHANGED
|
@@ -1 +1,2 @@
|
|
| 1 |
-
.env
|
|
|
|
|
|
| 1 |
+
.env
|
| 2 |
+
.venv
|
__pycache__/agents.cpython-312.pyc
ADDED
|
Binary file (4.12 kB). View file
|
|
|
__pycache__/tools.cpython-312.pyc
ADDED
|
Binary file (15.2 kB). View file
|
|
|
agents.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from tools import general_tools, file_agent_tools, data_agent_tools, math_agent_tools
|
| 2 |
from langgraph.prebuilt import create_react_agent
|
| 3 |
from langgraph.checkpoint.memory import MemorySaver
|
| 4 |
from langchain_openai import ChatOpenAI
|
|
@@ -41,7 +41,25 @@ data_agent = create_react_agent(
|
|
| 41 |
prompt="You process data. Use tools to filter and extract data."
|
| 42 |
)
|
| 43 |
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
Do not do calculations or file reading yourself, use the tools.
|
| 46 |
Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
|
| 47 |
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
|
|
@@ -49,10 +67,25 @@ If you are asked for a number, don't use comma to write your number neither use
|
|
| 49 |
If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
|
| 50 |
If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
|
| 51 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
# Supervisor
|
| 53 |
excel_supervisor = create_supervisor(
|
| 54 |
[file_agent, math_agent, data_agent],
|
| 55 |
model=llm,
|
| 56 |
-
prompt=
|
| 57 |
).compile()
|
| 58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from tools import general_tools, file_agent_tools, data_agent_tools, math_agent_tools, analyze_video_tools, youtube_transcript_tools
|
| 2 |
from langgraph.prebuilt import create_react_agent
|
| 3 |
from langgraph.checkpoint.memory import MemorySaver
|
| 4 |
from langchain_openai import ChatOpenAI
|
|
|
|
| 41 |
prompt="You process data. Use tools to filter and extract data."
|
| 42 |
)
|
| 43 |
|
| 44 |
+
# Worker agents for YouTube questions: one looks at the frames, the other
# reads the transcript. Both are built from the shared `llm`.
_VIDEO_AGENT_PROMPT = """You analyze visual content in videos. Use tools to detect and track objects.
The object_detection tool is a general object detection model. Use this for general cases.
The analyze_video_content uses both the object detection model and a vision llm to analyze frames with content given a question.
Use this for more difficult questions."""

_TRANSCRIPT_AGENT_PROMPT = "You analyze audio/speech content in videos. Use tools to get transcripts."

video_agent = create_react_agent(
    name="video_analyzer",
    model=llm,
    tools=analyze_video_tools(),
    prompt=_VIDEO_AGENT_PROMPT,
)

transcript_agent = create_react_agent(
    name="transcript_analyzer",
    model=llm,
    tools=youtube_transcript_tools(),
    prompt=_TRANSCRIPT_AGENT_PROMPT,
)
|
| 61 |
+
|
| 62 |
+
excel_prompt = """You are a supervisor. You coordinate file_reader, calculator, and data_processor to solve problems step by step.
|
| 63 |
Do not do calculations or file reading yourself, use the tools.
|
| 64 |
Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
|
| 65 |
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
|
|
|
|
| 67 |
If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
|
| 68 |
If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
|
| 69 |
"""
|
| 70 |
+
|
| 71 |
+
video_analyzer_prompt = """You coordinate video_analyzer and transcript_analyzer to answer questions about YouTube videos.
Use video_analyzer for visual questions (objects, people, actions). Use transcript_analyzer for audio questions (what people say).
Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
"""

# Supervisor over the file/math/data agents, driven by `excel_prompt`.
_excel_workflow = create_supervisor(
    [file_agent, math_agent, data_agent],
    model=llm,
    prompt=excel_prompt,
)
excel_supervisor = _excel_workflow.compile()

# Supervisor over the two video agents, driven by `video_analyzer_prompt`.
_video_workflow = create_supervisor(
    [video_agent, transcript_agent],
    model=llm,
    prompt=video_analyzer_prompt,
)
video_supervisor = _video_workflow.compile()
|
app.py
CHANGED
|
@@ -3,6 +3,7 @@ import gradio as gr
|
|
| 3 |
import requests
|
| 4 |
import inspect
|
| 5 |
import pandas as pd
|
|
|
|
| 6 |
|
| 7 |
# (Keep Constants as is)
|
| 8 |
# --- Constants ---
|
|
@@ -11,13 +12,18 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
|
| 11 |
# --- Basic Agent Definition ---
|
| 12 |
# ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
|
| 13 |
class BasicAgent:
|
|
|
|
| 14 |
def __init__(self):
|
| 15 |
print("BasicAgent initialized.")
|
|
|
|
|
|
|
| 16 |
def __call__(self, question: str) -> str:
|
| 17 |
print(f"Agent received question (first 50 chars): {question[:50]}...")
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
| 21 |
|
| 22 |
def run_and_submit_all( profile: gr.OAuthProfile | None):
|
| 23 |
"""
|
|
@@ -80,7 +86,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 80 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 81 |
continue
|
| 82 |
try:
|
| 83 |
-
submitted_answer = agent(
|
| 84 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 85 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
| 86 |
except Exception as e:
|
|
|
|
| 3 |
import requests
|
| 4 |
import inspect
|
| 5 |
import pandas as pd
|
| 6 |
+
from qa_graph import build_graph
|
| 7 |
|
| 8 |
# (Keep Constants as is)
|
| 9 |
# --- Constants ---
|
|
|
|
| 12 |
# --- Basic Agent Definition ---
|
| 13 |
# ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
|
| 14 |
class BasicAgent:
    """A langgraph agent.

    Wraps the compiled question-routing graph returned by ``build_graph`` and
    exposes it as a callable that maps one question to one answer string.
    """

    # The agent prompts ask the model to end its reply with this marker
    # followed by the answer; everything before it is free-form reasoning.
    FINAL_ANSWER_MARKER = "FINAL ANSWER:"

    def __init__(self):
        print("BasicAgent initialized.")
        self.graph = build_graph()

    def __call__(self, question: str) -> str:
        """Run the graph on ``question`` and return only the final answer.

        Args:
            question: The question text, a task dict (``task_id``,
                ``question``, ``Level``, ``file_name``), or an object that
                already exposes ``question`` / ``file_name`` attributes.

        Returns:
            The text after the last ``FINAL ANSWER:`` marker, stripped. If the
            marker is absent the whole reply is returned, stripped.
        """
        question_obj = self._as_question(question)
        question_text = str(getattr(question_obj, "question", ""))
        print(f"Agent received question (first 50 chars): {question_text[:50]}...")

        result = self.graph.invoke(
            {"question": question_obj, "decision": "", "answer": ""}
        )
        return self._final_answer(self._raw_answer(result))

    @staticmethod
    def _as_question(question):
        """Return an object with the attributes the graph nodes read."""
        # Function-scope import keeps this block self-contained.
        from types import SimpleNamespace

        # The router node reads ``state["question"].file_name`` and
        # ``.question``, so a bare string or dict must be wrapped first.
        # NOTE(review): this is duck-typed rather than a real ``Question``
        # instance; confirm no downstream node needs the dataclass itself.
        fields = {"task_id": "", "question": "", "Level": "", "file_name": ""}
        if isinstance(question, str):
            fields["question"] = question
            return SimpleNamespace(**fields)
        if isinstance(question, dict):
            fields.update(question)
            return SimpleNamespace(**fields)
        return question

    @staticmethod
    def _raw_answer(result) -> str:
        """Pull the model reply out of the final graph state."""
        # The graph nodes write their reply to "answer"; fall back to the
        # last message for states that carry a "messages" list instead.
        answer = result.get("answer")
        if answer:
            return str(answer)
        messages = result.get("messages")
        if messages:
            return str(messages[-1].content)
        return ""

    @classmethod
    def _final_answer(cls, text: str) -> str:
        """Return the text following the last final-answer marker."""
        _, found, tail = text.rpartition(cls.FINAL_ANSWER_MARKER)
        return tail.strip() if found else text.strip()
|
| 27 |
|
| 28 |
def run_and_submit_all( profile: gr.OAuthProfile | None):
|
| 29 |
"""
|
|
|
|
| 86 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 87 |
continue
|
| 88 |
try:
|
| 89 |
+
submitted_answer = agent(item)
|
| 90 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 91 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
| 92 |
except Exception as e:
|
qa_graph.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
from dataclasses import dataclass
|
| 2 |
from langgraph.graph import START, StateGraph, END
|
| 3 |
from typing import TypedDict
|
| 4 |
-
from agents import general_agent, excel_supervisor
|
| 5 |
import os
|
| 6 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
| 7 |
os.environ["OPENAI_API_KEY"] = str(OPENAI_API_KEY)
|
|
@@ -75,6 +75,17 @@ def ask_question_with_file(question: Question, thread_id: str = "default") -> st
|
|
| 75 |
|
| 76 |
return ask_question(enhanced_question, thread_id)
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
test = [
|
| 79 |
# {
|
| 80 |
# "task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
|
|
@@ -88,12 +99,36 @@ test = [
|
|
| 88 |
# "Level": "1",
|
| 89 |
# "file_name": "1f975693-876d-457b-a649-393859e79bf3.mp3"
|
| 90 |
# },
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
"Level": "1",
|
| 95 |
-
"file_name": "
|
| 96 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
]
|
| 98 |
|
| 99 |
questions = [Question(**item) for item in test]
|
|
@@ -133,10 +168,22 @@ def ask_question_with_file_node(state: State) -> dict:
|
|
| 133 |
# Return dict to update state
|
| 134 |
return {"answer": answer}
|
| 135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
def router_node(state: State):
|
| 137 |
"""Router node - returns dict to update state"""
|
| 138 |
if state["question"].file_name:
|
| 139 |
decision = "query_with_file"
|
|
|
|
|
|
|
| 140 |
else:
|
| 141 |
decision = "query"
|
| 142 |
|
|
@@ -146,33 +193,39 @@ def router_function(state: State):
|
|
| 146 |
"""Routing function - returns string to choose path"""
|
| 147 |
return state["decision"]
|
| 148 |
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
builder.add_node("
|
| 155 |
-
builder.add_node("
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
"router"
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
if __name__ == "__main__":
|
| 173 |
for i, question in enumerate(questions):
|
| 174 |
print(f"\n{i}. {question.question}")
|
| 175 |
|
|
|
|
| 176 |
# Invoke the graph and capture the result
|
| 177 |
result = react_graph.invoke({
|
| 178 |
"question": question,
|
|
|
|
| 1 |
from dataclasses import dataclass
|
| 2 |
from langgraph.graph import START, StateGraph, END
|
| 3 |
from typing import TypedDict
|
| 4 |
+
from agents import general_agent, excel_supervisor, video_supervisor
|
| 5 |
import os
|
| 6 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
| 7 |
os.environ["OPENAI_API_KEY"] = str(OPENAI_API_KEY)
|
|
|
|
| 75 |
|
| 76 |
return ask_question(enhanced_question, thread_id)
|
| 77 |
|
| 78 |
+
def ask_question_youtube(question: Question) -> str:
    """Answer a YouTube question through the video supervisor.

    Sends ``question.question`` as a single user message to
    ``video_supervisor``, prints the raw result, and returns the content of
    the last message in the result.
    """
    user_message = {"role": "user", "content": question.question}
    result = video_supervisor.invoke({"messages": [user_message]})
    print(result)
    final_message = result["messages"][-1]
    return final_message.content
|
| 88 |
+
|
| 89 |
test = [
|
| 90 |
# {
|
| 91 |
# "task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
|
|
|
|
| 99 |
# "Level": "1",
|
| 100 |
# "file_name": "1f975693-876d-457b-a649-393859e79bf3.mp3"
|
| 101 |
# },
|
| 102 |
+
# {
|
| 103 |
+
# "task_id": "7bd855d8-463d-4ed5-93ca-5fe35145f733",
|
| 104 |
+
# "question": "The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.",
|
| 105 |
+
# "Level": "1",
|
| 106 |
+
# "file_name": "7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx"
|
| 107 |
+
# },
|
| 108 |
+
# {
|
| 109 |
+
# "task_id": "f918266a-b3e0-4914-865d-4faa564f1aef",
|
| 110 |
+
# "question": "What is the final numeric output from the attached Python code?",
|
| 111 |
+
# "Level": "1",
|
| 112 |
+
# "file_name": "f918266a-b3e0-4914-865d-4faa564f1aef.py"
|
| 113 |
+
# },
|
| 114 |
+
# {
|
| 115 |
+
# "task_id": "cabe07ed-9eca-40ea-8ead-410ef5e83f91",
|
| 116 |
+
# "question": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
|
| 117 |
+
# "Level": "1",
|
| 118 |
+
# "file_name": ""
|
| 119 |
+
# },
|
| 120 |
+
{
|
| 121 |
+
"task_id": "9d191bce-651d-4746-be2d-7ef8ecadb9c2",
|
| 122 |
+
"question": "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal'c say in response to the question \"Isn't that hot?\"",
|
| 123 |
"Level": "1",
|
| 124 |
+
"file_name": ""
|
| 125 |
+
},
|
| 126 |
+
{
|
| 127 |
+
"task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
|
| 128 |
+
"question": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
|
| 129 |
+
"Level": "1",
|
| 130 |
+
"file_name": ""
|
| 131 |
+
},
|
| 132 |
]
|
| 133 |
|
| 134 |
questions = [Question(**item) for item in test]
|
|
|
|
| 168 |
# Return dict to update state
|
| 169 |
return {"answer": answer}
|
| 170 |
|
| 171 |
+
def ask_question_youtube_node(state: State) -> dict:
    """Graph node for YouTube questions.

    Passes ``state["question"]`` to ``ask_question_youtube`` and returns the
    partial state update ``{"answer": <reply>}``.
    """
    return {"answer": ask_question_youtube(state["question"])}
|
| 180 |
+
|
| 181 |
def router_node(state: State):
|
| 182 |
"""Router node - returns dict to update state"""
|
| 183 |
if state["question"].file_name:
|
| 184 |
decision = "query_with_file"
|
| 185 |
+
elif "youtube.com" in state["question"].question or "youtu.be" in state["question"].question:
|
| 186 |
+
decision = "youtube"
|
| 187 |
else:
|
| 188 |
decision = "query"
|
| 189 |
|
|
|
|
| 193 |
"""Routing function - returns string to choose path"""
|
| 194 |
return state["decision"]
|
| 195 |
|
| 196 |
+
def build_graph():
    """Build and compile the question-routing graph.

    The graph starts at the ``router`` node, follows the branch named by
    ``router_function`` (``query_with_file``, ``query`` or ``youtube``), and
    ends after that branch's node has run.
    """
    # Branch name -> node function; the branch name doubles as the node name.
    branches = {
        "query_with_file": ask_question_with_file_node,
        "query": ask_question_node,
        "youtube": ask_question_youtube_node,
    }

    builder = StateGraph(State)
    for branch_name, node_fn in branches.items():
        builder.add_node(branch_name, node_fn)
    builder.add_node("router", router_node)

    builder.add_edge(START, "router")
    builder.add_conditional_edges(
        "router",
        router_function,
        {branch_name: branch_name for branch_name in branches},
    )
    for branch_name in branches:
        builder.add_edge(branch_name, END)

    return builder.compile()
|
| 223 |
|
| 224 |
if __name__ == "__main__":
|
| 225 |
for i, question in enumerate(questions):
|
| 226 |
print(f"\n{i}. {question.question}")
|
| 227 |
|
| 228 |
+
react_graph = build_graph()
|
| 229 |
# Invoke the graph and capture the result
|
| 230 |
result = react_graph.invoke({
|
| 231 |
"question": question,
|
requirements.txt
CHANGED
|
@@ -1,2 +1,13 @@
|
|
| 1 |
gradio
|
| 2 |
-
requests
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
gradio
|
| 2 |
+
requests
|
| 3 |
+
langgraph
|
| 4 |
+
langgraph-supervisor
|
| 5 |
+
langchain
|
| 6 |
+
langchain_community
|
| 7 |
+
langchain_openai
|
| 8 |
+
duckduckgo-search
|
| 9 |
+
wikipedia
|
| 10 |
+
arxiv
|
| 11 |
+
openpyxl
|
| 12 |
+
ultralytics
|
| 13 |
+
youtube-transcript-api
|
test.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tools.py
CHANGED
|
@@ -16,6 +16,11 @@ import os
|
|
| 16 |
from huggingface_hub import InferenceClient
|
| 17 |
import json
|
| 18 |
import requests
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
from dotenv import load_dotenv
|
| 21 |
load_dotenv()
|
|
@@ -114,28 +119,6 @@ def transcribe_audio(file_path: str, question: str) -> str:
|
|
| 114 |
except Exception as e:
|
| 115 |
return f"Error transcribing audio: {str(e)}"
|
| 116 |
|
| 117 |
-
#### Excel supervisor agent
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
def general_tools():
|
| 121 |
-
tools = [
|
| 122 |
-
DuckDuckGoSearchRun(),
|
| 123 |
-
WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper()),
|
| 124 |
-
ArxivQueryRun(api_wrapper=ArxivAPIWrapper()),
|
| 125 |
-
analyze_image,
|
| 126 |
-
read_python_file,
|
| 127 |
-
transcribe_audio,
|
| 128 |
-
]
|
| 129 |
-
return tools
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
# Simple file tools
|
| 133 |
-
@tool
|
| 134 |
-
def read_excel(file_path: str) -> str:
|
| 135 |
-
"""Read any Excel file and return as JSON."""
|
| 136 |
-
df = pd.read_excel(file_path)
|
| 137 |
-
return json.dumps(df.to_dict(orient='records'))
|
| 138 |
-
|
| 139 |
# Simple math tools
|
| 140 |
@tool
|
| 141 |
def add(a: float, b: float) -> float:
|
|
@@ -173,6 +156,192 @@ def filter_rows(data: str, exclude_words: list) -> str:
|
|
| 173 |
filtered.append(row)
|
| 174 |
return json.dumps(filtered)
|
| 175 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
def file_agent_tools():
|
| 177 |
tools = [read_excel]
|
| 178 |
return tools
|
|
|
|
| 16 |
from huggingface_hub import InferenceClient
|
| 17 |
import json
|
| 18 |
import requests
|
| 19 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
| 20 |
+
from ultralytics import YOLO
|
| 21 |
+
import cv2
|
| 22 |
+
|
| 23 |
+
import re
|
| 24 |
|
| 25 |
from dotenv import load_dotenv
|
| 26 |
load_dotenv()
|
|
|
|
| 119 |
except Exception as e:
|
| 120 |
return f"Error transcribing audio: {str(e)}"
|
| 121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
# Simple math tools
|
| 123 |
@tool
|
| 124 |
def add(a: float, b: float) -> float:
|
|
|
|
| 156 |
filtered.append(row)
|
| 157 |
return json.dumps(filtered)
|
| 158 |
|
| 159 |
+
@tool
def read_excel(file_path: str) -> str:
    """Read any Excel file and return as JSON."""
    # The docstring above is the tool description shown to the model, so it
    # is left unchanged.
    #
    # Returns a JSON array with one object per row, keyed by column name.
    # pd.read_excel is called with its defaults.
    df = pd.read_excel(file_path)
    # DataFrame.to_json is used instead of json.dumps(df.to_dict(...)):
    # json.dumps raises on date cells (pandas Timestamp) and writes the
    # non-standard token NaN for empty cells, whereas to_json emits ISO
    # dates and null. double_precision=15 avoids to_json's default rounding
    # of floats to 10 decimals.
    return df.to_json(orient="records", date_format="iso", double_precision=15)
|
| 164 |
+
|
| 165 |
+
@tool
def object_detection(video_url: str) -> str:
    """Analyze objects and visual content in a YouTube video."""
    # The docstring above is the tool description shown to the model, so it
    # is left unchanged.
    #
    # Returns a JSON list with one entry per processed frame:
    # {"frame": index, "objects": [class names], "unique_objects": [...]}.
    # Only detections with confidence above 0.5 are reported. Any failure is
    # returned as an "Error analyzing video: ..." string instead of raising.
    try:
        model = YOLO("yolo11n.pt")  # Load an official Detect model
        # stream=True yields results frame by frame instead of building a
        # list that holds every frame of the video in memory at once.
        results = model.track(video_url, stream=True)

        # Track objects across frames
        frame_objects = []
        for frame_index, result in enumerate(results):
            boxes = result.boxes
            # NOTE(review): frames without a boxes object are skipped here;
            # confirm this matches the intended nesting of the original.
            if boxes is None:
                continue

            # Only high confidence detections
            objects_in_frame = [
                result.names[int(class_id)]
                for class_id, confidence in zip(boxes.cls.tolist(), boxes.conf.tolist())
                if confidence > 0.5
            ]

            frame_objects.append({
                "frame": frame_index,
                "objects": objects_in_frame,
                "unique_objects": list(set(objects_in_frame)),
            })

        return json.dumps(frame_objects, indent=2)

    except Exception as e:
        return f"Error analyzing video: {str(e)}"
|
| 193 |
+
|
| 194 |
+
@tool
def get_youtube_transcript(video_url: str) -> str:
    """Get transcript from a YouTube video."""
    # The docstring above is the tool description shown to the model, so it
    # is left unchanged.
    #
    # Returns a JSON list of {"start", "duration", "text"} entries, or an
    # "Error ..." string when the video ID cannot be found or the transcript
    # cannot be fetched.
    try:
        # Extract video ID: 11 ID characters following "v=" or a "/".
        video_id_match = re.search(r'(?:v=|\/)([0-9A-Za-z_-]{11}).*', video_url)
        if not video_id_match:
            return "Error: Could not extract video ID"

        video_id = video_id_match.group(1)

        # Older youtube-transcript-api releases expose a static
        # get_transcript(); newer ones replace it with an instance fetch()
        # whose result converts to the same list of dicts via to_raw_data().
        # The package is not pinned, so support both.
        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
        else:
            transcript = YouTubeTranscriptApi().fetch(video_id).to_raw_data()

        # Format with timestamps
        formatted_transcript = [
            {
                "start": entry['start'],
                "duration": entry['duration'],
                "text": entry['text'],
            }
            for entry in transcript
        ]

        return json.dumps(formatted_transcript, indent=2)

    except Exception as e:
        return f"Error getting transcript: {str(e)}"
|
| 219 |
+
|
| 220 |
+
# NOTE(review): the @tool decorator is commented out, so this function is
# handed to the agent as a plain callable — confirm that is intended.
# @tool
def analyze_video_content(video_url: str, question: str = "", max_vision_frames: int = 1) -> str:
    """Analyze video content using YOLO for object detection and vision LLM for detailed analysis."""
    # The docstring above may be used as the tool description, so it is left
    # unchanged.
    #
    # Step 1 runs YOLO tracking over the video and records, per frame, the
    # class names detected with confidence above 0.5 and their counts.
    # Step 2 (only when `question` is non-blank) sends the frames with the
    # most detections, at most `max_vision_frames` of them, to `vision_llm`
    # together with the question.
    # Returns a JSON document; any failure is returned as an
    # "Error analyzing video content: ..." string instead of raising.
    from collections import Counter

    try:
        model = YOLO("yolo11n.pt")
        # stream=True yields results frame by frame instead of holding the
        # results (and images) of the whole video in memory.
        results = model.track(video_url, stream=True)

        wants_vision = bool(question.strip())
        # A non-positive limit means "no vision frames".
        vision_limit = max(0, max_vision_frames)

        # Step 1: YOLO analysis for all frames
        frame_objects = []
        frames_with_objects = 0
        # Best frames seen so far for vision analysis. It is kept trimmed to
        # `vision_limit` entries so at most that many images are retained.
        vision_candidates = []

        for frame_index, result in enumerate(results):
            boxes = result.boxes
            objects_in_frame = []
            if boxes is not None:
                objects_in_frame = [
                    result.names[int(class_id)]
                    for class_id, confidence in zip(boxes.cls.tolist(), boxes.conf.tolist())
                    if confidence > 0.5
                ]

            # Count objects
            object_counts = dict(Counter(objects_in_frame))

            frame_objects.append({
                "frame": frame_index,
                "objects": objects_in_frame,
                "unique_objects": list(set(objects_in_frame)),
                "object_counts": object_counts,
            })

            if not objects_in_frame:
                continue
            frames_with_objects += 1

            # Store frame for potential vision analysis
            if wants_vision and vision_limit:
                vision_candidates.append({
                    "frame_index": frame_index,
                    "objects": objects_in_frame,
                    "object_counts": object_counts,
                    "total_objects": len(objects_in_frame),
                    "image": result.orig_img,
                })
                # The stable descending sort keeps earlier frames first among
                # ties, so trimming after every frame selects the same frames
                # as sorting the full list once at the end.
                vision_candidates.sort(key=lambda x: x["total_objects"], reverse=True)
                del vision_candidates[vision_limit:]

        # Step 2: If there's a specific question, use vision LLM on selected frames
        detailed_analyses = []
        for frame_data in vision_candidates:
            analysis = {
                "frame_index": frame_data["frame_index"],
                "yolo_objects": frame_data["objects"],
                "yolo_counts": frame_data["object_counts"],
            }
            try:
                # Encode frame directly to base64
                _, buffer = cv2.imencode('.jpg', frame_data["image"])
                image_base64 = base64.b64encode(buffer.tobytes()).decode("utf-8")

                message = [
                    HumanMessage(
                        content=[
                            {"type": "text", "text": question},
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}
                            }
                        ]
                    )
                ]

                vision_response = vision_llm.invoke(message)
                analysis["vision_analysis"] = vision_response.content

            except Exception as vision_error:
                # A failure on one frame is reported in that frame's entry
                # and does not abort the whole analysis.
                analysis["vision_analysis"] = f"Vision analysis failed: {str(vision_error)}"

            detailed_analyses.append(analysis)

        # Combine results
        result_data = {
            "video_url": video_url,
            "question": question,
            "total_frames": len(frame_objects),
            "yolo_analysis": frame_objects,
            "frames_with_objects": frames_with_objects
        }

        if detailed_analyses:
            result_data["detailed_vision_analysis"] = detailed_analyses
            result_data["vision_frames_analyzed"] = len(detailed_analyses)

        return json.dumps(result_data, indent=2)

    except Exception as e:
        return f"Error analyzing video content: {str(e)}"
|
| 326 |
+
def general_tools():
    """Return a new list with the DuckDuckGo, Wikipedia and arXiv query tools
    followed by analyze_image, read_python_file and transcribe_audio."""
    search_tools = [
        DuckDuckGoSearchRun(),
        WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper()),
        ArxivQueryRun(api_wrapper=ArxivAPIWrapper()),
    ]
    file_tools = [analyze_image, read_python_file, transcribe_audio]
    return search_tools + file_tools
|
| 336 |
+
|
| 337 |
+
def analyze_video_tools():
    """Return a new list holding object_detection and analyze_video_content."""
    return [object_detection, analyze_video_content]
|
| 340 |
+
|
| 341 |
+
def youtube_transcript_tools():
    """Return a new list holding get_youtube_transcript."""
    return [get_youtube_transcript]
|
| 344 |
+
|
| 345 |
def file_agent_tools():
    """Return a new list holding read_excel."""
    return [read_excel]
|