Final_Assignment_Template

Sleeping

@@ -1,4 +1,4 @@
-from tools import general_tools, file_agent_tools, data_agent_tools, math_agent_tools
 from langgraph.prebuilt import create_react_agent
 from langgraph.checkpoint.memory import MemorySaver
 from langchain_openai import ChatOpenAI
@@ -41,7 +41,25 @@ data_agent = create_react_agent(
     prompt="You process data. Use tools to filter and extract data."
 )
-prompt = """You are a supervisor. You coordinate file_reader, calculator, and data_processor to solve problems step by step.
 Do not do calculations or file reading yourself, use the tools.
 Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
 YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
@@ -49,10 +67,25 @@ If you are asked for a number, don't use comma to write your number neither use
 If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
 If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
 """
 # Supervisor
 excel_supervisor = create_supervisor(
     [file_agent, math_agent, data_agent],
     model=llm,
-    prompt=prompt
 ).compile()

+from tools import general_tools, file_agent_tools, data_agent_tools, math_agent_tools, analyze_video_tools, youtube_transcript_tools
 from langgraph.prebuilt import create_react_agent
 from langgraph.checkpoint.memory import MemorySaver
 from langchain_openai import ChatOpenAI
     prompt="You process data. Use tools to filter and extract data."
 )
+# Create video analysis agents.
+# Each agent's `name` is the identifier the video supervisor prompt uses to
+# refer to it (video_analyzer / transcript_analyzer), so keep them in sync.
+# Visual agent: receives the tools returned by analyze_video_tools().
+video_agent = create_react_agent(
+    model=llm,
+    tools=analyze_video_tools(),
+    name="video_analyzer",
+    prompt="""You analyze visual content in videos. Use tools to detect and track objects.
+    The object_detection tool is a general object detection model. Use this for general cases.
+    The analyze_video_content uses both the object detection model and a vision llm to analyze frames with content given a question.
+    Use this for more difficult questions."""
+)
+# Speech agent: receives the tools returned by youtube_transcript_tools().
+transcript_agent = create_react_agent(
+    model=llm,
+    tools=youtube_transcript_tools(),
+    name="transcript_analyzer",
+    prompt="You analyze audio/speech content in videos. Use tools to get transcripts."
+)
+excel_prompt = """You are a supervisor. You coordinate file_reader, calculator, and data_processor to solve problems step by step.
 Do not do calculations or file reading yourself, use the tools.
 Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
 YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
 If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
 If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
 """
+# Prompt for the video supervisor: tells it when to delegate to video_analyzer
+# versus transcript_analyzer and fixes the "FINAL ANSWER: ..." output format.
+video_analyzer_prompt = """You coordinate video_analyzer and transcript_analyzer to answer questions about YouTube videos.
+Use video_analyzer for visual questions (objects, people, actions). Use transcript_analyzer for audio questions (what people say).
+Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
+YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
+If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
+If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
+If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
+"""
 # Supervisor
+# Excel supervisor: coordinates file_agent, math_agent and data_agent, driven by excel_prompt.
 excel_supervisor = create_supervisor(
     [file_agent, math_agent, data_agent],
     model=llm,
+    prompt=excel_prompt
 ).compile()
+# Video supervisor
+# Coordinates video_agent and transcript_agent, driven by video_analyzer_prompt.
+video_supervisor = create_supervisor(
+    [video_agent, transcript_agent],
+    model=llm,
+    prompt=video_analyzer_prompt
+).compile()

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import gradio as gr
 import requests
 import inspect
 import pandas as pd
 # (Keep Constants as is)
 # --- Constants ---
@@ -11,13 +12,18 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 # --- Basic Agent Definition ---
 # ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
 class BasicAgent:
     def __init__(self):
         print("BasicAgent initialized.")
     def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")
-        fixed_answer = "This is a default answer."
-        print(f"Agent returning fixed answer: {fixed_answer}")
-        return fixed_answer
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
@@ -80,7 +86,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
-            submitted_answer = agent(question_text)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
         except Exception as e:

 import requests
 import inspect
 import pandas as pd
+from qa_graph import build_graph
 # (Keep Constants as is)
 # --- Constants ---
 # --- Basic Agent Definition ---
 # ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
 class BasicAgent:
+    """Agent that answers a question by running it through the QA LangGraph.
+
+    The compiled graph is built once in ``__init__`` and reused for every call.
+    """
+    # Marker the supervisor prompts ask the model to put in front of its answer.
+    FINAL_ANSWER_MARKER = "FINAL ANSWER:"
     def __init__(self):
         print("BasicAgent initialized.")
+        self.graph = build_graph()
+    def __call__(self, question: "str | dict | Question") -> str:
+        """Run one question through the graph and return the bare final answer.
+
+        Args:
+            question: A ``Question`` object, the raw question dict handed over
+                by ``run_and_submit_all``, or a plain string.
+
+        Returns:
+            The text after the last ``FINAL ANSWER:`` marker, stripped. When the
+            reply contains no marker the whole reply is returned, stripped.
+        """
+        if isinstance(question, dict):
+            # run_and_submit_all passes the raw API item, but the graph's router
+            # reads state["question"].file_name, so build a Question the same
+            # way qa_graph does for its own test items.
+            from qa_graph import Question
+            question = Question(**question)
+        # NOTE(review): a plain str is passed through unchanged; the router's
+        # .file_name access suggests the graph expects a Question — confirm.
+        preview = str(getattr(question, "question", question))[:50]
+        print(f"Agent received question (first 50 chars): {preview}...")
+        result = self.graph.invoke({"question": question, "decision": "", "answer": ""})
+        # The graph nodes publish their reply under "answer"; keep reading a
+        # message history first in case the state also exposes one.
+        if result.get("messages"):
+            reply = result["messages"][-1].content
+        else:
+            reply = result.get("answer") or ""
+        # The prompts ask for thoughts first and the answer last, so cut at the
+        # marker instead of dropping a fixed number of leading characters.
+        _, marker, tail = reply.rpartition(self.FINAL_ANSWER_MARKER)
+        return tail.strip() if marker else reply.strip()
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
+            submitted_answer = agent(item)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
         except Exception as e:

qa_graph.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 from langgraph.graph import START, StateGraph, END
 from typing import TypedDict
-from agents import general_agent, excel_supervisor
 import os
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 os.environ["OPENAI_API_KEY"] = str(OPENAI_API_KEY)
@@ -75,6 +75,17 @@ def ask_question_with_file(question: Question, thread_id: str = "default") -> st
     return ask_question(enhanced_question, thread_id)
 test = [
 #     {
 #     "task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
@@ -88,12 +99,36 @@ test = [
 #     "Level": "1",
 #     "file_name": "1f975693-876d-457b-a649-393859e79bf3.mp3"
 #   },
-              {
-    "task_id": "7bd855d8-463d-4ed5-93ca-5fe35145f733",
-    "question": "The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.",
     "Level": "1",
-    "file_name": "7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx"
-  }
             ]
 questions = [Question(**item) for item in test]
@@ -133,10 +168,22 @@ def ask_question_with_file_node(state: State) -> dict:
     # Return dict to update state
     return {"answer": answer}
 def router_node(state: State):
     """Router node - returns dict to update state"""
     if state["question"].file_name:
         decision = "query_with_file"
     else:
         decision = "query"
@@ -146,33 +193,39 @@ def router_function(state: State):
     """Routing function - returns string to choose path"""
     return state["decision"]
-# Graph
-builder = StateGraph(State)
-# Use the NODE functions (not the original functions)
-builder.add_node("query_with_file", ask_question_with_file_node)
-builder.add_node("query", ask_question_node)
-builder.add_node("router", router_node)
-# Define edges
-builder.add_edge(START, "router")
-builder.add_conditional_edges(
-    "router",
-    router_function,
-    {
-        "query_with_file": "query_with_file",
-        "query": "query",
-    },
-)
-builder.add_edge("query_with_file", END)
-builder.add_edge("query", END)
-react_graph = builder.compile()
 if __name__ == "__main__":
     for i, question in enumerate(questions):
         print(f"\n{i}. {question.question}")
         # Invoke the graph and capture the result
         result = react_graph.invoke({
             "question": question,

 from dataclasses import dataclass
 from langgraph.graph import START, StateGraph, END
 from typing import TypedDict
+from agents import general_agent, excel_supervisor, video_supervisor
 import os
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 os.environ["OPENAI_API_KEY"] = str(OPENAI_API_KEY)
     return ask_question(enhanced_question, thread_id)
+def ask_question_youtube(question: Question) -> str:
+    """Send the question text to the video supervisor and return its last reply.
+
+    Only ``question.question`` is forwarded as a single user message. The full
+    supervisor result is printed before the content of its final message is
+    returned.
+    """
+    user_turn = {"role": "user", "content": question.question}
+    result = video_supervisor.invoke({"messages": [user_turn]})
+    print(result)
+    return result["messages"][-1].content
 test = [
 #     {
 #     "task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
 #     "Level": "1",
 #     "file_name": "1f975693-876d-457b-a649-393859e79bf3.mp3"
 #   },
+#               {
+#     "task_id": "7bd855d8-463d-4ed5-93ca-5fe35145f733",
+#     "question": "The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.",
+#     "Level": "1",
+#     "file_name": "7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx"
+#   },
+#                 {
+#     "task_id": "f918266a-b3e0-4914-865d-4faa564f1aef",
+#     "question": "What is the final numeric output from the attached Python code?",
+#     "Level": "1",
+#     "file_name": "f918266a-b3e0-4914-865d-4faa564f1aef.py"
+#   },
+#                   {
+#     "task_id": "cabe07ed-9eca-40ea-8ead-410ef5e83f91",
+#     "question": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
+#     "Level": "1",
+#     "file_name": ""
+#   },
+                    {
+    "task_id": "9d191bce-651d-4746-be2d-7ef8ecadb9c2",
+    "question": "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal'c say in response to the question \"Isn't that hot?\"",
     "Level": "1",
+    "file_name": ""
+  },
+                      {
+    "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
+    "question": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
+    "Level": "1",
+    "file_name": ""
+  },
             ]
 questions = [Question(**item) for item in test]
     # Return dict to update state
     return {"answer": answer}
+def ask_question_youtube_node(state: State) -> dict:
+    """Graph node for YouTube questions.
+
+    Passes ``state["question"]`` to ``ask_question_youtube`` and returns a
+    partial state update that sets only the ``"answer"`` key.
+    """
+    return {"answer": ask_question_youtube(state["question"])}
 def router_node(state: State):
     """Router node - returns dict to update state"""
     if state["question"].file_name:
         decision = "query_with_file"
+    elif "youtube.com" in state["question"].question or "youtu.be" in state["question"].question:
+        decision = "youtube"
     else:
         decision = "query"
     """Routing function - returns string to choose path"""
     return state["decision"]
+def build_graph():
+    """Assemble and compile the question-routing graph.
+
+    Layout: START -> router -> one of the answering nodes -> END. The router's
+    decision string (returned by ``router_function``) is also the name of the
+    node that handles the question.
+    """
+    # Answering nodes, keyed by the decision string that selects them.
+    answer_nodes = {
+        "query_with_file": ask_question_with_file_node,
+        "query": ask_question_node,
+        "youtube": ask_question_youtube_node,
+    }
+    builder = StateGraph(State)
+    for node_name, node_fn in answer_nodes.items():
+        builder.add_node(node_name, node_fn)
+    builder.add_node("router", router_node)
+    # Every run starts at the router, which fans out by decision.
+    builder.add_edge(START, "router")
+    builder.add_conditional_edges(
+        "router",
+        router_function,
+        {node_name: node_name for node_name in answer_nodes},
+    )
+    # Each answering node is terminal.
+    for node_name in answer_nodes:
+        builder.add_edge(node_name, END)
+    return builder.compile()
 if __name__ == "__main__":
     for i, question in enumerate(questions):
         print(f"\n{i}. {question.question}")
+        react_graph = build_graph()
         # Invoke the graph and capture the result
         result = react_graph.invoke({
             "question": question,

requirements.txt CHANGED Viewed

@@ -1,2 +1,13 @@
 gradio
-requests

 gradio
+requests
+langgraph
+langgraph-supervisor
+langchain
+langchain_community
+langchain_openai
+duckduckgo-search
+wikipedia
+arxiv
+openpyxl
+ultralytics
+youtube-transcript-api

test.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

tools.py CHANGED Viewed

@@ -16,6 +16,11 @@ import os
 from huggingface_hub import InferenceClient
 import json
 import requests
 from dotenv import load_dotenv
 load_dotenv()
@@ -114,28 +119,6 @@ def transcribe_audio(file_path: str, question: str) -> str:
     except Exception as e:
         return f"Error transcribing audio: {str(e)}"
-#### Excel supervisor agent
-def general_tools():
-    tools = [
-        DuckDuckGoSearchRun(),
-        WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper()),
-        ArxivQueryRun(api_wrapper=ArxivAPIWrapper()),
-        analyze_image,
-        read_python_file,
-        transcribe_audio,
-    ]
-    return tools
-# Simple file tools
-@tool
-def read_excel(file_path: str) -> str:
-    """Read any Excel file and return as JSON."""
-    df = pd.read_excel(file_path)
-    return json.dumps(df.to_dict(orient='records'))
 # Simple math tools
 @tool
 def add(a: float, b: float) -> float:
@@ -173,6 +156,192 @@ def filter_rows(data: str, exclude_words: list) -> str:
             filtered.append(row)
     return json.dumps(filtered)
 def file_agent_tools():
     tools = [read_excel]
     return tools

 from huggingface_hub import InferenceClient
 import json
 import requests
+from youtube_transcript_api import YouTubeTranscriptApi
+from ultralytics import YOLO
+import cv2
+import re
 from dotenv import load_dotenv
 load_dotenv()
     except Exception as e:
         return f"Error transcribing audio: {str(e)}"
 # Simple math tools
 @tool
 def add(a: float, b: float) -> float:
             filtered.append(row)
     return json.dumps(filtered)
+@tool
+def read_excel(file_path: str) -> str:
+    """Read any Excel file and return as JSON."""
+    # NOTE(review): the docstring above is presumably what the agent sees as
+    # the tool description, so its wording is left unchanged — confirm.
+    # Returns a JSON array with one object per row, keyed by column name.
+    df = pd.read_excel(file_path)
+    # Date/time cells come back from pandas as Timestamp objects, which
+    # json.dumps cannot encode. default=str is a deliberate choice here: such
+    # cells are emitted in their string form instead of raising TypeError.
+    return json.dumps(df.to_dict(orient="records"), default=str)
+@tool
+def object_detection(video_url: str) -> str:
+    """Analyze objects and visual content in a YouTube video."""
+    # Returns a JSON list with one entry per frame that has a boxes result
+    # ("frame", "objects", "unique_objects"), or an
+    # "Error analyzing video: ..." string when anything fails.
+    try:
+        model = YOLO("yolo11n.pt")  # Load an official Detect model
+        # stream=True yields per-frame results lazily instead of collecting
+        # every frame's result (image included) in one list in memory.
+        results = model.track(video_url, stream=True)
+        frame_objects = []
+        for frame_index, result in enumerate(results):
+            boxes = result.boxes
+            if boxes is None:
+                continue
+            # Only high confidence detections
+            objects_in_frame = [
+                result.names[int(class_id)]
+                for class_id, confidence in zip(boxes.cls.tolist(), boxes.conf.tolist())
+                if confidence > 0.5
+            ]
+            frame_objects.append({
+                "frame": frame_index,
+                "objects": objects_in_frame,
+                "unique_objects": list(set(objects_in_frame)),
+            })
+        return json.dumps(frame_objects, indent=2)
+    except Exception as e:
+        # Tools report failures as text so the calling agent can react to them.
+        return f"Error analyzing video: {str(e)}"
+@tool
+def get_youtube_transcript(video_url: str) -> str:
+    """Get transcript from a YouTube video."""
+    # Returns a JSON list of {"start", "duration", "text"} entries, or an
+    # "Error ..." string when the ID cannot be parsed or the fetch fails.
+    try:
+        # Extract video ID
+        video_id_match = re.search(r'(?:v=|\/)([0-9A-Za-z_-]{11}).*', video_url)
+        if not video_id_match:
+            return "Error: Could not extract video ID"
+        video_id = video_id_match.group(1)
+        # Older youtube-transcript-api releases expose a static
+        # get_transcript(); newer ones removed it in favour of an instance
+        # fetch() whose result converts to the same list-of-dicts shape via
+        # to_raw_data(). Support both, since the dependency is not pinned.
+        if hasattr(YouTubeTranscriptApi, "get_transcript"):
+            transcript = YouTubeTranscriptApi.get_transcript(video_id)
+        else:
+            transcript = YouTubeTranscriptApi().fetch(video_id).to_raw_data()
+        # Format with timestamps
+        formatted_transcript = [
+            {
+                "start": entry["start"],
+                "duration": entry["duration"],
+                "text": entry["text"],
+            }
+            for entry in transcript
+        ]
+        return json.dumps(formatted_transcript, indent=2)
+    except Exception as e:
+        # Tools report failures as text so the calling agent can react to them.
+        return f"Error getting transcript: {str(e)}"
+    # @tool
+def analyze_video_content(video_url: str, question: str = "", max_vision_frames: int = 1) -> str:
+    """Analyze video content using YOLO for object detection and vision LLM for detailed analysis."""
+    # Contract:
+    #   video_url         -- source handed to YOLO tracking.
+    #   question          -- when non-blank, it is put to the vision LLM for the
+    #                        frames with the most detections.
+    #   max_vision_frames -- upper bound on frames sent to the vision LLM;
+    #                        values <= 0 disable the vision step.
+    # Returns a JSON document with per-frame YOLO output plus, when the vision
+    # step ran, "detailed_vision_analysis"; or an "Error ..." string on failure.
+    import heapq
+    from collections import Counter
+    try:
+        model = YOLO("yolo11n.pt")
+        # stream=True yields per-frame results lazily instead of holding every
+        # frame's result in memory at once.
+        results = model.track(video_url, stream=True)
+        wants_vision = bool(question.strip()) and max_vision_frames > 0
+        # Step 1: YOLO analysis for all frames
+        frame_objects = []
+        frames_with_objects = 0
+        # Min-heap of (total_objects, -frame_index, record). Only the best
+        # max_vision_frames frames keep their image, so memory stays bounded.
+        # The first two fields are unique per frame, so records are never compared.
+        best_frames = []
+        for i, result in enumerate(results):
+            objects_in_frame = []
+            boxes = result.boxes
+            if boxes is not None:
+                objects_in_frame = [
+                    result.names[int(class_id)]
+                    for class_id, confidence in zip(boxes.cls.tolist(), boxes.conf.tolist())
+                    if confidence > 0.5
+                ]
+            object_counts = dict(Counter(objects_in_frame))
+            frame_objects.append({
+                "frame": i,
+                "objects": objects_in_frame,
+                "unique_objects": list(set(objects_in_frame)),
+                "object_counts": object_counts,
+            })
+            if not objects_in_frame:
+                continue
+            frames_with_objects += 1
+            if not wants_vision:
+                continue
+            # Rank by detection count; on ties the earlier frame wins.
+            rank = (len(objects_in_frame), -i)
+            if len(best_frames) < max_vision_frames:
+                keep = True
+            else:
+                keep = rank > best_frames[0][:2]
+            if not keep:
+                continue
+            record = {
+                "frame_index": i,
+                "objects": objects_in_frame,
+                "object_counts": object_counts,
+                "image": result.orig_img,
+            }
+            if len(best_frames) < max_vision_frames:
+                heapq.heappush(best_frames, rank + (record,))
+            else:
+                heapq.heapreplace(best_frames, rank + (record,))
+        # Step 2: If there's a specific question, use vision LLM on selected frames
+        detailed_analyses = []
+        ranked = sorted(best_frames, key=lambda entry: entry[:2], reverse=True)
+        for _, _, frame in ranked:
+            try:
+                # Encode frame directly to base64
+                encoded_ok, buffer = cv2.imencode('.jpg', frame["image"])
+                if not encoded_ok:
+                    raise ValueError("could not encode frame as JPEG")
+                image_base64 = base64.b64encode(buffer.tobytes()).decode("utf-8")
+                message = [
+                    HumanMessage(
+                        content=[
+                            {"type": "text", "text": question},
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}
+                            }
+                        ]
+                    )
+                ]
+                vision_analysis = vision_llm.invoke(message).content
+            except Exception as vision_error:
+                # One failing frame must not discard the YOLO results.
+                vision_analysis = f"Vision analysis failed: {str(vision_error)}"
+            detailed_analyses.append({
+                "frame_index": frame["frame_index"],
+                "yolo_objects": frame["objects"],
+                "yolo_counts": frame["object_counts"],
+                "vision_analysis": vision_analysis
+            })
+        # Combine results
+        result_data = {
+            "video_url": video_url,
+            "question": question,
+            "total_frames": len(frame_objects),
+            "yolo_analysis": frame_objects,
+            "frames_with_objects": frames_with_objects
+        }
+        if detailed_analyses:
+            result_data["detailed_vision_analysis"] = detailed_analyses
+            result_data["vision_frames_analyzed"] = len(detailed_analyses)
+        return json.dumps(result_data, indent=2)
+    except Exception as e:
+        # Failures are reported as text so the calling agent can react to them.
+        return f"Error analyzing video content: {str(e)}"
+def general_tools():
+    """Return a fresh list of the general-purpose tools.
+
+    The three search wrappers (DuckDuckGo, Wikipedia, arXiv) are instantiated
+    on every call, followed by the image, Python-file and audio helpers.
+    """
+    search_tools = [
+        DuckDuckGoSearchRun(),
+        WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper()),
+        ArxivQueryRun(api_wrapper=ArxivAPIWrapper()),
+    ]
+    helper_tools = [analyze_image, read_python_file, transcribe_audio]
+    return search_tools + helper_tools
+def analyze_video_tools():
+    """Return a fresh list of the visual video-analysis tools."""
+    return [object_detection, analyze_video_content]
+def youtube_transcript_tools():
+    """Return a fresh list holding the YouTube transcript tool."""
+    return [get_youtube_transcript]
 def file_agent_tools():
+    """Return a fresh list holding the Excel reader tool."""
+    return [read_excel]