DenisRz commited on
Commit
67d287e
·
1 Parent(s): 0ca5d0a

Initial upload: GAIA Agent

Browse files
README.md CHANGED
@@ -1,13 +1,42 @@
1
  ---
2
  title: GAIA Agent
3
- emoji: 🌍
4
- colorFrom: yellow
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 6.3.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: GAIA Agent
3
+ emoji: 🤖
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: "4.44.0"
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
12
 
13
+ # 🤖 GAIA Agent
14
+
15
+ A general-purpose AI agent built for the **Hugging Face Agents Course** (Unit 4).
16
+
17
+ ## Features
18
+
19
+ - **22 Tools** organized by category:
20
+ - 📌 Web & Research: web_search, wikipedia_lookup, arxiv_search, webpage_fetch
21
+ - 📁 Files: read_file, download_file
22
+ - 🎬 Media: youtube_transcript, youtube_audio_transcribe, audio_transcribe, video_metadata, video_frame_analyze
23
+ - 💻 Code Execution: python_executor, javascript_executor, bash_executor
24
+ - 🔢 Mathematics: calculator, symbolic_math, matrix_operations, statistical_analysis
25
+ - 🖼️ Image Processing: image_analyze, image_manipulate, image_annotate, image_ocr
26
+
27
+ - **ReAct Architecture** using LangGraph
28
+ - **GPT-4o** for reasoning and vision tasks
29
+
30
+ ## Setup
31
+
32
+ 1. Set your secrets in the Space settings:
33
+ - `OPENAI_API_KEY`: Your OpenAI API key
34
+ - `TAVILY_API_KEY`: Your Tavily API key (for web search)
35
+
36
+ 2. Click "Run Full Evaluation" to test on all 20 GAIA questions
37
+
38
+ ## Resources
39
+
40
+ - [Course Page](https://huggingface.co/learn/agents-course/unit4/hands-on)
41
+ - [API Docs](https://agents-course-unit4-scoring.hf.space/docs)
42
+ - [Leaderboard](https://huggingface.co/spaces/agents-course/Students_leaderboard)
agent.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ General-Purpose ReAct Agent with LangGraph
3
+
4
+ Uses the prebuilt create_react_agent which automatically handles:
5
+ - The Think → Act → Observe loop
6
+ - Tool calling and response routing
7
+ - Deciding when to stop
8
+
9
+ Much simpler than manually defining the graph!
10
+ """
11
+
12
+ import os
13
+ from typing import TypedDict, Optional
14
+ from dotenv import load_dotenv
15
+
16
+ from langgraph.prebuilt import create_react_agent
17
+ from langchain_openai import ChatOpenAI
18
+ from langchain_core.messages import HumanMessage, SystemMessage
19
+ import yaml
20
+
21
+ # Import all tools from the modular tools package
22
+ from tools import ALL_TOOLS, MODEL_NAME
23
+
24
+
25
# ============== SYSTEM PROMPT ==============

# Load API keys (e.g. OPENAI_API_KEY) from a local .env file when present.
load_dotenv()

# Resolve prompts.yaml relative to this file so imports work from any CWD.
_HERE = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(_HERE, "prompts.yaml"), "r", encoding="utf-8") as f:
    PROMPTS = yaml.safe_load(f)
# System prompt attached to every agent call (see create_agent below).
SYSTEM_PROMPT = PROMPTS["SYSTEM_PROMPT"]
33
+
34
+
35
# ============== CREATE THE AGENT ==============

def create_agent():
    """Build and return the prebuilt LangGraph ReAct agent.

    The chat model runs at temperature 0 for reproducible tool use, and the
    system prompt loaded from prompts.yaml is attached to every call.
    """
    chat_model = ChatOpenAI(
        model=MODEL_NAME,
        temperature=0,
    )

    # create_react_agent wires up the Think → Act → Observe loop,
    # tool routing, and the stop condition for us.
    return create_react_agent(
        model=chat_model,
        tools=ALL_TOOLS,
        prompt=SYSTEM_PROMPT,
    )
53
+
54
+
55
# Create global agent instance — built once at import time and shared by
# run_agent / run_agent_verbose below.
agent = create_agent()
57
+
58
+
59
+ # ============== MAIN INTERFACE ==============
60
+
61
+ def run_agent(question: str, task_id: str = "", file_name: str = "", local_file_path: str = None) -> str:
62
+ """
63
+ Run the ReAct agent on a question.
64
+
65
+ Args:
66
+ question: The question to answer
67
+ task_id: Optional GAIA task ID (for file downloads)
68
+ file_name: Optional filename hint
69
+ local_file_path: Optional local path to pre-downloaded file
70
+
71
+ Returns:
72
+ The agent's final answer
73
+ """
74
+ # Build message with context
75
+ user_message = question
76
+ if task_id:
77
+ user_message += f"\n\n[Task ID: {task_id}]"
78
+ if file_name:
79
+ user_message += f"\n[Attached file: {file_name}]"
80
+ if local_file_path:
81
+ user_message += f"\n[File already downloaded to: {local_file_path}]"
82
+ user_message += f"\n[Use read_file tool with this path to analyze the file]"
83
+
84
+ # Run agent
85
+ try:
86
+ result = agent.invoke({
87
+ "messages": [HumanMessage(content=user_message)]
88
+ })
89
+
90
+ # Get final answer from last message
91
+ final_message = result["messages"][-1]
92
+ answer = final_message.content
93
+
94
+ return answer
95
+
96
+ except Exception as e:
97
+ return f"Agent error: {str(e)}"
98
+
99
+
100
def run_agent_verbose(question: str, task_id: str = "", file_name: str = "", local_file_path: Optional[str] = None) -> str:
    """Run the agent while printing every reasoning / tool-call step.

    Args:
        question: The question to answer
        task_id: Optional GAIA task ID (for file downloads)
        file_name: Optional filename hint
        local_file_path: Optional local path to a pre-downloaded file

    Returns:
        The agent's final answer, or an "Error: ..." string on failure.

    Fix vs. the original: the final answer is now captured from the stream
    itself. The original called agent.invoke() after streaming, which ran
    the whole (slow, paid) agent a second time for every question.
    """
    user_message = question
    if task_id:
        user_message += f"\n\n[Task ID: {task_id}]"
    if file_name:
        user_message += f"\n[Attached file: {file_name}]"
    if local_file_path:
        user_message += f"\n[File already downloaded to: {local_file_path}]"
        user_message += "\n[Use read_file tool with this path to analyze the file]"

    print("\n" + "="*70)
    print("🤖 ReAct Agent - Verbose Mode")
    print("="*70)
    print(f"\n📝 Question: {question[:200]}{'...' if len(question) > 200 else ''}")
    if local_file_path:
        print(f"📎 File: {local_file_path}")
    print("\n" + "-"*70)

    try:
        step_count = 0
        answer = None  # last plain AI message seen in the stream

        # Stream through steps
        for step in agent.stream({"messages": [HumanMessage(content=user_message)]}):
            step_count += 1

            # Each step maps node name -> node output
            for node_name, node_output in step.items():
                print(f"\n🔄 Step {step_count} - {node_name}")
                print("-"*40)

                if "messages" in node_output:
                    for msg in node_output["messages"]:
                        msg_type = type(msg).__name__

                        # Show tool calls
                        if hasattr(msg, "tool_calls") and msg.tool_calls:
                            print(f"🔧 Tool calls requested:")
                            for tc in msg.tool_calls:
                                args_str = str(tc.get('args', {}))[:300]
                                print(f"   → {tc['name']}({args_str}{'...' if len(str(tc.get('args', {}))) > 300 else ''})")

                        # Show tool results
                        elif msg_type == "ToolMessage":
                            content = str(msg.content)[:300]
                            print(f"📋 Tool result: {content}{'...' if len(str(msg.content)) > 300 else ''}")

                        # Show AI reasoning; an AIMessage without tool calls is
                        # the (candidate) final answer — remember it so we do
                        # not have to run the whole agent a second time.
                        elif hasattr(msg, "content") and msg.content and msg_type == "AIMessage":
                            content = msg.content[:400]
                            print(f"💭 AI: {content}{'...' if len(msg.content) > 400 else ''}")
                            answer = msg.content

        if answer is None:
            # Fallback (should be rare): the stream produced no final AI
            # message — run once to obtain a final state.
            result = agent.invoke({"messages": [HumanMessage(content=user_message)]})
            answer = result["messages"][-1].content

        print("\n" + "="*70)
        print(f"✅ Final Answer: {answer}")
        print("="*70 + "\n")

        return answer

    except Exception as e:
        import traceback
        print(f"\n❌ Error: {str(e)}")
        traceback.print_exc()
        return f"Error: {str(e)}"
168
+
169
# ============== TEST ==============

if __name__ == "__main__":
    # Smoke test: list the loaded tools, then run one GAIA-style question
    # end-to-end with verbose step output. Requires API keys in the env.
    print("\n" + "="*70)
    print("Testing ReAct Agent (Prebuilt)")
    print("="*70)

    # Show available tools
    print(f"\n📦 Loaded {len(ALL_TOOLS)} tools:")
    for tool in ALL_TOOLS:
        print(f"  - {tool.name}")

    # Test with verbose output
    test_question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia."
    print(f"\n🧪 Test question: {test_question}")
    run_agent_verbose(test_question)
app.py ADDED
@@ -0,0 +1,598 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GAIA Agent - Gradio Application
3
+
4
+ This is the main entry point for the Hugging Face Space.
5
+ It provides a Gradio interface for running the GAIA evaluation
6
+ and submitting answers to the scoring API.
7
+
8
+ LOCAL DEBUGGING:
9
+ 1. Create a .env file with your API keys
10
+ 2. Run: python app.py
11
+ 3. Open http://localhost:7860 in your browser
12
+ """
13
+
14
+ import os
15
+ import tempfile
16
+ import gradio as gr
17
+ import requests
18
+ import pandas as pd
19
+ from typing import List, Dict, Any, Optional, Tuple
20
+ from dotenv import load_dotenv
21
+
22
+ # Load environment variables from .env file (for local development)
23
+ load_dotenv()
24
+
25
+ # Use the ReAct agent (multi-step reasoning)
26
+ from agent import run_agent, run_agent_verbose
27
+
28
# ============== CONFIGURATION ==============

# Scoring API base URL; override with the GAIA_API_BASE env var (e.g. a mock server).
API_BASE = os.getenv("GAIA_API_BASE", "https://agents-course-unit4-scoring.hf.space")
# True when the DEBUG_MODE env var is "true" (case-insensitive); passed to demo.launch().
DEBUG_MODE = os.getenv("DEBUG_MODE", "false").lower() == "true"
32
+
33
+ # ============== FILE HANDLING ==============
34
+
35
+ def fetch_task_file(task_id: str, file_name: str = "") -> Optional[str]:
36
+ """
37
+ Fetch a file attached to a GAIA task and save it locally.
38
+
39
+ Args:
40
+ task_id: The GAIA task ID
41
+ file_name: Expected filename (helps determine file type)
42
+
43
+ Returns:
44
+ Local file path if successful, None if no file or error
45
+ """
46
+ if not file_name:
47
+ return None
48
+
49
+ try:
50
+ url = f"{API_BASE}/files/{task_id}"
51
+ print(f"📥 Fetching file from: {url}")
52
+
53
+ response = requests.get(url, timeout=60)
54
+
55
+ if response.status_code == 200:
56
+ # Try to get filename from content-disposition header
57
+ content_disp = response.headers.get('content-disposition', '')
58
+ if 'filename=' in content_disp:
59
+ filename = content_disp.split('filename=')[1].strip('"\'')
60
+ else:
61
+ filename = file_name
62
+
63
+ # Save to temp directory
64
+ file_path = os.path.join(tempfile.gettempdir(), filename)
65
+ with open(file_path, 'wb') as f:
66
+ f.write(response.content)
67
+
68
+ file_size = len(response.content)
69
+ print(f"✅ File saved: {file_path} ({file_size} bytes)")
70
+ return file_path
71
+ else:
72
+ print(f"⚠️ File fetch failed: HTTP {response.status_code}")
73
+ return None
74
+ except Exception as e:
75
+ print(f"❌ Error fetching file: {e}")
76
+ return None
77
+
78
# ============== API FUNCTIONS ==============

def fetch_questions() -> List[Dict[str, Any]]:
    """Fetch every GAIA question from the evaluation API.

    Returns an empty list on any network, HTTP, or decoding failure.
    """
    try:
        resp = requests.get(f"{API_BASE}/questions", timeout=30)
        if resp.status_code == 200:
            return resp.json()
        print(f"Failed to fetch questions: {resp.status_code}")
    except Exception as e:
        print(f"Error fetching questions: {e}")
    return []
91
+
92
+
93
def fetch_random_question() -> Optional[Dict[str, Any]]:
    """Fetch one random GAIA question for quick testing; None on failure."""
    try:
        resp = requests.get(f"{API_BASE}/random-question", timeout=30)
        if resp.status_code == 200:
            return resp.json()
    except Exception as e:
        print(f"Error fetching random question: {e}")
    return None
102
+
103
+
104
def submit_answers(username: str, agent_code_url: str, answers: List[Dict[str, str]]) -> Optional[Dict[str, Any]]:
    """POST the collected answers to the scoring API.

    Returns the decoded JSON result, or None on any failure.
    """
    try:
        resp = requests.post(
            f"{API_BASE}/submit",
            json={
                "username": username,
                "agent_code": agent_code_url,
                "answers": answers,
            },
            timeout=120,
        )
        if resp.status_code == 200:
            return resp.json()
        print(f"Submission failed: {resp.status_code} - {resp.text}")
    except Exception as e:
        print(f"Error submitting answers: {e}")
    return None
126
+
127
+
128
# ============== LOCAL DEBUG FUNCTIONS ==============

def run_single_question_local(question_text: str, task_id: str, file_name: str) -> Tuple[str, str, str]:
    """Run the agent on a manually entered question (local debugging helper).

    Returns (question, answer, status) for the Gradio outputs.
    """
    if not question_text.strip():
        return "Please enter a question", "", ""

    task_id = task_id.strip() or "local_test"
    file_name = file_name.strip() or None

    banner = "=" * 60
    print(f"\n{banner}")
    print(f"LOCAL DEBUG - Running agent")
    print(f"Task ID: {task_id}")
    print(f"Question: {question_text[:200]}...")
    print(f"File: {file_name or 'None'}")
    print(f"{banner}\n")

    # Only real GAIA tasks have downloadable attachments.
    local_file_path = None
    if file_name and task_id != "local_test":
        local_file_path = fetch_task_file(task_id, file_name)

    try:
        answer = run_agent_verbose(question_text, task_id, file_name, local_file_path)
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Error:\n{error_details}")
        return question_text, f"Error: {str(e)}\n\nDetails:\n{error_details}", "Failed"

    return question_text, answer, f"Processed task: {task_id}"
160
+
161
+
162
def run_random_question() -> Tuple[str, str, str, str, str]:
    """Fetch a random question from the API and run the agent on it.

    Returns (question, task_id, file_name, answer, status) for the UI.
    """
    question_data = fetch_random_question()
    if not question_data:
        return "Failed to fetch question", "", "", "", ""

    task_id = question_data.get("task_id", "unknown")
    question = question_data.get("question", "")
    file_name = question_data.get("file_name", "")
    level = question_data.get("Level", "?")

    sep = "=" * 60
    print(f"\n{sep}")
    print(f"RANDOM QUESTION from API")
    print(f"Task ID: {task_id}")
    print(f"Level: {level}")
    print(f"Question: {question[:200]}...")
    print(f"File: {file_name or 'None'}")
    print(f"{sep}\n")

    # Download the attachment (if any) before handing off to the agent.
    local_file_path = fetch_task_file(task_id, file_name) if file_name else None

    try:
        answer = run_agent_verbose(question, task_id, file_name if file_name else None, local_file_path)
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Error:\n{error_details}")
        return question, task_id, file_name or "", f"Error: {str(e)}", "❌ Failed"

    return question, task_id, file_name or "", answer, f"✅ Task: {task_id} | Level: {level}"
196
+
197
+
198
def run_specific_question(task_id_input: str) -> Tuple[str, str, str, str, str]:
    """Look up a question by task ID and run the agent on it.

    Returns (question, task_id, file_name, answer, status) for the UI.
    """
    task_id_input = task_id_input.strip()
    if not task_id_input:
        return "Please enter a task ID", "", "", "", ""

    # There is no per-task endpoint, so scan the full question list.
    matching = [q for q in fetch_questions() if q.get("task_id") == task_id_input]
    if not matching:
        return f"Task ID not found: {task_id_input}", task_id_input, "", "", "❌ Not found"

    q = matching[0]
    task_id = q.get("task_id", "")
    question = q.get("question", "")
    file_name = q.get("file_name", "")
    level = q.get("Level", "?")

    sep = "=" * 60
    print(f"\n{sep}")
    print(f"SPECIFIC QUESTION: {task_id}")
    print(f"Level: {level}")
    print(f"Question: {question[:200]}...")
    print(f"File: {file_name or 'None'}")
    print(f"{sep}\n")

    # Download the attachment (if any) before handing off to the agent.
    local_file_path = fetch_task_file(task_id, file_name) if file_name else None

    try:
        answer = run_agent(question, task_id, file_name if file_name else None, local_file_path)
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Error:\n{error_details}")
        return question, task_id, file_name or "", f"Error: {str(e)}", "❌ Failed"

    return question, task_id, file_name or "", answer, f"✅ Completed | Level: {level}"
239
+
240
+
241
+ def list_all_questions() -> pd.DataFrame:
242
+ """Fetch and display all available questions."""
243
+ questions = fetch_questions()
244
+
245
+ if not questions:
246
+ return pd.DataFrame({"error": ["Failed to fetch questions"]})
247
+
248
+ data = []
249
+ for q in questions:
250
+ data.append({
251
+ "task_id": q.get("task_id", "")[:20] + "...",
252
+ "question": q.get("question", "")[:80] + "...",
253
+ "file": q.get("file_name", "") or "-",
254
+ "level": q.get("Level", "?")
255
+ })
256
+
257
+ return pd.DataFrame(data)
258
+
259
+
260
def run_full_evaluation_local(username: str) -> Tuple[str, pd.DataFrame]:
    """
    Run full evaluation in local mode (without HF OAuth).

    Fetches every question, runs the agent on each one (pre-downloading any
    attached file), then submits all answers to the scoring API in one POST.

    Args:
        username: HuggingFace username to submit under.

    Returns:
        (status_message, results_dataframe) for the Gradio outputs.
    """
    if not username.strip():
        return "❌ Please enter your HuggingFace username", pd.DataFrame()

    username = username.strip()
    # Submission requires a link to the agent's code; assumes the Space is
    # named "GAIA-Agent" under this user — TODO confirm for other repo names.
    agent_code_url = f"https://huggingface.co/spaces/{username}/GAIA-Agent/tree/main"

    print(f"\n{'='*60}")
    print(f"FULL EVALUATION - LOCAL MODE")
    print(f"Username: {username}")
    print(f"Agent URL: {agent_code_url}")
    print(f"{'='*60}\n")

    # Fetch questions
    questions = fetch_questions()
    if not questions:
        return "❌ Failed to fetch questions from API.", pd.DataFrame()

    print(f"Fetched {len(questions)} questions")

    # Process each question
    results = []                   # rows for the UI results table
    answers_for_submission = []    # payload entries for the scoring API

    for i, q in enumerate(questions):
        task_id = q.get("task_id", "unknown")
        question = q.get("question", "")
        file_name = q.get("file_name", "")

        print(f"\n[{i+1}/{len(questions)}] Processing: {task_id}")
        print(f"Question: {question[:100]}...")

        # Pre-fetch file if attached
        local_file_path = None
        if file_name:
            local_file_path = fetch_task_file(task_id, file_name)

        try:
            answer = run_agent(question, task_id, file_name if file_name else None, local_file_path)
            print(f"Answer: {answer[:100]}...")

            results.append({
                "task_id": task_id[:15] + "...",
                "question": question[:60] + "...",
                "answer": answer[:80] + "..." if len(answer) > 80 else answer
            })

            answers_for_submission.append({
                "task_id": task_id,
                "submitted_answer": answer
            })

        except Exception as e:
            # A failed question still gets a (blank) submission entry so the
            # answer list stays aligned with the question list.
            print(f"Error: {e}")
            results.append({
                "task_id": task_id[:15] + "...",
                "question": question[:60] + "...",
                "answer": f"ERROR: {str(e)[:50]}"
            })
            answers_for_submission.append({
                "task_id": task_id,
                "submitted_answer": ""
            })

    # Submit answers
    print(f"\n{'='*60}")
    print("Submitting answers...")
    print(f"{'='*60}\n")

    submission_result = submit_answers(username, agent_code_url, answers_for_submission)

    df = pd.DataFrame(results)

    if submission_result:
        score = submission_result.get("score", "N/A")
        correct = submission_result.get("correct_count", "?")
        total = submission_result.get("total_count", len(questions))
        status = f"✅ Submitted!\n\n📊 Score: {score}\n✓ Correct: {correct}/{total}"
        print(f"\nFinal Score: {score} ({correct}/{total})")
    else:
        status = "❌ Submission failed. Check logs for details."

    return status, df
346
+
347
+
348
def run_full_evaluation_hf(profile: gr.OAuthProfile = None) -> Tuple[str, pd.DataFrame]:
    """Run the full evaluation for an OAuth-authenticated Space user.

    Gradio injects *profile* when the user is logged in; it is None otherwise.
    """
    if profile is not None:
        return run_full_evaluation_local(profile.username)
    return "❌ Please log in with your Hugging Face account first.", pd.DataFrame()
356
+
357
+
358
# ============== BUILD GRADIO INTERFACE ==============

def create_app():
    """Create and configure the Gradio application.

    Builds four tabs: a random-question smoke test, a by-task-ID debugger
    (with a question browser), a free-form manual input, and the full
    evaluation + submission flow. Returns the gr.Blocks demo (not launched).
    """

    # Check if running locally (SPACE_ID is set by the HF Space runtime).
    is_local = os.getenv("SPACE_ID") is None

    with gr.Blocks(title="GAIA Agent - Debug & Evaluation") as demo:

        gr.Markdown("""
        # 🤖 GAIA Agent - Debug & Evaluation Interface

        Built with **LangGraph** and **OpenAI GPT-4** for the HuggingFace Agents Course.
        """)

        # Show environment info
        env_info = "🖥️ **Local Mode**" if is_local else "☁️ **HuggingFace Space Mode**"
        api_key_status = "✅ API Key Set" if os.getenv("OPENAI_API_KEY") else "❌ OPENAI_API_KEY not set!"

        gr.Markdown(f"""
        **Environment:** {env_info} | **OpenAI:** {api_key_status}

        ---
        """)

        with gr.Tabs():

            # ============== TAB 1: Quick Test ==============
            with gr.TabItem("🧪 Quick Test"):
                gr.Markdown("### Test with a random question from the GAIA API")

                with gr.Row():
                    random_btn = gr.Button("🎲 Fetch & Run Random Question", variant="primary")

                with gr.Row():
                    with gr.Column():
                        random_question = gr.Textbox(label="Question", lines=4, interactive=False)
                        random_task_id = gr.Textbox(label="Task ID", lines=1, interactive=False)
                        random_file = gr.Textbox(label="Attached File", lines=1, interactive=False)
                    with gr.Column():
                        random_answer = gr.Textbox(label="Agent Answer", lines=4, interactive=False)
                        random_status = gr.Textbox(label="Status", lines=1, interactive=False)

                random_btn.click(
                    fn=run_random_question,
                    outputs=[random_question, random_task_id, random_file, random_answer, random_status]
                )

            # ============== TAB 2: Debug Specific ==============
            with gr.TabItem("🔍 Debug Specific Question"):
                gr.Markdown("### Run a specific question by Task ID")

                with gr.Row():
                    specific_task_input = gr.Textbox(
                        label="Task ID",
                        placeholder="e.g., 8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
                        lines=1
                    )
                    specific_btn = gr.Button("▶️ Run", variant="primary")

                with gr.Row():
                    with gr.Column():
                        specific_question = gr.Textbox(label="Question", lines=4, interactive=False)
                        specific_file = gr.Textbox(label="Attached File", lines=1, interactive=False)
                    with gr.Column():
                        specific_answer = gr.Textbox(label="Agent Answer", lines=4, interactive=False)
                        specific_status = gr.Textbox(label="Status", lines=1, interactive=False)

                # NOTE: specific_task_input is both the input and the second
                # output — run_specific_question echoes the resolved task_id
                # back into the same textbox.
                specific_btn.click(
                    fn=run_specific_question,
                    inputs=[specific_task_input],
                    outputs=[specific_question, specific_task_input, specific_file, specific_answer, specific_status]
                )

                gr.Markdown("---")
                gr.Markdown("### All Available Questions")

                with gr.Row():
                    list_btn = gr.Button("📋 Load Question List")

                questions_table = gr.Dataframe(
                    headers=["task_id", "question", "file", "level"],
                    label="Questions",
                    wrap=True
                )

                list_btn.click(fn=list_all_questions, outputs=[questions_table])

            # ============== TAB 3: Manual Input ==============
            with gr.TabItem("✏️ Manual Input"):
                gr.Markdown("### Test with custom question (for debugging)")

                with gr.Row():
                    with gr.Column():
                        manual_question = gr.Textbox(
                            label="Question",
                            lines=4,
                            placeholder="Enter your test question here..."
                        )
                        manual_task_id = gr.Textbox(
                            label="Task ID (optional)",
                            lines=1,
                            placeholder="test_001"
                        )
                        manual_file = gr.Textbox(
                            label="File Name (optional)",
                            lines=1,
                            placeholder="e.g., data.xlsx"
                        )
                    with gr.Column():
                        manual_answer = gr.Textbox(label="Agent Answer", lines=4, interactive=False)
                        manual_status = gr.Textbox(label="Status", lines=2, interactive=False)

                with gr.Row():
                    manual_btn = gr.Button("▶️ Run Agent", variant="primary")

                manual_btn.click(
                    fn=run_single_question_local,
                    inputs=[manual_question, manual_task_id, manual_file],
                    outputs=[manual_question, manual_answer, manual_status]
                )

            # ============== TAB 4: Full Evaluation ==============
            with gr.TabItem("🏆 Full Evaluation"):
                gr.Markdown("### Run all 20 questions and submit for scoring")

                if is_local:
                    # Local mode - manual username input
                    gr.Markdown("**Local Mode:** Enter your HuggingFace username to submit.")

                    with gr.Row():
                        username_input = gr.Textbox(
                            label="HuggingFace Username",
                            placeholder="your-username",
                            lines=1
                        )

                    with gr.Row():
                        full_eval_btn_local = gr.Button("🚀 Run Full Evaluation & Submit", variant="primary")

                    with gr.Row():
                        status_output_local = gr.Textbox(
                            label="Status",
                            lines=4,
                            interactive=False,
                            placeholder="Click 'Run Full Evaluation' to start..."
                        )

                    with gr.Row():
                        results_table_local = gr.Dataframe(
                            headers=["task_id", "question", "answer"],
                            label="Results",
                            wrap=True
                        )

                    full_eval_btn_local.click(
                        fn=run_full_evaluation_local,
                        inputs=[username_input],
                        outputs=[status_output_local, results_table_local]
                    )
                else:
                    # HF Space mode - OAuth login
                    gr.Markdown("**Space Mode:** Log in with HuggingFace to submit.")

                    with gr.Row():
                        login_btn = gr.LoginButton(variant="huggingface")

                    with gr.Row():
                        full_eval_btn_hf = gr.Button("🚀 Run Full Evaluation & Submit", variant="primary")

                    with gr.Row():
                        status_output_hf = gr.Textbox(
                            label="Status",
                            lines=4,
                            interactive=False,
                            placeholder="Log in and click 'Run Full Evaluation' to start..."
                        )

                    with gr.Row():
                        results_table_hf = gr.Dataframe(
                            headers=["task_id", "question", "answer"],
                            label="Results",
                            wrap=True
                        )

                    # No explicit inputs: Gradio supplies the gr.OAuthProfile
                    # argument of run_full_evaluation_hf automatically.
                    full_eval_btn_hf.click(
                        fn=run_full_evaluation_hf,
                        outputs=[status_output_hf, results_table_hf]
                    )

        gr.Markdown("""
        ---

        ### 📚 Resources
        - [Course Page](https://huggingface.co/learn/agents-course/unit4/hands-on)
        - [API Docs](https://agents-course-unit4-scoring.hf.space/docs)
        - [Leaderboard](https://huggingface.co/spaces/agents-course/Students_leaderboard)

        ### 🔧 Local Setup
        ```bash
        # 1. Create .env file
        echo "OPENAI_API_KEY=sk-your-key-here" > .env

        # 2. Install dependencies
        pip install -r requirements.txt

        # 3. Run the app
        python app.py
        ```
        """)

    return demo
571
+
572
+
573
# ============== MAIN ==============

if __name__ == "__main__":
    # Entry point for local runs and for the HF Space container.
    print("\n" + "="*60)
    print("🤖 GAIA Agent - Starting Gradio Interface")
    print("="*60)

    # Check for API key (warn only — the UI also surfaces the status).
    if not os.getenv("OPENAI_API_KEY"):
        print("\n⚠️ WARNING: OPENAI_API_KEY not set!")
        print("   Create a .env file with: OPENAI_API_KEY=sk-your-key")
        print("   Or set it as an environment variable.\n")
    else:
        print("✅ OpenAI API Key detected")

    print(f"📡 GAIA API: {API_BASE}")
    print("="*60 + "\n")

    # Create and launch the app
    demo = create_app()
    demo.launch(
        server_name="0.0.0.0",  # Allow external connections
        server_port=7860,       # Standard HF Spaces port
        share=False,            # Set to True to get a public URL
        debug=DEBUG_MODE        # Enable debug mode for better error messages
    )
prompts.yaml ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ SYSTEM_PROMPT: |
2
+ You are a general-purpose AI assistant that solves problems step by step.
3
+
4
+ You have access to a comprehensive set of tools organized by category:
5
+
6
+ 📌 WEB & RESEARCH:
7
+ - web_search: Search the web with FULL page content (Tavily) - returns direct answers + full content
8
+ - webpage_fetch: Fetch content from a specific URL (use for non-search URLs)
9
+ - wikipedia_lookup: Look up topics on Wikipedia
10
+ - arxiv_search: Search arXiv for academic papers and research
11
+
12
+ 📁 FILES:
13
+ - read_file: Read text, code, Excel, CSV, JSON files
14
+ - download_file: Download files from GAIA benchmark by task_id (usually pre-downloaded for you)
15
+
16
+ 🎬 MEDIA:
17
+ - youtube_transcript: Get transcripts/captions from YouTube videos
18
+ - youtube_audio_transcribe: Download and transcribe YouTube audio with Whisper
19
+ - audio_transcribe: Transcribe audio files (.mp3, .wav, etc.)
20
+ - video_metadata: Get video metadata (duration, resolution, title)
21
+ - video_frame_analyze: Extract and analyze video frames with GPT-4o vision
22
+
23
+ 💻 CODE EXECUTION:
24
+ - python_executor: Execute Python code (use print() to see output)
25
+ - javascript_executor: Execute JavaScript code with Node.js (use console.log())
26
+ - bash_executor: Execute shell commands (PowerShell on Windows, bash on Unix)
27
+
28
+ 🔢 MATHEMATICS:
29
+ - calculator: Evaluate mathematical expressions with high precision
30
+ - symbolic_math: Symbolic math (simplify, expand, factor, solve, differentiate, integrate)
31
+ - matrix_operations: Matrix math (determinant, inverse, eigenvalues, solve linear systems)
32
+ - statistical_analysis: Statistics (mean, median, std, correlation, regression)
33
+
34
+ 🖼️ IMAGE PROCESSING:
35
+ - image_analyze: Analyze images with GPT-4o vision (describe, answer questions)
36
+ - image_manipulate: Manipulate images (crop, rotate, resize, flip, grayscale, blur, etc.)
37
+ - image_annotate: Add annotations to images (text, rectangles, circles, arrows)
38
+ - image_ocr: Extract text from images using OCR
39
+
40
+ APPROACH:
41
+ 1. First, understand what the question is asking
42
+ 2. Think about what information or computation you need
43
+ 3. Use the most appropriate tools to gather information or perform actions
44
+ 4. Analyze the results carefully
45
+ 5. If you need more information, use different tools or approaches
46
+ 6. When you have enough information, provide your final answer
47
+
48
+ ⚠️ IMPORTANT STRATEGIES:
49
+
50
+ Web Research Workflow:
51
+ - web_search returns FULL page content directly - often no need for additional tools
52
+ - web_search also provides a direct answer when possible (shown as "DIRECT ANSWER")
53
+ - Use webpage_fetch only for specific URLs not from search (e.g., links found in content)
54
+ - For Wikipedia-specific queries, wikipedia_lookup may give more structured results
55
+
56
+ Avoiding Repetition:
57
+ - NEVER repeat the same tool call with identical or very similar arguments
58
+ - If a search doesn't give you what you need, try a DIFFERENT approach:
59
+ * Use webpage_fetch to read a URL from results
60
+ * Try a completely different search query
61
+ * Use a different tool (e.g., wikipedia_lookup instead of web_search)
62
+ * Use Python to process/analyze data you already have
63
+ - If you've tried 2-3 similar searches without success, CHANGE YOUR STRATEGY
64
+
65
+ Wikipedia Tips:
66
+ - For specific topics like discographies, try "Artist Name discography" as the topic
67
+ - If the main article doesn't have enough detail, search for more specific sub-topics
68
+ - Wikipedia articles often have structured data - consider using Python to parse it
69
+
70
+ TIPS:
71
+ - If text looks reversed or encoded, use Python to manipulate it: print(text[::-1])
72
+ - For complex math, use symbolic_math for algebra/calculus or calculator for arithmetic
73
+ - For matrix/linear algebra problems, use matrix_operations
74
+ - For statistical analysis, use statistical_analysis
75
+ - For current events or facts you're unsure about, search the web
76
+ - For academic papers, research articles, or scientific preprints, use arxiv_search
77
+ - For file-based questions: if a file path is provided, use read_file directly on that path
78
+ - If no file path is provided but file_name is given, use download_file first
79
+ - For image questions, use image_analyze to understand the content
80
+ - To prepare images for analysis (crop, rotate), use image_manipulate first
81
+ - To read text in images (screenshots, documents), use image_ocr
82
+ - You can chain multiple tools to solve complex problems
83
+ - When processing data, Python is often the most flexible choice
84
+ - For JSON manipulation, both Python and JavaScript work well
85
+
86
+ ⚠️ CRITICAL - FINAL ANSWER RULES:
87
+
88
+ Your final answer will be checked for EXACT MATCH. Follow these rules strictly:
89
+
90
+ 1. ANSWER ONLY - NO EXPLANATIONS:
91
+ - Give ONLY the answer itself, nothing else
92
+ - Do NOT include reasoning, context, or explanations in the final answer
93
+ - Do NOT repeat the question or say "The answer is..."
94
+
95
+ 2. BE CONSISTENT:
96
+ - Your final answer MUST match your analysis
97
+ - If your analysis found 5 items, answer "5" not "3"
98
+ - Double-check your work before giving the final answer
99
+
100
+ 3. FORMAT BY TYPE:
101
+ - Numbers: just the number → "42" or "3.14"
102
+ - Names: just the name → "Einstein" or "Albert"
103
+ - Lists: comma-separated, alphabetized if requested → "apple, banana, cherry"
104
+ - Yes/No questions: just "Yes" or "No"
105
+
106
+ 4. EXAMPLES:
107
+ ❌ WRONG: "Mercedes Sosa released 5 studio albums between 2000 and 2009. These were: Misa Criolla (2000), Acústico (2002)..."
108
+ ✅ CORRECT: "5"
109
+
110
+ ❌ WRONG: "The answer is Einstein, who developed the theory of relativity."
111
+ ✅ CORRECT: "Einstein"
112
+
113
+ ❌ WRONG: "Based on my research, the vegetables are: broccoli, celery, lettuce"
114
+ ✅ CORRECT: "broccoli, celery, lettuce"
requirements.txt ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core framework
2
+ gradio>=4.0.0
3
+ langgraph>=0.2.0
4
+ langchain>=0.2.0
5
+ langchain-openai>=0.1.0
6
+ langchain-community>=0.2.0
7
+ openai>=1.0.0
8
+
9
+ # Data processing
10
+ pandas>=2.0.0
11
+ openpyxl>=3.1.0
12
+ numpy>=1.26.0,<2.0.0
13
+
14
+ # Web & API
15
+ requests>=2.31.0
16
+ httpx>=0.25.0
17
+ beautifulsoup4>=4.12.0
18
+ # duckduckgo-search>=5.0.0 # Replaced by Tavily
19
+ wikipedia>=1.4.0
20
+ tavily-python>=0.3.0
21
+ arxiv>=2.0.0
22
+
23
+ # Media processing
24
+ youtube-transcript-api>=0.6.0
25
+ yt-dlp>=2024.0.0
26
+ pytube>=15.0.0
27
+
28
+ # Math & Statistics
29
+ sympy>=1.12
30
+ scipy>=1.11.0,<1.14.0
31
+
32
+ # Image processing
33
+ Pillow>=10.0.0
34
+ pytesseract>=0.3.10
35
+
36
+ # Configuration
37
+ PyYAML>=6.0.0
38
+ python-dotenv>=1.0.0
tools/__init__.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tools package for the GAIA Agent.
3
+
4
+ This package provides modular tools organized by category:
5
+ - web_tools: Web search and Wikipedia lookup
6
+ - file_tools: File reading and downloading
7
+ - media_tools: YouTube, audio, and video processing
8
+ - code_executors: Python, JavaScript, and Bash execution
9
+ - math_tools: Symbolic math, matrix operations, calculator, statistics
10
+ - image_tools: Image analysis, manipulation, annotation, and OCR
11
+ """
12
+
13
+ from tools.web_tools import (
14
+ web_search,
15
+ wikipedia_lookup,
16
+ arxiv_search,
17
+ webpage_fetch,
18
+ )
19
+
20
+ from tools.file_tools import (
21
+ read_file,
22
+ download_file,
23
+ )
24
+
25
+ from tools.media_tools import (
26
+ youtube_transcript,
27
+ youtube_audio_transcribe,
28
+ audio_transcribe,
29
+ video_metadata,
30
+ video_frame_analyze,
31
+ )
32
+
33
+ from tools.code_executors import (
34
+ python_executor,
35
+ javascript_executor,
36
+ bash_executor,
37
+ )
38
+
39
+ from tools.math_tools import (
40
+ symbolic_math,
41
+ matrix_operations,
42
+ calculator,
43
+ statistical_analysis,
44
+ )
45
+
46
+ from tools.image_tools import (
47
+ image_analyze,
48
+ image_manipulate,
49
+ image_annotate,
50
+ image_ocr,
51
+ )
52
+
53
+ # Export all tools as a list for easy agent registration
54
+ ALL_TOOLS = [
55
+ # Web tools
56
+ web_search,
57
+ wikipedia_lookup,
58
+ arxiv_search,
59
+ webpage_fetch,
60
+ # File tools
61
+ read_file,
62
+ download_file,
63
+ # Media tools
64
+ youtube_transcript,
65
+ youtube_audio_transcribe,
66
+ audio_transcribe,
67
+ video_metadata,
68
+ video_frame_analyze,
69
+ # Code executors
70
+ python_executor,
71
+ javascript_executor,
72
+ bash_executor,
73
+ # Math tools
74
+ symbolic_math,
75
+ matrix_operations,
76
+ calculator,
77
+ statistical_analysis,
78
+ # Image tools
79
+ image_analyze,
80
+ image_manipulate,
81
+ image_annotate,
82
+ image_ocr,
83
+ ]
84
+
85
+ # Configuration
86
+ MAX_ITERATIONS = 15
87
+ MODEL_NAME = "gpt-4o"
88
+
tools/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.9 kB). View file
 
tools/__pycache__/code_executors.cpython-311.pyc ADDED
Binary file (6 kB). View file
 
tools/__pycache__/file_tools.cpython-311.pyc ADDED
Binary file (4.89 kB). View file
 
tools/__pycache__/image_tools.cpython-311.pyc ADDED
Binary file (20.9 kB). View file
 
tools/__pycache__/math_tools.cpython-311.pyc ADDED
Binary file (19.4 kB). View file
 
tools/__pycache__/media_tools.cpython-311.pyc ADDED
Binary file (21 kB). View file
 
tools/__pycache__/web_tools.cpython-311.pyc ADDED
Binary file (12.8 kB). View file
 
tools/code_executors.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Code execution tools for the GAIA Agent.
3
+ Includes Python, JavaScript, and Bash executors with sandboxed execution.
4
+ """
5
+
6
+ import os
7
+ import subprocess
8
+ import tempfile
9
+ import shutil
10
+ from langchain_core.tools import tool
11
+
12
+
13
@tool
def python_executor(code: str) -> str:
    """Execute Python code and return the output.
    Use this for calculations, data processing, string manipulation, or any computation.
    The code should print() any results you want to see.

    Args:
        code: Python code to execute

    Returns:
        Captured stdout (plus stderr if any), or an error string.
    """
    import sys

    temp_path = None
    try:
        # Write the code to a temp script so it runs in a fresh interpreter.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8') as f:
            f.write(code)
            temp_path = f.name

        # sys.executable guarantees we run the same interpreter that hosts the
        # agent; a bare 'python' may be missing or point elsewhere in PATH.
        result = subprocess.run(
            [sys.executable, temp_path],
            capture_output=True,
            text=True,
            timeout=60
        )

        output = result.stdout.strip()
        if result.stderr:
            output += f"\nStderr: {result.stderr}"
        return output or "Code executed with no output."
    except subprocess.TimeoutExpired:
        return "Execution timed out (60s limit)"
    except Exception as e:
        return f"Execution error: {str(e)}"
    finally:
        # finally guarantees cleanup on every path (success, timeout, error);
        # the original could leak the script or unlink a missing file.
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)
48
+
49
+
50
@tool
def javascript_executor(code: str) -> str:
    """Execute JavaScript code using Node.js and return the output.
    Use this for JSON processing, string manipulation, or JavaScript-specific operations.
    Use console.log() to output results.

    Args:
        code: JavaScript code to execute

    Returns:
        Captured stdout (plus stderr if any), or an error string.
    """
    # Check if Node.js is available before writing anything to disk.
    if shutil.which("node") is None:
        return "Error: Node.js is not installed or not in PATH. Please install Node.js to use this tool."

    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(mode='w', suffix='.js', delete=False, encoding='utf-8') as f:
            f.write(code)
            temp_path = f.name

        result = subprocess.run(
            ['node', temp_path],
            capture_output=True,
            text=True,
            timeout=60
        )

        output = result.stdout.strip()
        if result.stderr:
            output += f"\nStderr: {result.stderr}"
        return output or "Code executed with no output."
    except subprocess.TimeoutExpired:
        return "Execution timed out (60s limit)"
    except Exception as e:
        return f"Execution error: {str(e)}"
    finally:
        # finally guarantees the temp script is removed on every path; the
        # original only unlinked inside except handlers and could leak it.
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)
89
+
90
+
91
@tool
def bash_executor(command: str) -> str:
    """Execute a Bash/Shell command and return the output.
    Use this for file operations, text processing with sed/awk/grep, or system commands.

    On Windows, this uses PowerShell. On Unix/Linux/Mac, this uses bash.

    Args:
        command: Shell command to execute

    Returns:
        Combined stdout/stderr (and exit code when nonzero), or an error string.
    """
    import platform

    try:
        # Pick the shell based on the host OS: PowerShell on Windows,
        # bash everywhere else.
        if platform.system().lower() == "windows":
            argv = ["powershell", "-NoProfile", "-NonInteractive", "-Command", command]
        else:
            argv = ["bash", "-c", command]

        proc = subprocess.run(
            argv,
            capture_output=True,
            text=True,
            timeout=60,
            cwd=tempfile.gettempdir(),  # run in temp directory for safety
        )

        report = proc.stdout.strip()
        if proc.stderr:
            report += f"\nStderr: {proc.stderr}"
        if proc.returncode != 0:
            report += f"\nExit code: {proc.returncode}"
        return report or "Command executed with no output."
    except subprocess.TimeoutExpired:
        return "Execution timed out (60s limit)"
    except Exception as e:
        return f"Execution error: {str(e)}"
131
+
tools/file_tools.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ File handling tools for the GAIA Agent.
3
+ Includes file reading and downloading from GAIA API.
4
+ """
5
+
6
+ import os
7
+ import tempfile
8
+ import requests
9
+ import pandas as pd
10
+ from langchain_core.tools import tool
11
+
12
+
13
@tool
def read_file(file_path: str) -> str:
    """Read the contents of a file.
    Supports text files, code files, CSV, Excel, etc.

    Args:
        file_path: Path to the file to read

    Returns:
        A textual rendering of the file (truncated for large text/JSON),
        or an error string.
    """
    try:
        # Tabular formats are rendered through pandas so the agent sees
        # column names plus the full table.
        if file_path.endswith(('.xlsx', '.xls')):
            frame = pd.read_excel(file_path)
            return (
                f"Excel file with {len(frame)} rows, {len(frame.columns)} columns."
                f"\n\nColumns: {list(frame.columns)}\n\nData:\n{frame.to_string()}"
            )

        if file_path.endswith('.csv'):
            frame = pd.read_csv(file_path)
            return (
                f"CSV file with {len(frame)} rows, {len(frame.columns)} columns."
                f"\n\nColumns: {list(frame.columns)}\n\nData:\n{frame.to_string()}"
            )

        if file_path.endswith('.json'):
            import json
            with open(file_path, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)
            # Pretty-print, capped at 10k chars to keep tool output bounded.
            return f"JSON file contents:\n{json.dumps(payload, indent=2)[:10000]}"

        # Anything else is treated as text; undecodable bytes are ignored.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            return handle.read()[:10000]
    except Exception as e:
        return f"Error reading file: {str(e)}"
42
+
43
+
44
@tool
def download_file(task_id: str, file_name: str = "") -> str:
    """Download a file associated with a GAIA task.
    Returns the local file path where the file was saved.

    Args:
        task_id: The GAIA task ID
        file_name: Expected filename (helps determine file type)

    Returns:
        "File downloaded to: <path>" on success, or an error string.
    """
    try:
        api_base = "https://agents-course-unit4-scoring.hf.space"
        url = f"{api_base}/files/{task_id}"
        response = requests.get(url, timeout=60)

        if response.status_code == 200:
            content_disp = response.headers.get('content-disposition', '')
            if 'filename=' in content_disp:
                # The header may carry more parameters after ';' (e.g.
                # `filename="a.csv"; size=123`), so keep only the filename
                # token, drop quotes, and take basename() so a hostile header
                # cannot write outside the temp directory.
                raw = content_disp.split('filename=')[1].split(';')[0]
                filename = os.path.basename(raw.strip().strip('"\''))
            elif file_name:
                filename = file_name
            else:
                filename = f"{task_id}_file"
            if not filename:
                # Sanitization can leave an empty name; fall back to task id.
                filename = f"{task_id}_file"

            # Use a portable temp directory (works on Windows + Linux)
            file_path = os.path.join(tempfile.gettempdir(), filename)
            with open(file_path, 'wb') as f:
                f.write(response.content)

            return f"File downloaded to: {file_path}"
        else:
            return f"Download failed: HTTP {response.status_code}"
    except Exception as e:
        return f"Download error: {str(e)}"
77
+
tools/image_tools.py ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Image processing tools for the GAIA Agent.
3
+ Includes image analysis (GPT-4o vision), manipulation, annotation, and OCR.
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import tempfile
9
+ import base64
10
+ from typing import Optional
11
+ from langchain_core.tools import tool
12
+ import openai
13
+ from dotenv import load_dotenv
14
+
15
+ load_dotenv()
16
+ client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
17
+
18
+
19
@tool
def image_analyze(file_path: str, question: str) -> str:
    """Analyze an image (local path or URL) with GPT-4o vision.

    Use this to understand image contents, describe what's shown, read text,
    analyze diagrams, identify objects, or answer questions about images.

    Args:
        file_path: Path to the image file OR an http/https URL
        question: What you want to know about the image

    Returns:
        The model's answer, or an error string.
    """
    try:
        # Remote URLs are passed through untouched; local files are inlined
        # as a base64 data URL.
        if file_path.lower().startswith(("http://", "https://")):
            image_part = {"type": "image_url", "image_url": {"url": file_path}}
        else:
            with open(file_path, "rb") as handle:
                encoded = base64.b64encode(handle.read()).decode("utf-8")
            mime_by_ext = {
                "png": "image/png",
                "jpg": "image/jpeg",
                "jpeg": "image/jpeg",
                "gif": "image/gif",
                "webp": "image/webp",
            }
            # Unknown extensions default to PNG, matching the original tool.
            mime = mime_by_ext.get(file_path.lower().split('.')[-1], "image/png")
            image_part = {
                "type": "image_url",
                "image_url": {"url": f"data:{mime};base64,{encoded}"},
            }

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        image_part,
                    ],
                }
            ],
            max_tokens=800,
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Image analysis error: {str(e)}"
+
69
+
70
@tool
def image_manipulate(
    file_path: str,
    operation: str,
    params: str = "{}"
) -> str:
    """Manipulate an image file using PIL/Pillow.

    Operations available:
    - crop: Crop image. Params: {"box": [left, top, right, bottom]}
    - rotate: Rotate image. Params: {"angle": 90} (degrees, counterclockwise)
    - resize: Resize image. Params: {"width": 800, "height": 600} or {"scale": 0.5}
    - flip: Flip image. Params: {"direction": "horizontal"} or {"direction": "vertical"}
    - grayscale: Convert to grayscale. No params needed.
    - brightness: Adjust brightness. Params: {"factor": 1.5} (1.0 = original)
    - contrast: Adjust contrast. Params: {"factor": 1.5} (1.0 = original)
    - sharpen: Sharpen image. Params: {"factor": 2.0} (1.0 = original)
    - blur: Apply Gaussian blur. Params: {"radius": 2}
    - thumbnail: Create thumbnail. Params: {"size": [128, 128]}

    Args:
        file_path: Path to the image file
        operation: One of the operations listed above
        params: JSON string with operation parameters

    Returns:
        A status message including the output path, or an error string.
    """
    try:
        from PIL import Image, ImageEnhance, ImageFilter

        # Parse parameters
        try:
            p = json.loads(params) if params else {}
        except json.JSONDecodeError:
            return f"Error parsing params: {params}. Use JSON format like {{\"angle\": 90}}"

        # Open the image and capture its metadata up front.
        img = Image.open(file_path)
        original_format = img.format or "PNG"
        # Record the size NOW: re-opening the file later just to report the
        # original size (as the previous version did) leaks a file handle.
        original_size = img.size

        operation = operation.lower().strip()

        if operation == "crop":
            if "box" not in p:
                return "Error: crop requires 'box' param: {\"box\": [left, top, right, bottom]}"
            box = tuple(p["box"])
            img = img.crop(box)

        elif operation == "rotate":
            angle = p.get("angle", 90)
            # expand=True grows the canvas so rotated corners are not clipped.
            expand = p.get("expand", True)
            img = img.rotate(angle, expand=expand)

        elif operation == "resize":
            if "scale" in p:
                new_width = int(img.width * p["scale"])
                new_height = int(img.height * p["scale"])
            elif "width" in p and "height" in p:
                new_width = p["width"]
                new_height = p["height"]
            elif "width" in p:
                # Only one dimension given: preserve the aspect ratio.
                new_width = p["width"]
                new_height = int(img.height * (p["width"] / img.width))
            elif "height" in p:
                new_height = p["height"]
                new_width = int(img.width * (p["height"] / img.height))
            else:
                return "Error: resize requires 'width'/'height' or 'scale' param"
            img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)

        elif operation == "flip":
            direction = p.get("direction", "horizontal")
            if direction == "horizontal":
                img = img.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
            elif direction == "vertical":
                img = img.transpose(Image.Transpose.FLIP_TOP_BOTTOM)
            else:
                return "Error: flip direction must be 'horizontal' or 'vertical'"

        elif operation == "grayscale":
            img = img.convert("L")

        elif operation == "brightness":
            factor = p.get("factor", 1.0)
            enhancer = ImageEnhance.Brightness(img)
            img = enhancer.enhance(factor)

        elif operation == "contrast":
            factor = p.get("factor", 1.0)
            enhancer = ImageEnhance.Contrast(img)
            img = enhancer.enhance(factor)

        elif operation == "sharpen":
            factor = p.get("factor", 2.0)
            enhancer = ImageEnhance.Sharpness(img)
            img = enhancer.enhance(factor)

        elif operation == "blur":
            radius = p.get("radius", 2)
            img = img.filter(ImageFilter.GaussianBlur(radius=radius))

        elif operation == "thumbnail":
            size = tuple(p.get("size", [128, 128]))
            # thumbnail() resizes in place, preserving aspect ratio.
            img.thumbnail(size, Image.Resampling.LANCZOS)

        else:
            return f"Unknown operation: {operation}. Available: crop, rotate, resize, flip, grayscale, brightness, contrast, sharpen, blur, thumbnail"

        # Save to temp file
        ext = file_path.lower().split('.')[-1]
        if ext not in ['jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp']:
            ext = 'png'

        output_path = os.path.join(tempfile.gettempdir(), f"manipulated_{os.path.basename(file_path)}")

        # JPEG cannot store alpha/palette modes; convert before saving.
        if ext in ['jpg', 'jpeg'] and img.mode in ['RGBA', 'LA', 'P']:
            img = img.convert('RGB')

        img.save(output_path, format=original_format if original_format else None)

        return f"Image manipulated successfully.\nOperation: {operation}\nOriginal size: {original_size}\nNew size: {img.size}\nSaved to: {output_path}"

    except ImportError:
        return "Error: Pillow is not installed. Please install it with: pip install Pillow"
    except Exception as e:
        return f"Image manipulation error: {str(e)}"
+ return f"Image manipulation error: {str(e)}"
195
+
196
+
197
@tool
def image_annotate(
    file_path: str,
    annotations: str
) -> str:
    """Add annotations (text, rectangles, circles, lines) to an image.

    Annotations format (JSON array):
    [
        {"type": "text", "text": "Label", "position": [x, y], "color": "red", "size": 20},
        {"type": "rectangle", "box": [x1, y1, x2, y2], "color": "blue", "width": 2},
        {"type": "circle", "center": [x, y], "radius": 50, "color": "green", "width": 2},
        {"type": "line", "start": [x1, y1], "end": [x2, y2], "color": "yellow", "width": 2},
        {"type": "arrow", "start": [x1, y1], "end": [x2, y2], "color": "red", "width": 2}
    ]

    Colors can be: "red", "green", "blue", "yellow", "white", "black", "orange", "purple", or RGB tuple like [255, 0, 0]

    Args:
        file_path: Path to the image file
        annotations: JSON string with list of annotations

    Returns:
        A status message including the output path, or an error string.
    """
    try:
        from PIL import Image, ImageDraw, ImageFont
        import math

        # Parse annotations
        try:
            annots = json.loads(annotations)
        except json.JSONDecodeError:
            return f"Error parsing annotations: {annotations}. Use JSON array format."

        # A single annotation object is accepted and wrapped in a list.
        if not isinstance(annots, list):
            annots = [annots]

        # Open the image; draw in RGBA so colors composite consistently.
        img = Image.open(file_path)
        if img.mode != 'RGBA':
            img = img.convert('RGBA')

        draw = ImageDraw.Draw(img)

        # Color mapping
        color_map = {
            "red": (255, 0, 0),
            "green": (0, 255, 0),
            "blue": (0, 0, 255),
            "yellow": (255, 255, 0),
            "white": (255, 255, 255),
            "black": (0, 0, 0),
            "orange": (255, 165, 0),
            "purple": (128, 0, 128),
            "cyan": (0, 255, 255),
            "magenta": (255, 0, 255),
        }

        def get_color(c):
            # Unknown names and malformed values fall back to red.
            if isinstance(c, str):
                return color_map.get(c.lower(), (255, 0, 0))
            elif isinstance(c, list):
                return tuple(c)
            return (255, 0, 0)

        # Try to load a font, fall back to default
        def get_font(size):
            try:
                # Try common font paths
                font_paths = [
                    "arial.ttf",
                    "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
                    "/System/Library/Fonts/Helvetica.ttc",
                    "C:/Windows/Fonts/arial.ttf",
                ]
                for fp in font_paths:
                    try:
                        return ImageFont.truetype(fp, size)
                    # Narrowed from a bare `except:` so KeyboardInterrupt /
                    # SystemExit still propagate instead of being swallowed.
                    except Exception:
                        continue
                return ImageFont.load_default()
            except Exception:
                return ImageFont.load_default()

        # Process each annotation
        for annot in annots:
            atype = annot.get("type", "").lower()
            color = get_color(annot.get("color", "red"))
            width = annot.get("width", 2)

            if atype == "text":
                text = annot.get("text", "")
                position = tuple(annot.get("position", [10, 10]))
                size = annot.get("size", 20)
                font = get_font(size)
                draw.text(position, text, fill=color, font=font)

            elif atype == "rectangle":
                box = annot.get("box", [0, 0, 100, 100])
                fill = annot.get("fill")
                fill_color = get_color(fill) if fill else None
                draw.rectangle(box, outline=color, width=width, fill=fill_color)

            elif atype == "circle":
                center = annot.get("center", [50, 50])
                radius = annot.get("radius", 25)
                # PIL draws ellipses from a bounding box, not center+radius.
                box = [center[0] - radius, center[1] - radius, center[0] + radius, center[1] + radius]
                fill = annot.get("fill")
                fill_color = get_color(fill) if fill else None
                draw.ellipse(box, outline=color, width=width, fill=fill_color)

            elif atype == "line":
                start = tuple(annot.get("start", [0, 0]))
                end = tuple(annot.get("end", [100, 100]))
                draw.line([start, end], fill=color, width=width)

            elif atype == "arrow":
                start = annot.get("start", [0, 0])
                end = annot.get("end", [100, 100])
                draw.line([tuple(start), tuple(end)], fill=color, width=width)

                # Draw arrowhead: two short strokes folded back from the tip.
                angle = math.atan2(end[1] - start[1], end[0] - start[0])
                arrow_length = 15
                arrow_angle = math.pi / 6  # 30 degrees

                p1 = (
                    end[0] - arrow_length * math.cos(angle - arrow_angle),
                    end[1] - arrow_length * math.sin(angle - arrow_angle)
                )
                p2 = (
                    end[0] - arrow_length * math.cos(angle + arrow_angle),
                    end[1] - arrow_length * math.sin(angle + arrow_angle)
                )
                draw.polygon([tuple(end), p1, p2], fill=color)

        # Save to temp file
        output_path = os.path.join(tempfile.gettempdir(), f"annotated_{os.path.basename(file_path)}")

        # Convert back to RGB if saving as JPEG (JPEG has no alpha channel)
        ext = file_path.lower().split('.')[-1]
        if ext in ['jpg', 'jpeg']:
            img = img.convert('RGB')

        img.save(output_path)

        return f"Image annotated successfully.\nAnnotations added: {len(annots)}\nSaved to: {output_path}"

    except ImportError:
        return "Error: Pillow is not installed. Please install it with: pip install Pillow"
    except Exception as e:
        return f"Image annotation error: {str(e)}"
+ return f"Image annotation error: {str(e)}"
347
+
348
+
349
@tool
def image_ocr(file_path: str, lang: str = "eng") -> str:
    """Extract text from an image using OCR (Optical Character Recognition).

    Uses Tesseract OCR engine. Requires tesseract to be installed on the system.

    Args:
        file_path: Path to the image file
        lang: Language code for OCR (default: "eng" for English).
              Common codes: eng, fra, deu, spa, ita, por, chi_sim, chi_tra, jpn, kor

    Returns:
        The recognized text (with word count and confidence when available),
        or an error string.
    """
    try:
        import pytesseract
        from PIL import Image

        # Open and preprocess image
        img = Image.open(file_path)

        # Tesseract works best on RGB or grayscale input.
        if img.mode not in ['RGB', 'L']:
            img = img.convert('RGB')

        # Extract text
        text = pytesseract.image_to_string(img, lang=lang)

        # Structured data (per-word confidence) is best-effort: if it fails,
        # fall back to returning the plain text result.
        try:
            data = pytesseract.image_to_data(img, lang=lang, output_type=pytesseract.Output.DICT)

            # conf values <= 0 mark non-word boxes; skip them for the average.
            confidences = [int(c) for c in data['conf'] if int(c) > 0]
            avg_confidence = sum(confidences) / len(confidences) if confidences else 0
            word_count = len([w for w in data['text'] if w.strip()])

            return f"OCR Result:\n{'-'*40}\n{text.strip()}\n{'-'*40}\nWords detected: {word_count}\nAverage confidence: {avg_confidence:.1f}%"
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate instead of being silently swallowed.
        except Exception:
            return f"OCR Result:\n{'-'*40}\n{text.strip()}\n{'-'*40}"

    except ImportError as e:
        if "pytesseract" in str(e):
            return "Error: pytesseract is not installed. Please install it with: pip install pytesseract\nAlso ensure Tesseract OCR is installed on your system."
        return f"Import error: {str(e)}"
    except Exception as e:
        error_msg = str(e)
        if "tesseract" in error_msg.lower():
            return f"Tesseract OCR error: {error_msg}\n\nMake sure Tesseract is installed:\n- Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n- Mac: brew install tesseract\n- Linux: sudo apt install tesseract-ocr"
        return f"OCR error: {error_msg}"
396
+
tools/math_tools.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Mathematical computation tools for the GAIA Agent.
3
+ Includes symbolic math, matrix operations, calculator, and statistics.
4
+ """
5
+
6
+ import json
7
+ from langchain_core.tools import tool
8
+
9
+
10
@tool
def symbolic_math(expression: str, operation: str = "simplify", variable: str = "x") -> str:
    """Perform symbolic mathematics operations using SymPy.

    Operations available:
    - simplify: Simplify an expression
    - expand: Expand an expression
    - factor: Factor an expression
    - solve: Solve an equation (set equal to 0)
    - differentiate: Compute derivative with respect to variable
    - integrate: Compute indefinite integral with respect to variable
    - limit: Compute limit as variable approaches 0 (use expression like "sin(x)/x")
    - series: Compute Taylor series expansion

    Args:
        expression: Mathematical expression (use ** for power, e.g., "x**2 + 2*x + 1")
        operation: One of: simplify, expand, factor, solve, differentiate, integrate, limit, series
        variable: Variable to use for calculus operations (default: "x")
    """
    try:
        import sympy as sp
        from sympy.parsing.sympy_parser import parse_expr, standard_transformations, implicit_multiplication_application

        # Common single-letter symbols usable inside expressions.
        names = 'x y z t n a b c'
        symbol_map = dict(zip(names.split(), sp.symbols(names)))

        # Main variable for the calculus operations.
        var = symbol_map.get(variable, sp.Symbol(variable))

        # Parse with implicit multiplication so "2x" is read as 2*x.
        transformations = standard_transformations + (implicit_multiplication_application,)
        expr = parse_expr(expression, local_dict=symbol_map, transformations=transformations)

        op = operation.lower().strip()

        # Dispatch table (lazy lambdas, so only the chosen op runs).
        handlers = {
            "simplify": lambda: sp.simplify(expr),
            "expand": lambda: sp.expand(expr),
            "factor": lambda: sp.factor(expr),
            "solve": lambda: sp.solve(expr, var),
            "differentiate": lambda: sp.diff(expr, var),
            "diff": lambda: sp.diff(expr, var),
            "derivative": lambda: sp.diff(expr, var),
            "integrate": lambda: sp.integrate(expr, var),
            "integral": lambda: sp.integrate(expr, var),
            "limit": lambda: sp.limit(expr, var, 0),
            "series": lambda: sp.series(expr, var, 0, 6),
        }

        if op not in handlers:
            return f"Unknown operation: {op}. Available: simplify, expand, factor, solve, differentiate, integrate, limit, series"

        result = handlers[op]()
        return f"Input: {expression}\nOperation: {op}\nResult: {result}"

    except ImportError:
        return "Error: SymPy is not installed. Please install it with: pip install sympy"
    except Exception as e:
        return f"Symbolic math error: {str(e)}"
71
+
72
+
73
@tool
def matrix_operations(matrix_a: str, operation: str = "determinant", matrix_b: str = "") -> str:
    """Perform matrix operations using NumPy.

    Operations available:
    - determinant: Compute determinant of matrix_a
    - inverse: Compute inverse of matrix_a
    - transpose: Compute transpose of matrix_a
    - eigenvalues: Compute eigenvalues of matrix_a
    - eigenvectors: Compute eigenvectors of matrix_a
    - rank: Compute rank of matrix_a
    - trace: Compute trace of matrix_a
    - multiply: Matrix multiplication of matrix_a @ matrix_b
    - add: Element-wise addition of matrix_a + matrix_b
    - solve: Solve linear system Ax = b (matrix_a is A, matrix_b is b vector)

    Args:
        matrix_a: Matrix as JSON array, e.g., "[[1,2],[3,4]]"
        operation: One of: determinant, inverse, transpose, eigenvalues, eigenvectors, rank, trace, multiply, add, solve
        matrix_b: Second matrix for binary operations (as JSON array)
    """
    try:
        import numpy as np

        # First operand: must be valid JSON describing a numeric array.
        try:
            mat = np.array(json.loads(matrix_a), dtype=float)
        except json.JSONDecodeError:
            return f"Error parsing matrix_a: {matrix_a}. Use JSON format like [[1,2],[3,4]]"

        operation = operation.lower().strip()

        # Second operand is only needed (and only parsed) for binary ops.
        other = None
        if matrix_b and operation in ("multiply", "add", "solve"):
            try:
                other = np.array(json.loads(matrix_b), dtype=float)
            except json.JSONDecodeError:
                return f"Error parsing matrix_b: {matrix_b}. Use JSON format like [[1,2],[3,4]]"

        if operation == "determinant":
            if mat.shape[0] != mat.shape[1]:
                return "Error: Determinant requires a square matrix"
            outcome = np.linalg.det(mat)
        elif operation == "inverse":
            if mat.shape[0] != mat.shape[1]:
                return "Error: Inverse requires a square matrix"
            outcome = np.linalg.inv(mat)
        elif operation == "transpose":
            outcome = mat.T
        elif operation == "eigenvalues":
            if mat.shape[0] != mat.shape[1]:
                return "Error: Eigenvalues require a square matrix"
            outcome = np.linalg.eigvals(mat)
        elif operation == "eigenvectors":
            if mat.shape[0] != mat.shape[1]:
                return "Error: Eigenvectors require a square matrix"
            vals, vecs = np.linalg.eig(mat)
            outcome = {"eigenvalues": vals.tolist(), "eigenvectors": vecs.tolist()}
        elif operation == "rank":
            outcome = np.linalg.matrix_rank(mat)
        elif operation == "trace":
            outcome = np.trace(mat)
        elif operation == "multiply":
            if other is None:
                return "Error: multiply operation requires matrix_b"
            outcome = mat @ other
        elif operation == "add":
            if other is None:
                return "Error: add operation requires matrix_b"
            outcome = mat + other
        elif operation == "solve":
            if other is None:
                return "Error: solve operation requires matrix_b (the b vector)"
            outcome = np.linalg.solve(mat, other)
        else:
            return f"Unknown operation: {operation}. Available: determinant, inverse, transpose, eigenvalues, eigenvectors, rank, trace, multiply, add, solve"

        # Render the result in a readable form for the agent.
        if isinstance(outcome, np.ndarray):
            rendered = np.array2string(outcome, precision=6, suppress_small=True)
        elif isinstance(outcome, dict):
            rendered = json.dumps(outcome, indent=2)
        else:
            rendered = str(outcome)

        return f"Matrix A:\n{np.array2string(mat)}\nOperation: {operation}\nResult:\n{rendered}"

    except ImportError:
        return "Error: NumPy is not installed. Please install it with: pip install numpy"
    except np.linalg.LinAlgError as e:
        return f"Linear algebra error: {str(e)}"
    except Exception as e:
        return f"Matrix operation error: {str(e)}"
167
+
168
+
169
@tool
def calculator(expression: str) -> str:
    """Evaluate a mathematical expression with high precision.

    Supports standard math operations: +, -, *, /, ** (power), % (modulo)
    Also supports functions: sqrt, sin, cos, tan, log, log10, exp, abs, ceil, floor, round
    Constants: pi, e

    Args:
        expression: Mathematical expression to evaluate, e.g., "sqrt(2) * pi" or "2**10 + 5"

    Returns:
        A string "Expression: ...\nResult: ..." on success, or an "Error: ..." /
        "Math error: ..." / "Calculator error: ..." message on failure.
    """
    try:
        import math

        # Safe evaluation context: only math functions and constants are
        # visible to the expression. (The previous Decimal/getcontext setup
        # was dead code — eval() below works on floats, not Decimals.)
        safe_dict = {
            'sqrt': math.sqrt,
            'sin': math.sin,
            'cos': math.cos,
            'tan': math.tan,
            'asin': math.asin,
            'acos': math.acos,
            'atan': math.atan,
            'atan2': math.atan2,
            'log': math.log,
            'log10': math.log10,
            'log2': math.log2,
            'exp': math.exp,
            'pow': pow,
            'abs': abs,
            'ceil': math.ceil,
            'floor': math.floor,
            'round': round,
            'pi': math.pi,
            'e': math.e,
            'inf': math.inf,
            'factorial': math.factorial,
            'gcd': math.gcd,
            'lcm': math.lcm,
            'degrees': math.degrees,
            'radians': math.radians,
        }

        # Basic security check - only allow safe characters
        allowed_chars = set('0123456789+-*/.()%, abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_')
        if not all(c in allowed_chars for c in expression):
            return "Error: Expression contains invalid characters. Only math operations and functions are allowed."

        # Block dunder access (e.g. "pi.__class__"), which the character
        # whitelist alone permits. NOTE: eval() with stripped builtins is
        # still not a hardened sandbox — only feed it agent-generated input.
        if "__" in expression:
            return "Error: Expression contains invalid characters. Only math operations and functions are allowed."

        # Evaluate the expression with builtins removed
        result = eval(expression, {"__builtins__": {}}, safe_dict)

        # Format the result: render whole-number floats without a decimal
        # point; guard inf/nan first, since int() raises on them and would
        # otherwise surface as a confusing "Calculator error".
        if isinstance(result, float):
            if math.isfinite(result) and result == int(result):
                result_str = str(int(result))
            else:
                result_str = f"{result:.15g}"  # High precision but remove trailing zeros
        else:
            result_str = str(result)

        return f"Expression: {expression}\nResult: {result_str}"

    except ZeroDivisionError:
        return "Error: Division by zero"
    except ValueError as e:
        return f"Math error: {str(e)}"
    except Exception as e:
        return f"Calculator error: {str(e)}"
242
+
243
+
244
@tool
def statistical_analysis(data: str, operation: str = "describe") -> str:
    """Perform statistical analysis on numerical data.

    Operations available:
    - describe: Full statistical summary (mean, median, std, min, max, quartiles)
    - mean: Arithmetic mean
    - median: Median value
    - mode: Most frequent value
    - std: Standard deviation
    - var: Variance
    - correlation: Correlation coefficient (requires 2D data like [[x1,y1],[x2,y2],...])
    - regression: Linear regression (requires 2D data)
    - percentile: Compute 25th, 50th, 75th percentiles
    - zscore: Compute z-scores for each value

    Args:
        data: Numerical data as JSON array, e.g., "[1, 2, 3, 4, 5]" or "[[1,2],[3,4]]" for 2D
        operation: One of: describe, mean, median, mode, std, var, correlation, regression, percentile, zscore
    """
    try:
        # Imported lazily so a missing numpy/scipy degrades to a readable
        # error message instead of breaking module import.
        import numpy as np
        from scipy import stats as sp_stats

        # Parse the JSON payload into a float ndarray (1D or 2D).
        try:
            arr = np.array(json.loads(data), dtype=float)
        except json.JSONDecodeError:
            return f"Error parsing data: {data}. Use JSON format like [1, 2, 3, 4, 5]"

        # Normalize the operation name so "Mean " etc. still dispatch.
        operation = operation.lower().strip()

        if operation == "describe":
            if arr.ndim == 1:
                # Full summary for a flat series; everything cast to float
                # so the dict is JSON-serializable below.
                result = {
                    "count": len(arr),
                    "mean": float(np.mean(arr)),
                    "median": float(np.median(arr)),
                    "std": float(np.std(arr)),
                    "variance": float(np.var(arr)),
                    "min": float(np.min(arr)),
                    "max": float(np.max(arr)),
                    "25th_percentile": float(np.percentile(arr, 25)),
                    "50th_percentile": float(np.percentile(arr, 50)),
                    "75th_percentile": float(np.percentile(arr, 75)),
                    "sum": float(np.sum(arr)),
                }
            else:
                # 2D input: summarize column-wise instead.
                result = {
                    "shape": arr.shape,
                    "mean_per_column": np.mean(arr, axis=0).tolist(),
                    "std_per_column": np.std(arr, axis=0).tolist(),
                }
        elif operation == "mean":
            result = float(np.mean(arr))
        elif operation == "median":
            result = float(np.median(arr))
        elif operation == "mode":
            # keepdims=False gives scalar mode/count (scipy >= 1.9 API).
            mode_result = sp_stats.mode(arr.flatten(), keepdims=False)
            result = {"mode": float(mode_result.mode), "count": int(mode_result.count)}
        elif operation == "std":
            # NOTE: np.std defaults to the population std (ddof=0).
            result = float(np.std(arr))
        elif operation == "var":
            result = float(np.var(arr))
        elif operation == "correlation":
            if arr.ndim != 2 or arr.shape[1] != 2:
                return "Error: correlation requires 2D data with 2 columns, e.g., [[x1,y1],[x2,y2],...]"
            # Pearson correlation between column 0 (x) and column 1 (y).
            result = float(np.corrcoef(arr[:, 0], arr[:, 1])[0, 1])
        elif operation == "regression":
            if arr.ndim != 2 or arr.shape[1] != 2:
                return "Error: regression requires 2D data with 2 columns, e.g., [[x1,y1],[x2,y2],...]"
            # Ordinary least-squares fit of y on x.
            slope, intercept, r_value, p_value, std_err = sp_stats.linregress(arr[:, 0], arr[:, 1])
            result = {
                "slope": float(slope),
                "intercept": float(intercept),
                "r_squared": float(r_value**2),
                "p_value": float(p_value),
                "std_error": float(std_err),
                "equation": f"y = {slope:.6f}x + {intercept:.6f}"
            }
        elif operation == "percentile":
            result = {
                "25th": float(np.percentile(arr, 25)),
                "50th": float(np.percentile(arr, 50)),
                "75th": float(np.percentile(arr, 75)),
                "90th": float(np.percentile(arr, 90)),
                "95th": float(np.percentile(arr, 95)),
                "99th": float(np.percentile(arr, 99)),
            }
        elif operation == "zscore":
            # NOTE(review): constant input yields NaN z-scores (std == 0).
            zscores = sp_stats.zscore(arr.flatten())
            result = zscores.tolist()
        else:
            return f"Unknown operation: {operation}. Available: describe, mean, median, mode, std, var, correlation, regression, percentile, zscore"

        # Format result for the agent: dicts/lists as JSON, scalars as str.
        if isinstance(result, dict):
            result_str = json.dumps(result, indent=2)
        elif isinstance(result, list):
            result_str = json.dumps(result)
        else:
            result_str = str(result)

        return f"Data: {len(arr.flatten())} values\nOperation: {operation}\nResult:\n{result_str}"

    except ImportError as e:
        # Tell the operator which optional dependency is missing.
        missing = "scipy" if "scipy" in str(e) else "numpy"
        return f"Error: {missing} is not installed. Please install it with: pip install {missing}"
    except Exception as e:
        return f"Statistical analysis error: {str(e)}"
354
+
tools/media_tools.py ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Media processing tools for the GAIA Agent.
3
+ Includes YouTube transcript, audio transcription, and video analysis.
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import math
9
+ import shutil
10
+ import tempfile
11
+ import subprocess
12
+ from typing import Optional, List, Dict
13
+ from urllib.parse import urlparse, parse_qs
14
+
15
+ import requests
16
+ import openai
17
+ import base64
18
+ from langchain_core.tools import tool
19
+ from dotenv import load_dotenv
20
+
21
+ load_dotenv()
22
+ client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
23
+
24
+
25
+ def _extract_youtube_id(url: str) -> Optional[str]:
26
+ """Extract YouTube video ID from various URL formats."""
27
+ try:
28
+ u = urlparse(url)
29
+ host = (u.netloc or "").lower()
30
+ path = u.path or ""
31
+
32
+ # watch?v=VIDEO_ID
33
+ qs = parse_qs(u.query)
34
+ if "v" in qs and qs["v"]:
35
+ vid = qs["v"][0]
36
+ if len(vid) == 11:
37
+ return vid
38
+
39
+ # youtu.be/VIDEO_ID
40
+ if "youtu.be" in host:
41
+ seg = path.strip("/").split("/")
42
+ if seg and len(seg[0]) == 11:
43
+ return seg[0]
44
+
45
+ # /embed/VIDEO_ID, /shorts/VIDEO_ID, /live/VIDEO_ID
46
+ parts = path.strip("/").split("/")
47
+ for i, p in enumerate(parts[:-1]):
48
+ if p in {"embed", "shorts", "live"}:
49
+ vid = parts[i + 1]
50
+ if len(vid) == 11:
51
+ return vid
52
+
53
+ return None
54
+ except Exception:
55
+ return None
56
+
57
+
58
def _transcribe_audio_file(path: str) -> str:
    """Send a local audio file to OpenAI Whisper-1 and return the plain-text transcript."""
    with open(path, "rb") as audio_handle:
        # response_format="text" makes the API return a bare string.
        return client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_handle,
            response_format="text",
        )
67
+
68
+
69
+ def _encode_image_to_data_url(path: str) -> str:
70
+ """Helper to turn a local image into a data: URL for GPT-4o vision."""
71
+ with open(path, "rb") as img_file:
72
+ image_data = base64.b64encode(img_file.read()).decode("utf-8")
73
+ ext = path.lower().split('.')[-1]
74
+ media_type = {
75
+ "png": "image/png",
76
+ "jpg": "image/jpeg",
77
+ "jpeg": "image/jpeg",
78
+ "gif": "image/gif",
79
+ "webp": "image/webp",
80
+ }.get(ext, "image/png")
81
+ return f"data:{media_type};base64,{image_data}"
82
+
83
+
84
@tool
def youtube_transcript(video_url: str, languages: str = "en") -> str:
    """Get the transcript/captions from a YouTube video.

    First tries to get existing captions. If captions are disabled,
    falls back to downloading audio and transcribing with Whisper.

    Args:
        video_url: The YouTube video URL
        languages: Comma-separated language codes to prefer (default: "en")

    Returns:
        The transcript text (truncated to 8000 chars), or an explanatory
        error message if both captions and Whisper fallback fail.
    """
    video_id = _extract_youtube_id(video_url)
    if not video_id:
        return f"Could not extract video ID from: {video_url}"

    lang_list = [l.strip() for l in languages.split(",") if l.strip()]

    # First, try to get existing captions
    caption_error = None
    try:
        from youtube_transcript_api import YouTubeTranscriptApi

        # BUG FIX: the TranscriptList (which has the find_* helpers) comes
        # from list(), not fetch() — fetch() returns already-fetched
        # snippets and has no find_* methods, so the old code always fell
        # through to Whisper.
        api = YouTubeTranscriptApi()
        if hasattr(api, "list"):
            # youtube-transcript-api >= 1.0 instance API
            tlist = api.list(video_id)
        else:
            # pre-1.0 class-method API
            tlist = YouTubeTranscriptApi.list_transcripts(video_id)

        transcript = None

        # Preference order: manual captions, then auto-generated,
        # then whatever exists for the requested languages.
        for finder_name in (
            "find_manually_created_transcript",
            "find_generated_transcript",
            "find_transcript",
        ):
            if transcript is not None:
                break
            finder = getattr(tlist, finder_name)
            for lang in lang_list:
                try:
                    transcript = finder([lang])
                    break
                except Exception:
                    pass

        if transcript is not None:
            items = transcript.fetch()
            # Items are dicts in older releases and FetchedTranscriptSnippet
            # objects (with a .text attribute) in >= 1.0 — accept both.
            pieces = []
            for item in items:
                if isinstance(item, dict):
                    pieces.append(item.get("text", ""))
                else:
                    pieces.append(getattr(item, "text", ""))
            text = " ".join(pieces).strip()
            if text:
                return text[:8000]

    except Exception as e:
        # Captions might be disabled - we'll try fallback
        caption_error = f"{type(e).__name__}: {e}"

    # Fallback: Download audio and transcribe with Whisper
    try:
        return youtube_audio_transcribe.invoke({"video_url": video_url})
    except Exception as whisper_error:
        return (
            f"Transcript error: Captions unavailable and audio transcription failed.\n"
            f"Caption error: {caption_error or 'Unknown'}\n"
            f"Whisper error: {whisper_error}\n\n"
            f"Suggestion: Try using web_search to find information about this video instead."
        )
157
+
158
+
159
@tool
def youtube_audio_transcribe(video_url: str) -> str:
    """Download YouTube audio and transcribe with Whisper-1.

    Use when captions are unavailable or you want an audio-based transcript.

    Args:
        video_url: The YouTube video URL

    Raises:
        RuntimeError: If yt-dlp fails or produces no audio file.
    """
    video_id = _extract_youtube_id(video_url)
    if not video_id:
        return f"Could not extract video ID from: {video_id}" if False else f"Could not extract video ID from: {video_url}"

    # Download into a throwaway directory so the audio file is cleaned up
    # automatically, even if transcription raises.
    with tempfile.TemporaryDirectory() as tmpdir:
        # NOTE(review): the literal ".webm" name assumes yt-dlp writes to
        # this exact path regardless of the actual audio container chosen
        # by "bestaudio/best" — confirm against yt-dlp's -o handling.
        audio_path = f"{tmpdir}/{video_id}.webm"

        # Download audio using yt-dlp. The 25M cap matches Whisper's
        # upload limit; yt-dlp may skip (not fail) oversized files, which
        # the os.path.exists check below turns into an error.
        result = subprocess.run(
            [
                "yt-dlp",
                "-f", "bestaudio/best",
                "-o", audio_path,
                "--no-playlist",
                "--max-filesize", "25M",
                video_url,
            ],
            capture_output=True,
            text=True,
            timeout=120
        )

        if result.returncode != 0 or not os.path.exists(audio_path):
            # Raise (not return) so the caller (youtube_transcript) can
            # fold this into its combined error report.
            raise RuntimeError(f"yt-dlp failed.\nSTDERR:\n{result.stderr}\nSTDOUT:\n{result.stdout}")

        return _transcribe_audio_file(audio_path)
195
+
196
+
197
@tool
def audio_transcribe(file_path: str) -> str:
    """Transcribe an audio file to text using speech recognition.

    Args:
        file_path: Path to the audio file (.mp3, .wav, .m4a, etc.) or an http/https URL
    """
    try:
        if not file_path.lower().startswith(("http://", "https://")):
            # Local file: hand it to Whisper directly.
            return _transcribe_audio_file(file_path)

        # Remote file: download to a temp file first, then always clean up.
        with tempfile.NamedTemporaryFile(suffix=".audio", delete=False) as tmp:
            resp = requests.get(file_path, timeout=120)
            resp.raise_for_status()
            tmp.write(resp.content)
            local_path = tmp.name
        try:
            return _transcribe_audio_file(local_path)
        finally:
            os.unlink(local_path)
    except Exception as e:
        return f"Transcription error: {str(e)}"
220
+
221
+
222
@tool
def video_metadata(video_url: str) -> str:
    """Fetch coarse metadata for a video (duration, resolution, fps, title) using yt-dlp.

    Args:
        video_url: The video URL (YouTube or direct link)
    """
    command = [
        "yt-dlp",
        "--dump-single-json",
        "--no-playlist",
        "--no-warnings",
        video_url,
    ]
    try:
        proc = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=90,
        )
        if proc.returncode != 0:
            return f"Metadata error: yt-dlp failed.\nStdout: {proc.stdout[:4000]}\nStderr: {proc.stderr[:4000]}"

        info = json.loads(proc.stdout)
        # Keep only the fields the agent actually needs.
        summary = {
            "title": info.get("title"),
            "uploader": info.get("uploader"),
            "duration_seconds": info.get("duration"),
            "width": info.get("width"),
            "height": info.get("height"),
            "fps": info.get("fps"),
            "url": video_url,
        }
        return json.dumps(summary, indent=2)
    except Exception as e:
        return f"Metadata error: {str(e)}"
258
+
259
+
260
@tool
def video_frame_analyze(
    video_url: str,
    vision_task_prompt: str,
    scene_threshold: Optional[float] = None,
    scene_threshold_low: float = 0.2,
    scene_threshold_high: float = 0.4,
    max_frames: int = 120,
    batch_size: int = 6,
) -> str:
    """Download a video, extract scene-change frames, and run GPT-4o vision batches.

    Args:
        video_url: URL to the video (YouTube or direct)
        vision_task_prompt: Task for the vision model (e.g., count bird species per frame)
        scene_threshold: Optional direct ffmpeg scene threshold (0-1). If None, use mid of low/high.
        scene_threshold_low: Lower bound for threshold (default 0.2)
        scene_threshold_high: Upper bound for threshold (default 0.4)
        max_frames: Cap on frames to send to vision (downsamples if exceeded).
        batch_size: Number of frames per GPT-4o call (keep modest to control context size).

    Returns:
        JSON summary (threshold, frame counts, per-batch model responses),
        or an error string describing the failing stage.
    """
    # Manual mkdtemp + finally-rmtree (instead of a TemporaryDirectory
    # context) so early returns inside the try still get cleaned up.
    tmpdir = tempfile.mkdtemp(prefix="video_analyze_")
    try:
        video_path = os.path.join(tmpdir, "video.mp4")
        frame_dir = os.path.join(tmpdir, "frames")
        os.makedirs(frame_dir, exist_ok=True)

        # Step 1: obtain video (URL via yt-dlp, or local path copy)
        if video_url.lower().startswith(("http://", "https://")):
            # Ensure ffmpeg exists for merging
            if shutil.which("ffmpeg") is None and shutil.which("avconv") is None:
                return "Download error: ffmpeg/avconv not found in PATH; required for muxing."

            # Use an AVC/H.264 + m4a combination to avoid unsupported codecs, cap at 1080p.
            out_template = os.path.join(tmpdir, "video.%(ext)s")
            dl = subprocess.run(
                [
                    "yt-dlp",
                    "-f",
                    "bestvideo[ext=mp4][vcodec^=avc1][height<=1080]+bestaudio[ext=m4a]/best[ext=mp4]/best",
                    "--merge-output-format",
                    "mp4",
                    "--recode-video",
                    "mp4",
                    "--no-keep-video",
                    "--no-playlist",
                    "--no-warnings",
                    "-o",
                    out_template,
                    video_url,
                ],
                capture_output=True,
                text=True,
                timeout=240,
            )
            if dl.returncode != 0:
                return f"Download error: {dl.stderr[:4000] or dl.stdout[:4000]}"

            # Locate the merged/re-encoded mp4 (yt-dlp decides the exact name).
            candidates = [
                os.path.join(tmpdir, f)
                for f in os.listdir(tmpdir)
                if f.lower().endswith(".mp4")
            ]
            if not candidates:
                return (
                    "Download error: no mp4 produced after merge/recode. "
                    f"yt-dlp stdout: {dl.stdout[:2000]} stderr: {dl.stderr[:2000]}"
                )
            # Pick the largest mp4 (most likely the merged one)
            best_mp4 = max(candidates, key=lambda p: os.path.getsize(p))
            if os.path.getsize(best_mp4) < 1024:
                return (
                    "Download error: merged file is empty or too small. "
                    f"yt-dlp stdout: {dl.stdout[:2000]} stderr: {dl.stderr[:2000]}"
                )
            shutil.move(best_mp4, video_path)
        else:
            # Non-URL input is treated as a local filesystem path.
            if not os.path.exists(video_url):
                return f"Video path not found: {video_url}"
            shutil.copy2(video_url, video_path)

        # Step 2: choose scene threshold, clamped into [low, high].
        thr_low = max(0.0, min(1.0, scene_threshold_low))
        thr_high = max(thr_low, min(1.0, scene_threshold_high))
        if scene_threshold is not None:
            thr = max(thr_low, min(thr_high, scene_threshold))
        else:
            thr = (thr_low + thr_high) / 2.0

        # Step 3: extract one frame per detected scene change.
        ffmpeg_cmd = [
            "ffmpeg",
            "-i",
            video_path,
            "-vf",
            f"select='gt(scene,{thr})',showinfo",
            "-vsync",
            "vfr",
            os.path.join(frame_dir, "frame_%05d.jpg"),
        ]
        ff = subprocess.run(
            ffmpeg_cmd,
            capture_output=True,
            text=True,
            timeout=180,
        )
        # Sorted filenames keep frames in chronological order.
        frames = sorted(
            [
                os.path.join(frame_dir, f)
                for f in os.listdir(frame_dir)
                if f.lower().endswith((".jpg", ".jpeg", ".png", ".webp"))
            ]
        )
        if not frames:
            return f"No frames extracted with scene threshold {thr}. ffmpeg stderr: {ff.stderr[:2000]}"

        # Downsample evenly if there are more frames than max_frames.
        total_frames = len(frames)
        if total_frames > max_frames:
            step = math.ceil(total_frames / max_frames)
            frames = frames[::step]

        # Step 4: batch frames and call GPT-4o vision once per batch.
        batches = [frames[i : i + batch_size] for i in range(0, len(frames), batch_size)]
        batch_outputs: List[Dict[str, str]] = []

        for idx, batch in enumerate(batches, start=1):
            # One text part with the task, followed by the batch's images.
            content = [
                {
                    "type": "text",
                    "text": (
                        "You are a vision assistant. "
                        "For each image, run the requested task and return a compact JSON array "
                        "with objects: {frame_id, result}. "
                        "frame_id should match the filename. "
                        "Task:\n"
                        f"{vision_task_prompt}"
                    ),
                }
            ]
            for p in batch:
                data_url = _encode_image_to_data_url(p)
                content.append(
                    {
                        "type": "image_url",
                        "image_url": {"url": data_url},
                    }
                )

            resp = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": content}],
                max_tokens=1200,
            )
            # The model's answer is kept verbatim; parsing is left to the agent.
            batch_outputs.append(
                {
                    "batch_index": idx,
                    "frames": [os.path.basename(p) for p in batch],
                    "response": resp.choices[0].message.content,
                }
            )

        summary = {
            "scene_threshold_used": thr,
            "frames_extracted": total_frames,
            "frames_sent": len(frames),
            "batch_size": batch_size,
            "batches": batch_outputs,
        }
        return json.dumps(summary, indent=2)
    except Exception as e:
        return f"Video frame analyze error: {str(e)}"
    finally:
        # Always remove the working directory, including the downloaded video.
        shutil.rmtree(tmpdir, ignore_errors=True)
434
+
tools/web_tools.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Web-related tools for the GAIA Agent.
3
+ Includes web search, Wikipedia lookup, arXiv search, and webpage fetching.
4
+ """
5
+
6
+ import os
7
+ import re
8
+ import requests
9
+ from langchain_core.tools import tool
10
+ from tavily import TavilyClient
11
+ import wikipedia
12
+ import arxiv
13
+ from dotenv import load_dotenv
14
+
15
+ load_dotenv()
16
+
17
+
18
@tool
def web_search(query: str, include_content: bool = True, max_results: int = 5) -> str:
    """Search the web for current information with full page content.

    Use this for facts, news, people, places, events, or anything you need to look up.
    Returns search results WITH full page content, so you can get detailed information directly.

    Args:
        query: The search query
        include_content: If True, includes full page content (default: True)
        max_results: Number of results to return (default: 5)
    """
    try:
        # Fresh Tavily client per call; key comes from the environment (Space secret).
        client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
        response = client.search(
            query,
            max_results=max_results,
            include_raw_content=include_content,  # Get full page content
            include_answer=True,  # Get a direct answer if available
        )

        output = []

        # Include Tavily's direct answer if available
        if response.get("answer"):
            output.append(f"📌 DIRECT ANSWER: {response['answer']}")
            output.append("=" * 50)

        results = response.get("results", [])
        if not results:
            return "No search results found."

        for i, r in enumerate(results, 1):
            output.append(f"\n[{i}] {r.get('title', 'N/A')}")
            output.append(f"URL: {r.get('url', 'N/A')}")
            output.append(f"Snippet: {r.get('content', 'N/A')}")

            # Include full page content if available
            raw_content = r.get('raw_content')
            if raw_content:
                # Truncate to a reasonable length per result so the whole
                # tool output stays within the model's context budget.
                content_preview = raw_content[:3000]
                if len(raw_content) > 3000:
                    content_preview += "\n...[content truncated]"
                output.append(f"\nFull Content:\n{content_preview}")
            # Visual separator between results.
            output.append("-" * 40)

        return "\n".join(output)
    except Exception as e:
        return f"Search error: {str(e)}"
68
+
69
+
70
@tool
def wikipedia_lookup(topic: str) -> str:
    """Look up a topic on Wikipedia for detailed encyclopedic information.

    Args:
        topic: The topic to look up
    """
    def _render(page):
        # Shared formatting for whichever page we end up on.
        return f"Title: {page.title}\n\nSummary:\n{page.summary[:4000]}"

    try:
        matches = wikipedia.search(topic, results=3)
        if not matches:
            return f"No Wikipedia article found for: {topic}"

        try:
            return _render(wikipedia.page(matches[0], auto_suggest=False))
        except wikipedia.DisambiguationError as e:
            # Ambiguous title: fall back to the first disambiguation option.
            if e.options:
                return _render(wikipedia.page(e.options[0], auto_suggest=False))
            return f"Multiple matches found: {e.options[:5]}"
        except wikipedia.PageError:
            return f"Page not found: {matches[0]}"
    except Exception as e:
        return f"Wikipedia error: {str(e)}"
94
+
95
+
96
@tool
def arxiv_search(query: str, max_results: int = 5) -> str:
    """Search arXiv for academic papers and research articles.

    Use this for scientific papers, research, preprints, and academic publications.
    Returns paper titles, authors, abstracts, and arXiv IDs.

    Args:
        query: Search query (can include author names, titles, or topics)
        max_results: Maximum number of results to return (default: 5)
    """
    try:
        # Run the relevance-sorted search and materialize the results.
        papers = list(
            arxiv.Client().results(
                arxiv.Search(
                    query=query,
                    max_results=max_results,
                    sort_by=arxiv.SortCriterion.Relevance,
                )
            )
        )

        if not papers:
            return f"No arXiv papers found for: {query}"

        lines = []
        for idx, paper in enumerate(papers, 1):
            lines.append(f"[{idx}] {paper.title}")

            # Show at most 5 author names, noting the true count beyond that.
            author_line = f" Authors: {', '.join(a.name for a in paper.authors[:5])}"
            if len(paper.authors) > 5:
                author_line += f" et al. ({len(paper.authors)} total)"
            lines.append(author_line)

            lines.append(f" Published: {paper.published.strftime('%Y-%m-%d')}")
            lines.append(f" arXiv ID: {paper.entry_id.split('/')[-1]}")
            lines.append(f" Categories: {', '.join(paper.categories[:3])}")
            lines.append(f" PDF: {paper.pdf_url}")

            # Abstract clipped to ~500 chars with newlines flattened.
            abstract = paper.summary.replace('\n', ' ')[:500]
            if len(paper.summary) > 500:
                abstract += "..."
            lines.append(f" Abstract: {abstract}")
            lines.append("---")

        return "\n".join(lines)
    except Exception as e:
        return f"arXiv search error: {str(e)}"
141
+
142
+
143
@tool
def webpage_fetch(url: str, extract_links: bool = False) -> str:
    """Fetch and read the content of a webpage URL.

    Use this to read the full content of a page from search results.
    After web_search returns URLs, use this tool to get detailed information.

    Args:
        url: The URL to fetch (http or https)
        extract_links: If True, also extract and list links found on the page
    """
    try:
        # Desktop-browser User-Agent: some sites reject the default
        # python-requests UA.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        html = response.text

        # Try to use BeautifulSoup for better parsing
        try:
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html, 'html.parser')

            # Remove script/style and common boilerplate containers.
            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                element.decompose()

            # Get title
            # NOTE(review): soup.title.string can be None for an empty
            # <title>, which renders as the literal string "None".
            title = soup.title.string if soup.title else "No title"

            # Get main text content
            text = soup.get_text(separator='\n', strip=True)

            # Clean up excessive whitespace
            lines = [line.strip() for line in text.splitlines() if line.strip()]
            text = '\n'.join(lines)

            # Extract links if requested
            links_text = ""
            if extract_links:
                links = []
                for a in soup.find_all('a', href=True)[:20]:  # Limit to 20 links
                    href = a['href']
                    link_text = a.get_text(strip=True)[:50]
                    # Only absolute links are reported; relative hrefs are skipped.
                    if href.startswith('http'):
                        links.append(f" - {link_text}: {href}")
                if links:
                    links_text = "\n\nLinks found:\n" + "\n".join(links)

            # Truncate to reasonable length
            if len(text) > 8000:
                text = text[:8000] + "\n...[truncated]"

            return f"Title: {title}\nURL: {url}\n\nContent:\n{text}{links_text}"

        except ImportError:
            # Fallback: basic HTML tag stripping without BeautifulSoup
            # Remove script and style content
            html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
            html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)

            # Remove HTML tags
            text = re.sub(r'<[^>]+>', ' ', html)

            # Decode HTML entities (&amp; etc.)
            import html as html_module
            text = html_module.unescape(text)

            # Clean up whitespace
            text = re.sub(r'\s+', ' ', text).strip()

            # Truncate
            if len(text) > 8000:
                text = text[:8000] + "...[truncated]"

            return f"URL: {url}\n\nContent:\n{text}\n\n(Note: Install beautifulsoup4 for better parsing: pip install beautifulsoup4)"

    except requests.exceptions.Timeout:
        return f"Error: Request timed out for URL: {url}"
    except requests.exceptions.HTTPError as e:
        return f"HTTP Error {e.response.status_code}: Could not fetch {url}"
    except Exception as e:
        return f"Error fetching webpage: {str(e)}"
229
+