Spaces:
Runtime error
Runtime error
Add .gitignore, implement BasicAgent in agent.py, and enhance app.py for checkpointing
Browse files- .gitignore +15 -0
- agent.py +1009 -0
- app.py +156 -43
- requirements.txt +9 -1
.gitignore
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.env
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
|
| 5 |
+
# Documentation
|
| 6 |
+
DOCUMENTATION/
|
| 7 |
+
|
| 8 |
+
#test sets
|
| 9 |
+
TEST_SET/
|
| 10 |
+
|
| 11 |
+
#test results
|
| 12 |
+
test_results/
|
| 13 |
+
|
| 14 |
+
#cursor
|
| 15 |
+
.cursor/
|
agent.py
ADDED
|
@@ -0,0 +1,1009 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, TypedDict, Dict, Any, Literal
|
| 2 |
+
from langgraph.graph import StateGraph, START, END
|
| 3 |
+
from langgraph.types import Command
|
| 4 |
+
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
|
| 5 |
+
from langchain_anthropic import ChatAnthropic
|
| 6 |
+
from langchain_core.tools import tool
|
| 7 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 8 |
+
from langgraph.prebuilt import ToolNode
|
| 9 |
+
import os
|
| 10 |
+
from dotenv import load_dotenv
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
from tavily import TavilyClient
|
| 13 |
+
from langfuse.callback import CallbackHandler
|
| 14 |
+
import requests
|
| 15 |
+
import json
|
| 16 |
+
import time
|
| 17 |
+
from daytona_sdk import Daytona, DaytonaConfig
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# Load environment variablesTuple
|
| 22 |
+
load_dotenv()
|
| 23 |
+
|
| 24 |
+
# Define the state schema with messages that ToolNode can use
|
| 25 |
+
class AgentState(TypedDict):
|
| 26 |
+
messages: List
|
| 27 |
+
current_question: str
|
| 28 |
+
final_answer: str
|
| 29 |
+
validation_result: str
|
| 30 |
+
worker_iterations: int
|
| 31 |
+
supervisor_satisfaction: bool
|
| 32 |
+
validator_approval: bool
|
| 33 |
+
|
| 34 |
+
# Define tools following Langgraph guide
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@tool
|
| 38 |
+
def search_web_tavily(query: str) -> str:
|
| 39 |
+
"""Search the web for information using the Tavily search API."""
|
| 40 |
+
# Initialize the Tavily client with API key from environment variables
|
| 41 |
+
client = TavilyClient(os.getenv("TAVILY_API_KEY"))
|
| 42 |
+
|
| 43 |
+
# Perform the search
|
| 44 |
+
response = client.search(query=query)
|
| 45 |
+
|
| 46 |
+
# Process the results into a readable format
|
| 47 |
+
results = []
|
| 48 |
+
for i, result in enumerate(response.get("results", []), 1):
|
| 49 |
+
results.append(f"{i}. {result.get('title')}\n URL: {result.get('url')}\n {result.get('content')}\n")
|
| 50 |
+
|
| 51 |
+
# Format the final response
|
| 52 |
+
formatted_response = f"Search results for '{query}':\n\n" + "\n".join(results)
|
| 53 |
+
|
| 54 |
+
return formatted_response
|
| 55 |
+
|
| 56 |
+
@tool
|
| 57 |
+
def search_web_serper(query: str, result_limit: int = 5, search_type: str = "search") -> str:
|
| 58 |
+
"""Search the web for information using the Serper.dev API.
|
| 59 |
+
|
| 60 |
+
This tool provides comprehensive search results including:
|
| 61 |
+
1. Knowledge Graph data when available (title, description, attributes)
|
| 62 |
+
2. Organic search results (titles, links, snippets)
|
| 63 |
+
3. Related questions from "People Also Ask" section
|
| 64 |
+
4. Top stories and news articles related to the query
|
| 65 |
+
|
| 66 |
+
It's particularly useful for gathering factual information, current events,
|
| 67 |
+
and general knowledge from across the web. The results are formatted in a
|
| 68 |
+
readable structure with clear sections.
|
| 69 |
+
|
| 70 |
+
Parameters:
|
| 71 |
+
- query: The search query string
|
| 72 |
+
- result_limit: Maximum number of results to return per section (default: 5)
|
| 73 |
+
- search_type: Type of search ('search', 'news', 'places', 'images', 'shopping')
|
| 74 |
+
"""
|
| 75 |
+
# API URL and headers setup
|
| 76 |
+
url = "https://google.serper.dev/search"
|
| 77 |
+
headers = {
|
| 78 |
+
'X-API-KEY': os.getenv("SERPER_API_KEY"),
|
| 79 |
+
'Content-Type': 'application/json'
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
# Prepare the payload with the query and search type
|
| 83 |
+
payload = json.dumps({
|
| 84 |
+
"q": query,
|
| 85 |
+
"type": search_type
|
| 86 |
+
})
|
| 87 |
+
|
| 88 |
+
try:
|
| 89 |
+
# Make the API request
|
| 90 |
+
response = requests.request("POST", url, headers=headers, data=payload, timeout=30)
|
| 91 |
+
response.raise_for_status() # Raise exception for HTTP errors
|
| 92 |
+
|
| 93 |
+
# Parse the JSON response
|
| 94 |
+
data = response.json()
|
| 95 |
+
|
| 96 |
+
# Format the results
|
| 97 |
+
results = []
|
| 98 |
+
|
| 99 |
+
# Add knowledge graph if available
|
| 100 |
+
if "knowledgeGraph" in data:
|
| 101 |
+
kg = data["knowledgeGraph"]
|
| 102 |
+
results.append(f"Knowledge Graph:\n{kg.get('title', 'Unknown')} - {kg.get('type', '')}")
|
| 103 |
+
results.append(f"Description: {kg.get('description', 'No description available')}")
|
| 104 |
+
|
| 105 |
+
if "attributes" in kg:
|
| 106 |
+
results.append("Attributes:")
|
| 107 |
+
for key, value in kg["attributes"].items():
|
| 108 |
+
results.append(f"- {key}: {value}")
|
| 109 |
+
|
| 110 |
+
results.append("") # Empty line for separation
|
| 111 |
+
|
| 112 |
+
# Add organic search results
|
| 113 |
+
if "organic" in data:
|
| 114 |
+
results.append("Organic Search Results:")
|
| 115 |
+
for i, result in enumerate(data["organic"][:result_limit], 1):
|
| 116 |
+
results.append(f"{i}. {result.get('title', 'No title')}")
|
| 117 |
+
results.append(f" URL: {result.get('link', 'No link')}")
|
| 118 |
+
results.append(f" {result.get('snippet', 'No snippet')}")
|
| 119 |
+
results.append("") # Empty line for separation
|
| 120 |
+
|
| 121 |
+
# Add people also ask if available
|
| 122 |
+
if "peopleAlsoAsk" in data and data["peopleAlsoAsk"]:
|
| 123 |
+
results.append("People Also Ask:")
|
| 124 |
+
for i, qa in enumerate(data["peopleAlsoAsk"][:min(3, result_limit)], 1):
|
| 125 |
+
results.append(f"{i}. Q: {qa.get('question', 'No question')}")
|
| 126 |
+
results.append(f" A: {qa.get('snippet', 'No answer')}")
|
| 127 |
+
results.append("") # Empty line for separation
|
| 128 |
+
|
| 129 |
+
# Add top stories if available
|
| 130 |
+
if "topStories" in data and data["topStories"]:
|
| 131 |
+
results.append("Top Stories:")
|
| 132 |
+
for i, story in enumerate(data["topStories"][:min(3, result_limit)], 1):
|
| 133 |
+
results.append(f"{i}. {story.get('title', 'No title')}")
|
| 134 |
+
results.append(f" Source: {story.get('source', 'Unknown source')}")
|
| 135 |
+
if "date" in story:
|
| 136 |
+
results.append(f" Published: {story.get('date')}")
|
| 137 |
+
results.append(f" URL: {story.get('link', 'No link')}")
|
| 138 |
+
results.append("") # Empty line for separation
|
| 139 |
+
|
| 140 |
+
# Format the final response
|
| 141 |
+
formatted_response = f"Search results for '{query}':\n\n" + "\n".join(results)
|
| 142 |
+
|
| 143 |
+
return formatted_response
|
| 144 |
+
|
| 145 |
+
except requests.exceptions.Timeout:
|
| 146 |
+
return f"Error: Request to Serper API timed out after 30 seconds"
|
| 147 |
+
except requests.exceptions.RequestException as e:
|
| 148 |
+
return f"Error making request to Serper API: {str(e)}"
|
| 149 |
+
except json.JSONDecodeError:
|
| 150 |
+
return f"Error: Received invalid JSON response from Serper API"
|
| 151 |
+
except Exception as e:
|
| 152 |
+
return f"Error processing search results: {str(e)}"
|
| 153 |
+
|
| 154 |
+
# Initialize a global Daytona sandbox for reuse
|
| 155 |
+
_daytona_sandbox = None
|
| 156 |
+
|
| 157 |
+
@tool
|
| 158 |
+
def execute_code_securely(code: str, language: str = "python", timeout: int = 300) -> str:
|
| 159 |
+
"""Execute code securely in an isolated sandbox environment using Daytona.
|
| 160 |
+
|
| 161 |
+
This tool runs code in a secure, isolated environment to prevent security issues.
|
| 162 |
+
It's particularly useful for solving computational problems, data processing tasks,
|
| 163 |
+
mathematical calculations, and other scenarios where code execution is needed.
|
| 164 |
+
|
| 165 |
+
The tool supports multiple languages, with Python as the default.
|
| 166 |
+
|
| 167 |
+
Parameters:
|
| 168 |
+
- code: The code to execute
|
| 169 |
+
- language: The programming language (default: "python")
|
| 170 |
+
- timeout: Maximum execution time in seconds (default: 30)
|
| 171 |
+
|
| 172 |
+
Returns:
|
| 173 |
+
- The execution result or error message
|
| 174 |
+
"""
|
| 175 |
+
global _daytona_sandbox
|
| 176 |
+
|
| 177 |
+
try:
|
| 178 |
+
# Initialize Daytona client if not already done
|
| 179 |
+
if _daytona_sandbox is None:
|
| 180 |
+
api_key = os.getenv("DAYTONA_API_KEY")
|
| 181 |
+
if not api_key:
|
| 182 |
+
return "Error: DAYTONA_API_KEY environment variable not set"
|
| 183 |
+
|
| 184 |
+
# Initialize the Daytona client and create a sandbox
|
| 185 |
+
config = DaytonaConfig(api_key=api_key)
|
| 186 |
+
daytona_client = Daytona(config)
|
| 187 |
+
_daytona_sandbox = daytona_client.create()
|
| 188 |
+
|
| 189 |
+
# Execute the code based on the specified language
|
| 190 |
+
if language.lower() == "python":
|
| 191 |
+
response = _daytona_sandbox.process.code_run(code, timeout=timeout)
|
| 192 |
+
else:
|
| 193 |
+
# For non-Python languages, create a temporary file and execute it
|
| 194 |
+
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
|
| 195 |
+
file_extension = {
|
| 196 |
+
"javascript": "js",
|
| 197 |
+
"nodejs": "js",
|
| 198 |
+
"ruby": "rb",
|
| 199 |
+
"php": "php",
|
| 200 |
+
"bash": "sh",
|
| 201 |
+
"shell": "sh",
|
| 202 |
+
"powershell": "ps1",
|
| 203 |
+
"c": "c",
|
| 204 |
+
"cpp": "cpp",
|
| 205 |
+
"java": "java",
|
| 206 |
+
"go": "go",
|
| 207 |
+
"rust": "rs",
|
| 208 |
+
}.get(language.lower(), "txt")
|
| 209 |
+
|
| 210 |
+
filename = f"/tmp/code_{timestamp}.{file_extension}"
|
| 211 |
+
|
| 212 |
+
# Upload the code file to the sandbox
|
| 213 |
+
_daytona_sandbox.fs.upload_file(filename, code.encode('utf-8'))
|
| 214 |
+
|
| 215 |
+
# Prepare the execution command based on language
|
| 216 |
+
exec_cmd = {
|
| 217 |
+
"javascript": f"node {filename}",
|
| 218 |
+
"nodejs": f"node {filename}",
|
| 219 |
+
"ruby": f"ruby {filename}",
|
| 220 |
+
"php": f"php {filename}",
|
| 221 |
+
"bash": f"bash {filename}",
|
| 222 |
+
"shell": f"sh {filename}",
|
| 223 |
+
"powershell": f"pwsh {filename}",
|
| 224 |
+
"c": f"gcc {filename} -o /tmp/prog_{timestamp} && /tmp/prog_{timestamp}",
|
| 225 |
+
"cpp": f"g++ {filename} -o /tmp/prog_{timestamp} && /tmp/prog_{timestamp}",
|
| 226 |
+
"java": f"javac {filename} && java -cp /tmp {os.path.basename(filename).split('.')[0]}",
|
| 227 |
+
"go": f"go run {filename}",
|
| 228 |
+
"rust": f"rustc {filename} -o /tmp/prog_{timestamp} && /tmp/prog_{timestamp}",
|
| 229 |
+
}.get(language.lower(), f"cat {filename}")
|
| 230 |
+
|
| 231 |
+
# Execute the command
|
| 232 |
+
response = _daytona_sandbox.process.exec(exec_cmd, cwd="/tmp", timeout=timeout)
|
| 233 |
+
|
| 234 |
+
# Extract and return the result
|
| 235 |
+
if hasattr(response, 'result'):
|
| 236 |
+
result = response.result
|
| 237 |
+
elif hasattr(response, 'stdout'):
|
| 238 |
+
result = response.stdout
|
| 239 |
+
else:
|
| 240 |
+
result = str(response)
|
| 241 |
+
|
| 242 |
+
return f"Code Execution Result ({language}):\n{result}"
|
| 243 |
+
|
| 244 |
+
except Exception as e:
|
| 245 |
+
# Clean up on error
|
| 246 |
+
try:
|
| 247 |
+
if _daytona_sandbox is not None:
|
| 248 |
+
_daytona_sandbox = None
|
| 249 |
+
except:
|
| 250 |
+
pass
|
| 251 |
+
|
| 252 |
+
return f"Error executing code: {str(e)}"
|
| 253 |
+
|
| 254 |
+
@tool
|
| 255 |
+
def execute_shell_command(command: str, working_dir: str = "/tmp", timeout: int = 300) -> str:
|
| 256 |
+
"""Execute a shell command securely in an isolated sandbox environment using Daytona.
|
| 257 |
+
|
| 258 |
+
This tool runs shell commands in a secure, isolated environment to prevent security issues.
|
| 259 |
+
It's useful for file operations, system tasks, and other command-line operations.
|
| 260 |
+
|
| 261 |
+
Parameters:
|
| 262 |
+
- command: The shell command to execute
|
| 263 |
+
- working_dir: The working directory (default: "/tmp")
|
| 264 |
+
- timeout: Maximum execution time in seconds (default: 30)
|
| 265 |
+
|
| 266 |
+
Returns:
|
| 267 |
+
- The command execution output or error message
|
| 268 |
+
"""
|
| 269 |
+
global _daytona_sandbox
|
| 270 |
+
|
| 271 |
+
try:
|
| 272 |
+
# Initialize Daytona client if not already done
|
| 273 |
+
if _daytona_sandbox is None:
|
| 274 |
+
api_key = os.getenv("DAYTONA_API_KEY")
|
| 275 |
+
if not api_key:
|
| 276 |
+
return "Error: DAYTONA_API_KEY environment variable not set"
|
| 277 |
+
|
| 278 |
+
# Initialize the Daytona client and create a sandbox
|
| 279 |
+
config = DaytonaConfig(api_key=api_key)
|
| 280 |
+
daytona_client = Daytona(config)
|
| 281 |
+
_daytona_sandbox = daytona_client.create()
|
| 282 |
+
|
| 283 |
+
# Execute the command
|
| 284 |
+
response = _daytona_sandbox.process.exec(command, cwd=working_dir, timeout=timeout)
|
| 285 |
+
|
| 286 |
+
# Extract and return the result
|
| 287 |
+
if hasattr(response, 'result'):
|
| 288 |
+
result = response.result
|
| 289 |
+
elif hasattr(response, 'stdout'):
|
| 290 |
+
result = response.stdout
|
| 291 |
+
else:
|
| 292 |
+
result = str(response)
|
| 293 |
+
|
| 294 |
+
return f"Shell Command Execution Result:\n{result}"
|
| 295 |
+
|
| 296 |
+
except Exception as e:
|
| 297 |
+
# Clean up on error
|
| 298 |
+
try:
|
| 299 |
+
if _daytona_sandbox is not None:
|
| 300 |
+
_daytona_sandbox = None
|
| 301 |
+
except:
|
| 302 |
+
pass
|
| 303 |
+
|
| 304 |
+
return f"Error executing shell command: {str(e)}"
|
| 305 |
+
|
| 306 |
+
@tool
|
| 307 |
+
def sandbox_file_operation(operation: str, file_path: str, content: str = "", target_path: str = "") -> str:
|
| 308 |
+
"""Perform file operations in the secure sandbox environment.
|
| 309 |
+
|
| 310 |
+
This tool allows secure file manipulation in an isolated sandbox.
|
| 311 |
+
It supports creating, reading, writing, moving, copying and deleting files.
|
| 312 |
+
|
| 313 |
+
Parameters:
|
| 314 |
+
- operation: The operation to perform ('create', 'read', 'write', 'append', 'delete', 'move', 'copy', 'list')
|
| 315 |
+
- file_path: Path to the file to operate on
|
| 316 |
+
- content: Content to write (for 'create', 'write', 'append' operations)
|
| 317 |
+
- target_path: Target path for 'move' and 'copy' operations
|
| 318 |
+
|
| 319 |
+
Returns:
|
| 320 |
+
- Operation result or file content
|
| 321 |
+
"""
|
| 322 |
+
global _daytona_sandbox
|
| 323 |
+
|
| 324 |
+
try:
|
| 325 |
+
# Initialize Daytona client if not already done
|
| 326 |
+
if _daytona_sandbox is None:
|
| 327 |
+
api_key = os.getenv("DAYTONA_API_KEY")
|
| 328 |
+
if not api_key:
|
| 329 |
+
return "Error: DAYTONA_API_KEY environment variable not set"
|
| 330 |
+
|
| 331 |
+
# Initialize the Daytona client and create a sandbox
|
| 332 |
+
config = DaytonaConfig(api_key=api_key)
|
| 333 |
+
daytona_client = Daytona(config)
|
| 334 |
+
_daytona_sandbox = daytona_client.create()
|
| 335 |
+
|
| 336 |
+
# Perform the requested operation
|
| 337 |
+
operation = operation.lower()
|
| 338 |
+
|
| 339 |
+
if operation == "create" or operation == "write":
|
| 340 |
+
# Create or overwrite file
|
| 341 |
+
_daytona_sandbox.fs.upload_file(file_path, content.encode('utf-8'))
|
| 342 |
+
return f"File {file_path} created/written successfully"
|
| 343 |
+
|
| 344 |
+
elif operation == "append":
|
| 345 |
+
# First try to read the existing content
|
| 346 |
+
try:
|
| 347 |
+
existing_content = _daytona_sandbox.fs.download_file(file_path).decode('utf-8')
|
| 348 |
+
except:
|
| 349 |
+
existing_content = ""
|
| 350 |
+
|
| 351 |
+
# Append new content and write back
|
| 352 |
+
new_content = existing_content + content
|
| 353 |
+
_daytona_sandbox.fs.upload_file(file_path, new_content.encode('utf-8'))
|
| 354 |
+
return f"Content appended to {file_path} successfully"
|
| 355 |
+
|
| 356 |
+
elif operation == "read":
|
| 357 |
+
# Read file content
|
| 358 |
+
try:
|
| 359 |
+
content = _daytona_sandbox.fs.download_file(file_path).decode('utf-8')
|
| 360 |
+
return f"Content of {file_path}:\n{content}"
|
| 361 |
+
except Exception as e:
|
| 362 |
+
return f"Error reading {file_path}: {str(e)}"
|
| 363 |
+
|
| 364 |
+
elif operation == "delete":
|
| 365 |
+
# Delete file
|
| 366 |
+
response = _daytona_sandbox.process.exec(f"rm -f {file_path}", cwd="/tmp")
|
| 367 |
+
return f"File {file_path} deleted"
|
| 368 |
+
|
| 369 |
+
elif operation == "move":
|
| 370 |
+
# Move file
|
| 371 |
+
if not target_path:
|
| 372 |
+
return "Error: Target path required for move operation"
|
| 373 |
+
response = _daytona_sandbox.process.exec(f"mv {file_path} {target_path}", cwd="/tmp")
|
| 374 |
+
return f"File moved from {file_path} to {target_path}"
|
| 375 |
+
|
| 376 |
+
elif operation == "copy":
|
| 377 |
+
# Copy file
|
| 378 |
+
if not target_path:
|
| 379 |
+
return "Error: Target path required for copy operation"
|
| 380 |
+
response = _daytona_sandbox.process.exec(f"cp {file_path} {target_path}", cwd="/tmp")
|
| 381 |
+
return f"File copied from {file_path} to {target_path}"
|
| 382 |
+
|
| 383 |
+
elif operation == "list":
|
| 384 |
+
# List directory contents
|
| 385 |
+
response = _daytona_sandbox.process.exec(f"ls -la {file_path}", cwd="/tmp")
|
| 386 |
+
if hasattr(response, 'result'):
|
| 387 |
+
result = response.result
|
| 388 |
+
elif hasattr(response, 'stdout'):
|
| 389 |
+
result = response.stdout
|
| 390 |
+
else:
|
| 391 |
+
result = str(response)
|
| 392 |
+
return f"Directory listing of {file_path}:\n{result}"
|
| 393 |
+
|
| 394 |
+
else:
|
| 395 |
+
return f"Unsupported operation: {operation}"
|
| 396 |
+
|
| 397 |
+
except Exception as e:
|
| 398 |
+
return f"Error performing file operation: {str(e)}"
|
| 399 |
+
|
| 400 |
+
def cleanup_daytona_sandbox():
|
| 401 |
+
"""Clean up the Daytona sandbox when it's no longer needed."""
|
| 402 |
+
global _daytona_sandbox
|
| 403 |
+
|
| 404 |
+
try:
|
| 405 |
+
if _daytona_sandbox is not None:
|
| 406 |
+
# Get the Daytona client
|
| 407 |
+
api_key = os.getenv("DAYTONA_API_KEY")
|
| 408 |
+
if api_key:
|
| 409 |
+
config = DaytonaConfig(api_key=api_key)
|
| 410 |
+
daytona_client = Daytona(config)
|
| 411 |
+
|
| 412 |
+
# Remove the sandbox
|
| 413 |
+
daytona_client.remove(_daytona_sandbox)
|
| 414 |
+
_daytona_sandbox = None
|
| 415 |
+
print("Daytona sandbox cleaned up successfully")
|
| 416 |
+
except Exception as e:
|
| 417 |
+
print(f"Error cleaning up Daytona sandbox: {str(e)}")
|
| 418 |
+
|
| 419 |
+
# Track last execution time for rate limiting
|
| 420 |
+
_last_extract_url_time = 0
|
| 421 |
+
|
| 422 |
+
@tool
|
| 423 |
+
def extract_document_data(input_method: str, files: list, prompt: str, json_mode: bool = False) -> str:
|
| 424 |
+
"""Extract structured data from documents using Dumpling AI.
|
| 425 |
+
|
| 426 |
+
This tool allows you to extract information from various document formats including PDFs,
|
| 427 |
+
Office documents, images, and many other file types. It uses vision-capable Large Language
|
| 428 |
+
Models (LLMs) to interpret and extract data based on your specific prompt.
|
| 429 |
+
|
| 430 |
+
Parameters:
|
| 431 |
+
- input_method: How to input files, either "url" or "base64"
|
| 432 |
+
- files: List of file URLs or base64-encoded strings depending on input_method
|
| 433 |
+
- prompt: Specific instructions for what data to extract from the document
|
| 434 |
+
- json_mode: Whether to return structured JSON (true) or free text (false)
|
| 435 |
+
|
| 436 |
+
Returns:
|
| 437 |
+
- Extracted data from the document based on your prompt
|
| 438 |
+
|
| 439 |
+
Supported file extensions include PDFs, Word docs, Excel files, PowerPoint, images, HTML, and many others.
|
| 440 |
+
"""
|
| 441 |
+
api_key = os.getenv("DUMPLING_API_KEY")
|
| 442 |
+
if not api_key:
|
| 443 |
+
return "Error: DUMPLING_API_KEY environment variable not set"
|
| 444 |
+
|
| 445 |
+
try:
|
| 446 |
+
url = "https://app.dumplingai.com/api/v1/extract-document"
|
| 447 |
+
headers = {
|
| 448 |
+
"Content-Type": "application/json",
|
| 449 |
+
"Authorization": f"Bearer {api_key}"
|
| 450 |
+
}
|
| 451 |
+
|
| 452 |
+
data = {
|
| 453 |
+
"inputMethod": input_method,
|
| 454 |
+
"files": files,
|
| 455 |
+
"prompt": prompt,
|
| 456 |
+
"jsonMode": json_mode
|
| 457 |
+
}
|
| 458 |
+
|
| 459 |
+
response = requests.post(url, headers=headers, json=data, timeout=120)
|
| 460 |
+
response.raise_for_status()
|
| 461 |
+
|
| 462 |
+
result = response.json()
|
| 463 |
+
|
| 464 |
+
# Format the response in a readable way
|
| 465 |
+
formatted_response = f"Document Extraction Results:\n\n"
|
| 466 |
+
formatted_response += f"Extracted Data:\n{result.get('results', 'No results found')}\n\n"
|
| 467 |
+
formatted_response += f"Pages Processed: {result.get('pages', 'Unknown')}\n"
|
| 468 |
+
formatted_response += f"Files Processed: {result.get('fileCount', 'Unknown')}\n"
|
| 469 |
+
formatted_response += f"Credit Usage: {result.get('creditUsage', 'Unknown')}\n"
|
| 470 |
+
|
| 471 |
+
return formatted_response
|
| 472 |
+
|
| 473 |
+
except requests.exceptions.Timeout:
|
| 474 |
+
return "Error: Request to Dumpling AI API timed out after 120 seconds"
|
| 475 |
+
except requests.exceptions.HTTPError as e:
|
| 476 |
+
error_detail = f"HTTP Error: {e.response.status_code}"
|
| 477 |
+
try:
|
| 478 |
+
error_json = e.response.json()
|
| 479 |
+
error_detail += f" - {error_json.get('detail', error_json)}"
|
| 480 |
+
except:
|
| 481 |
+
error_detail += f" - {e.response.text[:500]}"
|
| 482 |
+
return error_detail
|
| 483 |
+
except requests.exceptions.RequestException as e:
|
| 484 |
+
return f"Error making request to Dumpling AI API: {str(e)}"
|
| 485 |
+
except Exception as e:
|
| 486 |
+
return f"Error extracting document data: {str(e)}"
|
| 487 |
+
|
| 488 |
+
@tool
|
| 489 |
+
def extract_url_content(url: str) -> str:
|
| 490 |
+
"""Extract content from a URL using Diffbot API (supports webpages, articles, PDFs, etc.).
|
| 491 |
+
This function is rate-limited to execute no more frequently than once every 20 seconds."""
|
| 492 |
+
global _last_extract_url_time
|
| 493 |
+
|
| 494 |
+
# Check if we need to wait before executing
|
| 495 |
+
current_time = time.time()
|
| 496 |
+
time_since_last_call = current_time - _last_extract_url_time
|
| 497 |
+
|
| 498 |
+
if time_since_last_call < 20 and _last_extract_url_time > 0:
|
| 499 |
+
# Calculate how long to wait
|
| 500 |
+
wait_time = 20 - time_since_last_call
|
| 501 |
+
print(f"Rate limiting: waiting {wait_time:.2f} seconds before next API call")
|
| 502 |
+
time.sleep(wait_time)
|
| 503 |
+
current_time = time.time() # Update current time after sleeping
|
| 504 |
+
|
| 505 |
+
# Update last execution time
|
| 506 |
+
_last_extract_url_time = current_time
|
| 507 |
+
|
| 508 |
+
# Diffbot token from environment or use the fallback
|
| 509 |
+
token = os.getenv("DIFFBOT_TOKEN")
|
| 510 |
+
if not token:
|
| 511 |
+
return "Error: DIFFBOT_TOKEN environment variable not set"
|
| 512 |
+
|
| 513 |
+
# Set up the API endpoint
|
| 514 |
+
api_url = "https://api.diffbot.com/v3/article"
|
| 515 |
+
|
| 516 |
+
# Parameters for the request
|
| 517 |
+
params = {
|
| 518 |
+
"token": token,
|
| 519 |
+
"url": url
|
| 520 |
+
}
|
| 521 |
+
|
| 522 |
+
try:
|
| 523 |
+
# Make the API request with a timeout
|
| 524 |
+
response = requests.get(api_url, params=params, timeout=30) # 30 second timeout
|
| 525 |
+
response.raise_for_status() # Raise exception for HTTP errors
|
| 526 |
+
|
| 527 |
+
# Parse the response
|
| 528 |
+
data = response.json()
|
| 529 |
+
|
| 530 |
+
# Extract relevant information
|
| 531 |
+
if "objects" in data and len(data["objects"]) > 0:
|
| 532 |
+
obj = data["objects"][0]
|
| 533 |
+
|
| 534 |
+
# Create a formatted result
|
| 535 |
+
result = f"Title: {obj.get('title', 'No title')}\n\n"
|
| 536 |
+
|
| 537 |
+
if "text" in obj:
|
| 538 |
+
result += f"Content:\n{obj.get('text')}\n\n"
|
| 539 |
+
|
| 540 |
+
#if "html" in obj:
|
| 541 |
+
# result += f"HTML Content:\n{obj.get('html')}\n\n"
|
| 542 |
+
|
| 543 |
+
if "categories" in obj and obj["categories"]:
|
| 544 |
+
categories = ", ".join([f"{cat.get('name')} ({cat.get('score', 0):.2f})"
|
| 545 |
+
for cat in obj["categories"]])
|
| 546 |
+
result += f"Categories: {categories}\n"
|
| 547 |
+
|
| 548 |
+
result += f"Source: {obj.get('siteName', 'Unknown')}\n"
|
| 549 |
+
result += f"URL: {obj.get('pageUrl', url)}"
|
| 550 |
+
|
| 551 |
+
return result
|
| 552 |
+
else:
|
| 553 |
+
return f"No content could be extracted from {url}. Response: {data}"
|
| 554 |
+
|
| 555 |
+
except requests.exceptions.Timeout:
|
| 556 |
+
return f"Error: Request to extract content from {url} timed out after 30 seconds"
|
| 557 |
+
except requests.exceptions.RequestException as e:
|
| 558 |
+
return f"Error: Failed to extract content from {url}: {str(e)}"
|
| 559 |
+
except Exception as e:
|
| 560 |
+
return f"Error extracting content from {url}: {str(e)}"
|
| 561 |
+
|
| 562 |
+
class BasicAgent:
|
| 563 |
+
def __init__(self):
|
| 564 |
+
print("BasicAgent initialized.")
|
| 565 |
+
# Initialize the Anthropic models
|
| 566 |
+
# Standard model for supervisor and validator
|
| 567 |
+
self.langfuse_handler = CallbackHandler()
|
| 568 |
+
|
| 569 |
+
self.supervisor_model = ChatAnthropic(
|
| 570 |
+
model="claude-3-7-sonnet-20250219",
|
| 571 |
+
max_tokens=20000,
|
| 572 |
+
anthropic_api_key=os.getenv("ANTHROPIC_API_KEY"),
|
| 573 |
+
temperature=0.6,
|
| 574 |
+
# thinking={
|
| 575 |
+
# "type": "enabled",
|
| 576 |
+
# "budget_tokens": 5000
|
| 577 |
+
# }
|
| 578 |
+
)
|
| 579 |
+
|
| 580 |
+
# Standard model for validator
|
| 581 |
+
self.validator_model = ChatAnthropic(
|
| 582 |
+
model="claude-3-7-sonnet-20250219",
|
| 583 |
+
max_tokens=20000,
|
| 584 |
+
temperature=0.5, # Lower temperature for more consistent validation
|
| 585 |
+
anthropic_api_key=os.getenv("ANTHROPIC_API_KEY")
|
| 586 |
+
)
|
| 587 |
+
|
| 588 |
+
# Tool-enabled model for worker
|
| 589 |
+
self.worker_model_base = ChatAnthropic(
|
| 590 |
+
model="claude-3-7-sonnet-20250219",
|
| 591 |
+
max_tokens=20000,
|
| 592 |
+
temperature=0.75,
|
| 593 |
+
anthropic_api_key=os.getenv("ANTHROPIC_API_KEY")
|
| 594 |
+
)
|
| 595 |
+
|
| 596 |
+
# Initialize tools
|
| 597 |
+
self.tools = [search_web_tavily, search_web_serper, execute_code_securely, execute_shell_command, sandbox_file_operation, extract_document_data, extract_url_content]
|
| 598 |
+
|
| 599 |
+
# Bind tools only to the worker model
|
| 600 |
+
self.worker_model = self.worker_model_base.bind_tools(self.tools)
|
| 601 |
+
|
| 602 |
+
# Create the tool node for executing tools
|
| 603 |
+
self.tool_node = ToolNode(self.tools)
|
| 604 |
+
|
| 605 |
+
# Create the workflow
|
| 606 |
+
self.app = self._create_workflow()
|
| 607 |
+
|
| 608 |
+
def _process_messages_after_tools(self, messages):
|
| 609 |
+
"""Process messages to ensure tool calls and tool results are properly paired.
|
| 610 |
+
This helps prevent the Anthropic error: unexpected `tool_use_id` found in `tool_result` blocks."""
|
| 611 |
+
# Create a mapping of tool_call_id to AIMessage index
|
| 612 |
+
tool_call_map = {}
|
| 613 |
+
for i, msg in enumerate(messages):
|
| 614 |
+
if isinstance(msg, AIMessage) and getattr(msg, "tool_calls", None):
|
| 615 |
+
for tool_call in msg.tool_calls:
|
| 616 |
+
if "id" in tool_call:
|
| 617 |
+
tool_call_map[tool_call["id"]] = i
|
| 618 |
+
|
| 619 |
+
# Filter out ToolMessages that don't have a matching AIMessage with tool_calls
|
| 620 |
+
processed_messages = []
|
| 621 |
+
for i, msg in enumerate(messages):
|
| 622 |
+
if isinstance(msg, ToolMessage) and hasattr(msg, "tool_call_id"):
|
| 623 |
+
# Only include if there is a matching AIMessage with this tool_call_id
|
| 624 |
+
if msg.tool_call_id in tool_call_map:
|
| 625 |
+
ai_msg_index = tool_call_map[msg.tool_call_id]
|
| 626 |
+
# Make sure this tool message comes right after its AIMessage
|
| 627 |
+
if i > ai_msg_index and not any(
|
| 628 |
+
isinstance(messages[j], ToolMessage) and
|
| 629 |
+
hasattr(messages[j], "tool_call_id") and
|
| 630 |
+
messages[j].tool_call_id == msg.tool_call_id
|
| 631 |
+
for j in range(ai_msg_index + 1, i)
|
| 632 |
+
):
|
| 633 |
+
processed_messages.append(msg)
|
| 634 |
+
else:
|
| 635 |
+
processed_messages.append(msg)
|
| 636 |
+
|
| 637 |
+
return processed_messages
|
| 638 |
+
|
| 639 |
+
def _create_workflow(self):
|
| 640 |
+
workflow = StateGraph(AgentState)
|
| 641 |
+
|
| 642 |
+
# Add nodes
|
| 643 |
+
workflow.add_node("supervisor", self._supervisor_agent)
|
| 644 |
+
workflow.add_node("worker", self._worker_agent)
|
| 645 |
+
workflow.add_node("tools", self._handle_tools)
|
| 646 |
+
workflow.add_node("validator", self._validation_agent)
|
| 647 |
+
|
| 648 |
+
# Add edges using the START and END constants
|
| 649 |
+
workflow.add_edge(START, "supervisor")
|
| 650 |
+
|
| 651 |
+
# All nodes use Command to specify their next destination, so we don't need conditional edges
|
| 652 |
+
# Each node's Command(goto=...) specifies the next node
|
| 653 |
+
|
| 654 |
+
# Compile the graph
|
| 655 |
+
return workflow.compile()
|
| 656 |
+
|
| 657 |
+
def _supervisor_agent(self, state: AgentState) -> Command:
|
| 658 |
+
"""Supervisor agent that coordinates the workflow."""
|
| 659 |
+
# Get the question from state
|
| 660 |
+
current_question = state["current_question"]
|
| 661 |
+
messages = state["messages"]
|
| 662 |
+
worker_iterations = state.get("worker_iterations", 0)
|
| 663 |
+
|
| 664 |
+
# If we have messages and this isn't the first iteration, evaluate worker's response
|
| 665 |
+
if messages and worker_iterations > 0:
|
| 666 |
+
# Find the last worker response
|
| 667 |
+
worker_response = None
|
| 668 |
+
for msg in reversed(messages):
|
| 669 |
+
if isinstance(msg, AIMessage) and not getattr(msg, "tool_calls", None):
|
| 670 |
+
worker_response = msg.content
|
| 671 |
+
break
|
| 672 |
+
|
| 673 |
+
if worker_response:
|
| 674 |
+
# Evaluate the worker's response
|
| 675 |
+
eval_prompt = ChatPromptTemplate.from_messages([
|
| 676 |
+
("system", """You are a supervisor agent evaluating a worker's research report about user's question.
|
| 677 |
+
Analyze whether the report with answer completely and accurately answers the question.
|
| 678 |
+
|
| 679 |
+
Your evaluation criteria:
|
| 680 |
+
- Completeness: Does the answer address all aspects of the question?
|
| 681 |
+
- Accuracy: Are the facts, references and reasoning correct?
|
| 682 |
+
- Path clarity: Is the path to the answer logical and well-explained?
|
| 683 |
+
- Evidence quality: Are the references reliable and directly relevant?
|
| 684 |
+
|
| 685 |
+
Worker has access to search and web content extraction tools, also python code execution tool.
|
| 686 |
+
|
| 687 |
+
Tasks given to You are not casual questions by random humans, but tricky contest puzzles that test LLM capabilities.
|
| 688 |
+
|
| 689 |
+
If all criteria are met, respond with "SATISFIED".
|
| 690 |
+
If any criteria are not met, respond with "UNSATISFIED: [specific detailed feedback]".
|
| 691 |
+
Be precise in your feedback so the worker knows exactly what to improve."""),
|
| 692 |
+
("human", f"Question: {current_question}\nWorker's report with answer: {worker_response}")
|
| 693 |
+
])
|
| 694 |
+
|
| 695 |
+
evaluation = self.supervisor_model.invoke(eval_prompt.format_prompt().to_messages()).content
|
| 696 |
+
|
| 697 |
+
# Determine if supervisor is satisfied
|
| 698 |
+
supervisor_satisfaction = evaluation.startswith("SATISFIED")
|
| 699 |
+
|
| 700 |
+
if supervisor_satisfaction:
|
| 701 |
+
# If satisfied, prepare to move to validator
|
| 702 |
+
return Command(
|
| 703 |
+
goto="validator",
|
| 704 |
+
update={
|
| 705 |
+
"supervisor_satisfaction": True
|
| 706 |
+
}
|
| 707 |
+
)
|
| 708 |
+
else:
|
| 709 |
+
# If not satisfied, give feedback to worker
|
| 710 |
+
feedback = evaluation.replace("UNSATISFIED: ", "")
|
| 711 |
+
|
| 712 |
+
prompt = ChatPromptTemplate.from_messages([
|
| 713 |
+
("system", """You are a supervisor agent providing targeted feedback to the worker agent.
|
| 714 |
+
|
| 715 |
+
Your role is to guide the worker to improve their research report by:
|
| 716 |
+
1) Highlighting specific areas that need improvement
|
| 717 |
+
2) Providing clear, actionable guidance on what additional research is needed
|
| 718 |
+
3) Explaining exactly how the worker should revise their approach
|
| 719 |
+
4) Reminding them of any specific formatting requirements in the original question
|
| 720 |
+
|
| 721 |
+
Worker has access to the following tools:
|
| 722 |
+
- Web search (using Tavily and Serper)
|
| 723 |
+
- Web content extraction
|
| 724 |
+
- Secure code execution (for Python and other languages)
|
| 725 |
+
- Secure shell command execution
|
| 726 |
+
- Secure file operations
|
| 727 |
+
|
| 728 |
+
For computational puzzles, math problems, data processing, or tasks requiring exact precision,
|
| 729 |
+
recommend using the code execution tools rather than relying on reasoning alone.
|
| 730 |
+
|
| 731 |
+
Tasks given to You are not casual questions by random humans, but tricky contest puzzles that test LLM capabilities.
|
| 732 |
+
|
| 733 |
+
Focus on being constructive and precise. The worker should understand exactly what to do next."""),
|
| 734 |
+
("human", f"Question: {current_question}\nWorker's current response: {worker_response}\nImprovement needed: {feedback}")
|
| 735 |
+
])
|
| 736 |
+
|
| 737 |
+
feedback_message = self.supervisor_model.invoke(prompt.format_prompt().to_messages()).content
|
| 738 |
+
|
| 739 |
+
# Update messages with feedback and increment worker iterations
|
| 740 |
+
return Command(
|
| 741 |
+
goto="worker",
|
| 742 |
+
update={
|
| 743 |
+
"messages": messages + [HumanMessage(content=feedback_message)],
|
| 744 |
+
"worker_iterations": worker_iterations + 1,
|
| 745 |
+
"supervisor_satisfaction": False
|
| 746 |
+
}
|
| 747 |
+
)
|
| 748 |
+
|
| 749 |
+
# First iteration, provide initial instructions
|
| 750 |
+
prompt = ChatPromptTemplate.from_messages([
|
| 751 |
+
("system", """You are a supervisor agent responsible for coordinating a research workflow.
|
| 752 |
+
|
| 753 |
+
Your responsibilities:
|
| 754 |
+
1) Analyze the question to identify required knowledge, tools, and research strategy
|
| 755 |
+
2) Provide clear, specific instructions to the worker agent
|
| 756 |
+
3) Specify exactly what information to gather and what analysis to perform
|
| 757 |
+
|
| 758 |
+
The worker will prepare a concise research report containing:
|
| 759 |
+
1) Their research path - the logical sequence of steps taken to reach the answer
|
| 760 |
+
2) The specific references used with clear citations
|
| 761 |
+
3) A proposed final answer formatted EXACTLY as requested in the question in separate section
|
| 762 |
+
|
| 763 |
+
Worker has access to the following powerful tools:
|
| 764 |
+
- Web search (using Tavily and Serper)
|
| 765 |
+
- Web content extraction
|
| 766 |
+
- Secure code execution (for Python and other languages)
|
| 767 |
+
- Secure shell command execution
|
| 768 |
+
- Secure file operations
|
| 769 |
+
|
| 770 |
+
You must understand LLM limitations of solving puzzles that can be solved only by code execution,
|
| 771 |
+
for example math problems, word character flipping, counting and similar tasks that typically plain LLM will fail at.
|
| 772 |
+
|
| 773 |
+
In case of such tasks, worker should use the code execution tools to solve the puzzle.
|
| 774 |
+
|
| 775 |
+
Tasks given to You are not casual questions by random humans, but tricky contest puzzles that test LLM capabilities.
|
| 776 |
+
|
| 777 |
+
Worker should give You full report with all sections for You to evaluate."""
|
| 778 |
+
),
|
| 779 |
+
("human", current_question)
|
| 780 |
+
])
|
| 781 |
+
|
| 782 |
+
response = self.supervisor_model.invoke(prompt.format_prompt().to_messages()).content
|
| 783 |
+
|
| 784 |
+
# Use Command pattern to update state and move to worker
|
| 785 |
+
return Command(
|
| 786 |
+
goto="worker",
|
| 787 |
+
update={
|
| 788 |
+
"messages": [HumanMessage(content=current_question), AIMessage(content=response)],
|
| 789 |
+
"worker_iterations": 1,
|
| 790 |
+
"supervisor_satisfaction": False
|
| 791 |
+
}
|
| 792 |
+
)
|
| 793 |
+
|
| 794 |
+
def _worker_agent(self, state: AgentState) -> Command:
|
| 795 |
+
"""Worker agent that performs the actual work using tools when needed."""
|
| 796 |
+
messages = state["messages"]
|
| 797 |
+
|
| 798 |
+
# Process messages to ensure proper tool call-result pairing
|
| 799 |
+
processed_messages = self._process_messages_after_tools(messages)
|
| 800 |
+
|
| 801 |
+
# Filter out any ToolMessages that don't have a corresponding AIMessage with tool_calls
|
| 802 |
+
# This helps prevent the "unexpected tool_use_id" error with Anthropic
|
| 803 |
+
filtered_messages = []
|
| 804 |
+
tool_call_ids = set()
|
| 805 |
+
|
| 806 |
+
# First pass: collect all tool_call_ids from AIMessages
|
| 807 |
+
for msg in processed_messages:
|
| 808 |
+
if isinstance(msg, AIMessage) and getattr(msg, "tool_calls", None):
|
| 809 |
+
for tool_call in msg.tool_calls:
|
| 810 |
+
if "id" in tool_call:
|
| 811 |
+
tool_call_ids.add(tool_call["id"])
|
| 812 |
+
|
| 813 |
+
# Second pass: only include ToolMessages that have a corresponding tool_call_id
|
| 814 |
+
for msg in processed_messages:
|
| 815 |
+
if isinstance(msg, ToolMessage) and getattr(msg, "tool_call_id", None):
|
| 816 |
+
if msg.tool_call_id in tool_call_ids:
|
| 817 |
+
filtered_messages.append(msg)
|
| 818 |
+
else:
|
| 819 |
+
filtered_messages.append(msg)
|
| 820 |
+
|
| 821 |
+
# If messages exist, use them directly with the tool-enabled model
|
| 822 |
+
response = self.worker_model.invoke(filtered_messages)
|
| 823 |
+
|
| 824 |
+
# Update messages - add the response to the original messages
|
| 825 |
+
# We don't want to lose the original message history
|
| 826 |
+
updated_messages = messages + [response]
|
| 827 |
+
|
| 828 |
+
# Determine next step using Command pattern
|
| 829 |
+
if response.tool_calls:
|
| 830 |
+
# If tool calls are present, go to tools
|
| 831 |
+
return Command(
|
| 832 |
+
goto="tools",
|
| 833 |
+
update={"messages": updated_messages}
|
| 834 |
+
)
|
| 835 |
+
else:
|
| 836 |
+
# No tool calls, return to supervisor for evaluation
|
| 837 |
+
return Command(
|
| 838 |
+
goto="supervisor",
|
| 839 |
+
update={"messages": updated_messages}
|
| 840 |
+
)
|
| 841 |
+
|
| 842 |
+
def _validation_agent(self, state: AgentState) -> Command:
|
| 843 |
+
"""Agent that validates the final answer."""
|
| 844 |
+
messages = state["messages"]
|
| 845 |
+
question = state["current_question"]
|
| 846 |
+
|
| 847 |
+
# Get the final answer from the last message
|
| 848 |
+
final_answer = ""
|
| 849 |
+
for msg in reversed(messages):
|
| 850 |
+
if isinstance(msg, AIMessage) and not getattr(msg, "tool_calls", None):
|
| 851 |
+
final_answer = msg.content
|
| 852 |
+
break
|
| 853 |
+
|
| 854 |
+
prompt = ChatPromptTemplate.from_messages([
|
| 855 |
+
("system", """You are a quality assurance agent responsible for final verification of research reports and precise formatting of final answers.
|
| 856 |
+
|
| 857 |
+
Your critical responsibilities:
|
| 858 |
+
1) Verify the factual accuracy and completeness of the report, ensuring you can extract and format the final answer exactly as requested in the question
|
| 859 |
+
2) Ensure EXACT compliance with any formatting instructions in the question by producing a properly structured final answer
|
| 860 |
+
|
| 861 |
+
Pay extremely close attention to formatting requirements. The user may request:
|
| 862 |
+
- Only specific parts of information (first/last names, specific data points, numerical values)
|
| 863 |
+
- Particular ordering (alphabetical, chronological, size-based, relevance-based)
|
| 864 |
+
- Special formatting (bullet points, numbered lists, specific separators, tables)
|
| 865 |
+
- Exact text case, spacing, punctuation, or other presentational elements
|
| 866 |
+
|
| 867 |
+
Exact formatting compliance is MANDATORY for this challenge evaluation. Your role is to ensure the final answer meets all specified requirements.
|
| 868 |
+
If numerical values are requested, ensure they are formatted as numbers, not text.
|
| 869 |
+
|
| 870 |
+
Remember that the worker had access to:
|
| 871 |
+
- Web search tools
|
| 872 |
+
- Web content extraction
|
| 873 |
+
- Secure code execution
|
| 874 |
+
- Secure shell commands
|
| 875 |
+
- Secure file operations
|
| 876 |
+
|
| 877 |
+
For computational or precision-based questions, check if code execution was appropriately used and validate the results.
|
| 878 |
+
|
| 879 |
+
When evaluating the answer:
|
| 880 |
+
- Check if all required information is present and accurate
|
| 881 |
+
- Verify that the answer directly addresses the specific question asked
|
| 882 |
+
- Ensure any numerical values, dates, names, or technical terms are correct
|
| 883 |
+
- Confirm that the formatting precisely matches what was requested
|
| 884 |
+
- Do not add units to the final answer if not explicitly requested
|
| 885 |
+
- Answers tend to be as short as possible, so do not add extra data unless explicitly requested
|
| 886 |
+
|
| 887 |
+
If the answer report is correct, format it exactly as asked in the question, and respond with:
|
| 888 |
+
"APPROVED: [THE PROPERLY FORMATTED ANSWER]"
|
| 889 |
+
|
| 890 |
+
If there are issues with overall answer quality and you cannot format the final answer as requested, respond with:
|
| 891 |
+
"REJECTED: [DETAILED EXPLANATION OF ISSUES]"
|
| 892 |
+
|
| 893 |
+
Be extremely precise in your evaluation - the success of this task depends on your attention to detail.
|
| 894 |
+
"""
|
| 895 |
+
),
|
| 896 |
+
("human", f"Question: {question}\nReport to validate: {final_answer}")
|
| 897 |
+
])
|
| 898 |
+
validation_result = self.validator_model.invoke(prompt.format_prompt().to_messages()).content
|
| 899 |
+
validator_approval = validation_result.startswith("APPROVED")
|
| 900 |
+
|
| 901 |
+
if validator_approval:
|
| 902 |
+
# Approved - end the workflow
|
| 903 |
+
return Command(
|
| 904 |
+
goto=END,
|
| 905 |
+
update={
|
| 906 |
+
"final_answer": validation_result[10:], # Remove "APPROVED: " prefix
|
| 907 |
+
"validation_result": validation_result,
|
| 908 |
+
"validator_approval": True
|
| 909 |
+
}
|
| 910 |
+
)
|
| 911 |
+
else:
|
| 912 |
+
# Rejected - restart from supervisor with reset state
|
| 913 |
+
return Command(
|
| 914 |
+
goto="supervisor",
|
| 915 |
+
update={
|
| 916 |
+
"messages": [HumanMessage(content=question)],
|
| 917 |
+
"validation_result": validation_result,
|
| 918 |
+
"validator_approval": False,
|
| 919 |
+
"worker_iterations": 0,
|
| 920 |
+
"supervisor_satisfaction": False
|
| 921 |
+
}
|
| 922 |
+
)
|
| 923 |
+
|
| 924 |
+
def _handle_tools(self, state: AgentState) -> Command:
|
| 925 |
+
"""Custom wrapper around ToolNode to ensure proper message handling."""
|
| 926 |
+
# Execute the tool using the tool node
|
| 927 |
+
tool_result = self.tool_node.invoke(state)
|
| 928 |
+
|
| 929 |
+
# Process the result to ensure proper message ordering
|
| 930 |
+
if "messages" in tool_result:
|
| 931 |
+
# Get original messages
|
| 932 |
+
original_messages = state["messages"]
|
| 933 |
+
# Get all existing AIMessages with tool calls and their indices
|
| 934 |
+
ai_indices = {}
|
| 935 |
+
for i, msg in enumerate(original_messages):
|
| 936 |
+
if isinstance(msg, AIMessage) and getattr(msg, "tool_calls", None):
|
| 937 |
+
for tool_call in msg.tool_calls:
|
| 938 |
+
if "id" in tool_call:
|
| 939 |
+
ai_indices[tool_call["id"]] = i
|
| 940 |
+
|
| 941 |
+
# Add the new tool messages, ensuring they come right after their corresponding tool call
|
| 942 |
+
updated_messages = list(original_messages)
|
| 943 |
+
for msg in tool_result["messages"]:
|
| 944 |
+
if isinstance(msg, ToolMessage) and hasattr(msg, "tool_call_id"):
|
| 945 |
+
tool_id = msg.tool_call_id
|
| 946 |
+
if tool_id in ai_indices:
|
| 947 |
+
# Insert after the AIMessage with the matching tool call
|
| 948 |
+
insert_idx = ai_indices[tool_id] + 1
|
| 949 |
+
# Move past any existing tool messages for this AI message
|
| 950 |
+
while insert_idx < len(updated_messages) and \
|
| 951 |
+
isinstance(updated_messages[insert_idx], ToolMessage) and \
|
| 952 |
+
hasattr(updated_messages[insert_idx], "tool_call_id") and \
|
| 953 |
+
updated_messages[insert_idx].tool_call_id != tool_id:
|
| 954 |
+
insert_idx += 1
|
| 955 |
+
updated_messages.insert(insert_idx, msg)
|
| 956 |
+
# Update subsequent indices
|
| 957 |
+
for id in ai_indices:
|
| 958 |
+
if ai_indices[id] >= insert_idx:
|
| 959 |
+
ai_indices[id] += 1
|
| 960 |
+
else:
|
| 961 |
+
# No matching tool call found, just append
|
| 962 |
+
updated_messages.append(msg)
|
| 963 |
+
|
| 964 |
+
return Command(
|
| 965 |
+
goto="worker",
|
| 966 |
+
update={"messages": updated_messages}
|
| 967 |
+
)
|
| 968 |
+
|
| 969 |
+
# If no message updates, just return the state
|
| 970 |
+
return Command(
|
| 971 |
+
goto="worker",
|
| 972 |
+
update=tool_result
|
| 973 |
+
)
|
| 974 |
+
|
| 975 |
+
def __call__(self, question: str) -> str:
|
| 976 |
+
print(f"Agent received question (first 50 chars): {question[:50]}...")
|
| 977 |
+
|
| 978 |
+
# Initialize the state
|
| 979 |
+
initial_state = {
|
| 980 |
+
"messages": [],
|
| 981 |
+
"current_question": question,
|
| 982 |
+
"final_answer": "",
|
| 983 |
+
"validation_result": "",
|
| 984 |
+
"worker_iterations": 0,
|
| 985 |
+
"supervisor_satisfaction": False,
|
| 986 |
+
"validator_approval": False
|
| 987 |
+
}
|
| 988 |
+
|
| 989 |
+
try:
|
| 990 |
+
# Run the workflow
|
| 991 |
+
final_state = self.app.invoke(initial_state, config={"callbacks": [self.langfuse_handler], "recursion_limit": 50})
|
| 992 |
+
|
| 993 |
+
# Return the final answer
|
| 994 |
+
answer = final_state.get("final_answer", "")
|
| 995 |
+
if not answer and final_state["messages"]:
|
| 996 |
+
for msg in reversed(final_state["messages"]):
|
| 997 |
+
if isinstance(msg, AIMessage) and not getattr(msg, "tool_calls", None):
|
| 998 |
+
answer = msg.content
|
| 999 |
+
break
|
| 1000 |
+
|
| 1001 |
+
print(f"Agent returning answer: {answer[:50]}...")
|
| 1002 |
+
return answer
|
| 1003 |
+
except Exception as e:
|
| 1004 |
+
print(f"Error in agent processing: {str(e)}")
|
| 1005 |
+
# Fallback to basic workflow without tool calls if there's an error
|
| 1006 |
+
return f"I encountered an error while processing your question: {str(e)}. Please try reformulating your question."
|
| 1007 |
+
finally:
|
| 1008 |
+
# Clean up resources
|
| 1009 |
+
cleanup_daytona_sandbox()
|
app.py
CHANGED
|
@@ -1,23 +1,20 @@
|
|
| 1 |
import os
|
|
|
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
import requests
|
| 4 |
import inspect
|
| 5 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
-
# (Keep Constants as is)
|
| 8 |
# --- Constants ---
|
| 9 |
-
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 10 |
-
|
| 11 |
-
# --- Basic Agent Definition ---
|
| 12 |
-
# ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
|
| 13 |
-
class BasicAgent:
|
| 14 |
-
def __init__(self):
|
| 15 |
-
print("BasicAgent initialized.")
|
| 16 |
-
def __call__(self, question: str) -> str:
|
| 17 |
-
print(f"Agent received question (first 50 chars): {question[:50]}...")
|
| 18 |
-
fixed_answer = "This is a default answer."
|
| 19 |
-
print(f"Agent returning fixed answer: {fixed_answer}")
|
| 20 |
-
return fixed_answer
|
| 21 |
|
| 22 |
def run_and_submit_all( profile: gr.OAuthProfile | None):
|
| 23 |
"""
|
|
@@ -48,44 +45,105 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 48 |
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
|
| 49 |
print(agent_code)
|
| 50 |
|
| 51 |
-
#
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
print(f"Response text: {response.text[:500]}")
|
| 67 |
-
return f"Error decoding server response for questions: {e}", None
|
| 68 |
-
except Exception as e:
|
| 69 |
-
print(f"An unexpected error occurred fetching questions: {e}")
|
| 70 |
-
return f"An unexpected error occurred fetching questions: {e}", None
|
| 71 |
|
| 72 |
-
#
|
| 73 |
results_log = []
|
| 74 |
answers_payload = []
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
task_id = item.get("task_id")
|
| 78 |
question_text = item.get("question")
|
| 79 |
if not task_id or question_text is None:
|
| 80 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 81 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
try:
|
|
|
|
| 83 |
submitted_answer = agent(question_text)
|
| 84 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 85 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
except Exception as e:
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
if not answers_payload:
|
| 91 |
print("Agent did not produce any answers to submit.")
|
|
@@ -99,17 +157,42 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 99 |
# 5. Submit
|
| 100 |
print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
|
| 101 |
try:
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
final_status = (
|
| 106 |
-
f"Submission Successful!\n"
|
| 107 |
f"User: {result_data.get('username')}\n"
|
| 108 |
f"Overall Score: {result_data.get('score', 'N/A')}% "
|
| 109 |
f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
|
| 110 |
f"Message: {result_data.get('message', 'No message received.')}"
|
| 111 |
)
|
| 112 |
-
print("Submission
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
results_df = pd.DataFrame(results_log)
|
| 114 |
return final_status, results_df
|
| 115 |
except requests.exceptions.HTTPError as e:
|
|
@@ -139,6 +222,24 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 139 |
results_df = pd.DataFrame(results_log)
|
| 140 |
return status_message, results_df
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
# --- Build Gradio Interface using Blocks ---
|
| 144 |
with gr.Blocks() as demo:
|
|
@@ -190,6 +291,18 @@ if __name__ == "__main__":
|
|
| 190 |
else:
|
| 191 |
print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
|
| 192 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
print("-"*(60 + len(" App Starting ")) + "\n")
|
| 194 |
|
| 195 |
print("Launching Gradio Interface for Basic Agent Evaluation...")
|
|
|
|
| 1 |
import os
|
| 2 |
+
import json
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
import gradio as gr
|
| 5 |
import requests
|
| 6 |
import inspect
|
| 7 |
import pandas as pd
|
| 8 |
+
from agent import BasicAgent
|
| 9 |
+
import time
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
|
| 12 |
+
# Load environment variables from .env file
|
| 13 |
+
load_dotenv()
|
| 14 |
|
|
|
|
| 15 |
# --- Constants ---
|
| 16 |
+
DEFAULT_API_URL = os.getenv('DEFAULT_API_URL', "https://agents-course-unit4-scoring.hf.space")
|
| 17 |
+
CHECKPOINT_FILE = "agent_checkpoint.json"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
def run_and_submit_all( profile: gr.OAuthProfile | None):
|
| 20 |
"""
|
|
|
|
| 45 |
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
|
| 46 |
print(agent_code)
|
| 47 |
|
| 48 |
+
# Check for existing checkpoint
|
| 49 |
+
checkpoint_data = None
|
| 50 |
+
if os.path.exists(CHECKPOINT_FILE):
|
| 51 |
+
try:
|
| 52 |
+
with open(CHECKPOINT_FILE, 'r') as f:
|
| 53 |
+
checkpoint_data = json.load(f)
|
| 54 |
+
print(f"Found checkpoint with {len(checkpoint_data.get('questions', []))} questions and {len(checkpoint_data.get('answers', []))} answers")
|
| 55 |
+
except Exception as e:
|
| 56 |
+
print(f"Error loading checkpoint: {e}")
|
| 57 |
+
# If checkpoint is corrupt, remove it
|
| 58 |
+
try:
|
| 59 |
+
os.remove(CHECKPOINT_FILE)
|
| 60 |
+
except:
|
| 61 |
+
pass
|
| 62 |
+
checkpoint_data = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
+
# Initialize results tracking
|
| 65 |
results_log = []
|
| 66 |
answers_payload = []
|
| 67 |
+
|
| 68 |
+
if checkpoint_data:
|
| 69 |
+
# If we have a checkpoint, use it
|
| 70 |
+
questions_data = checkpoint_data.get('questions', [])
|
| 71 |
+
# Load any answers we already have
|
| 72 |
+
existing_answers = checkpoint_data.get('answers', [])
|
| 73 |
+
existing_answers_dict = {a.get('task_id'): a.get('submitted_answer') for a in existing_answers}
|
| 74 |
+
print(f"Loaded {len(existing_answers)} existing answers from checkpoint")
|
| 75 |
+
|
| 76 |
+
# Load existing results log
|
| 77 |
+
if 'results_log' in checkpoint_data:
|
| 78 |
+
results_log = checkpoint_data.get('results_log', [])
|
| 79 |
+
|
| 80 |
+
# We'll use the checkpoint data
|
| 81 |
+
print(f"Resuming from checkpoint with {len(questions_data)} questions")
|
| 82 |
+
else:
|
| 83 |
+
# 2. Fetch Questions from server
|
| 84 |
+
print(f"Fetching questions from: {questions_url}")
|
| 85 |
+
try:
|
| 86 |
+
response = requests.get(questions_url, timeout=15)
|
| 87 |
+
response.raise_for_status()
|
| 88 |
+
questions_data = response.json()
|
| 89 |
+
if not questions_data:
|
| 90 |
+
print("Fetched questions list is empty.")
|
| 91 |
+
return "Fetched questions list is empty or invalid format.", None
|
| 92 |
+
print(f"Fetched {len(questions_data)} questions.")
|
| 93 |
+
|
| 94 |
+
# Save questions to checkpoint immediately
|
| 95 |
+
save_checkpoint(questions_data, [], username, [])
|
| 96 |
+
|
| 97 |
+
# No existing answers
|
| 98 |
+
existing_answers_dict = {}
|
| 99 |
+
|
| 100 |
+
except requests.exceptions.RequestException as e:
|
| 101 |
+
print(f"Error fetching questions: {e}")
|
| 102 |
+
return f"Error fetching questions: {e}", None
|
| 103 |
+
except requests.exceptions.JSONDecodeError as e:
|
| 104 |
+
print(f"Error decoding JSON response from questions endpoint: {e}")
|
| 105 |
+
print(f"Response text: {response.text[:500]}")
|
| 106 |
+
return f"Error decoding server response for questions: {e}", None
|
| 107 |
+
except Exception as e:
|
| 108 |
+
print(f"An unexpected error occurred fetching questions: {e}")
|
| 109 |
+
return f"An unexpected error occurred fetching questions: {e}", None
|
| 110 |
+
|
| 111 |
+
# 3. Run your Agent on questions we haven't answered yet
|
| 112 |
+
print(f"Running agent on questions...")
|
| 113 |
+
for idx, item in enumerate(questions_data):
|
| 114 |
task_id = item.get("task_id")
|
| 115 |
question_text = item.get("question")
|
| 116 |
if not task_id or question_text is None:
|
| 117 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 118 |
continue
|
| 119 |
+
|
| 120 |
+
# Skip if we already have an answer for this question
|
| 121 |
+
if task_id in existing_answers_dict:
|
| 122 |
+
submitted_answer = existing_answers_dict[task_id]
|
| 123 |
+
print(f"Using cached answer for task_id {task_id}")
|
| 124 |
+
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 125 |
+
|
| 126 |
+
# Check if we already have this in results_log
|
| 127 |
+
if not any(r.get("Task ID") == task_id for r in results_log):
|
| 128 |
+
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
| 129 |
+
|
| 130 |
+
continue
|
| 131 |
+
|
| 132 |
try:
|
| 133 |
+
print(f"Processing question {idx+1}/{len(questions_data)}: {task_id}")
|
| 134 |
submitted_answer = agent(question_text)
|
| 135 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 136 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
| 137 |
+
|
| 138 |
+
# Save checkpoint after each answer
|
| 139 |
+
save_checkpoint(questions_data, answers_payload, username, results_log)
|
| 140 |
+
|
| 141 |
except Exception as e:
|
| 142 |
+
print(f"Error running agent on task {task_id}: {e}")
|
| 143 |
+
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
|
| 144 |
+
|
| 145 |
+
# Save checkpoint even if there was an error
|
| 146 |
+
save_checkpoint(questions_data, answers_payload, username, results_log)
|
| 147 |
|
| 148 |
if not answers_payload:
|
| 149 |
print("Agent did not produce any answers to submit.")
|
|
|
|
| 157 |
# 5. Submit
|
| 158 |
print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
|
| 159 |
try:
|
| 160 |
+
# Check if we're in production mode
|
| 161 |
+
is_production = os.getenv('PRODUCTION_RUN', 'FALSE').upper() == 'TRUE'
|
| 162 |
+
|
| 163 |
+
if is_production:
|
| 164 |
+
print("Running in PRODUCTION mode - making actual submission")
|
| 165 |
+
response = requests.post(submit_url, json=submission_data, timeout=60)
|
| 166 |
+
response.raise_for_status()
|
| 167 |
+
result_data = response.json()
|
| 168 |
+
else:
|
| 169 |
+
print("Running in SIMULATION mode - generating mock response")
|
| 170 |
+
# Simulate a successful response
|
| 171 |
+
result_data = {
|
| 172 |
+
"username": username,
|
| 173 |
+
"score": 85,
|
| 174 |
+
"correct_count": len(answers_payload) - 2, # Simulate some incorrect answers
|
| 175 |
+
"total_attempted": len(answers_payload),
|
| 176 |
+
"message": "Simulation mode: This is a mock response"
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
final_status = (
|
| 180 |
+
f"Submission {'Successful' if is_production else 'Simulated'}!\n"
|
| 181 |
f"User: {result_data.get('username')}\n"
|
| 182 |
f"Overall Score: {result_data.get('score', 'N/A')}% "
|
| 183 |
f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
|
| 184 |
f"Message: {result_data.get('message', 'No message received.')}"
|
| 185 |
)
|
| 186 |
+
print(f"Submission {'completed' if is_production else 'simulated'} successfully.")
|
| 187 |
+
|
| 188 |
+
# Delete checkpoint file after successful submission
|
| 189 |
+
if os.path.exists(CHECKPOINT_FILE):
|
| 190 |
+
try:
|
| 191 |
+
os.remove(CHECKPOINT_FILE)
|
| 192 |
+
print(f"Checkpoint file removed after successful submission")
|
| 193 |
+
except Exception as e:
|
| 194 |
+
print(f"Warning: Could not remove checkpoint file: {e}")
|
| 195 |
+
|
| 196 |
results_df = pd.DataFrame(results_log)
|
| 197 |
return final_status, results_df
|
| 198 |
except requests.exceptions.HTTPError as e:
|
|
|
|
| 222 |
results_df = pd.DataFrame(results_log)
|
| 223 |
return status_message, results_df
|
| 224 |
|
| 225 |
+
def save_checkpoint(questions_data, answers_payload, username, results_log):
|
| 226 |
+
"""Save checkpoint data to a local file."""
|
| 227 |
+
try:
|
| 228 |
+
checkpoint_data = {
|
| 229 |
+
'questions': questions_data,
|
| 230 |
+
'answers': answers_payload,
|
| 231 |
+
'username': username,
|
| 232 |
+
'timestamp': time.time(),
|
| 233 |
+
'results_log': results_log
|
| 234 |
+
}
|
| 235 |
+
|
| 236 |
+
with open(CHECKPOINT_FILE, 'w') as f:
|
| 237 |
+
json.dump(checkpoint_data, f)
|
| 238 |
+
|
| 239 |
+
print(f"Checkpoint saved with {len(questions_data)} questions and {len(answers_payload)} answers")
|
| 240 |
+
except Exception as e:
|
| 241 |
+
print(f"Error saving checkpoint: {e}")
|
| 242 |
+
|
| 243 |
|
| 244 |
# --- Build Gradio Interface using Blocks ---
|
| 245 |
with gr.Blocks() as demo:
|
|
|
|
| 291 |
else:
|
| 292 |
print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
|
| 293 |
|
| 294 |
+
# Check for existing checkpoint
|
| 295 |
+
if os.path.exists(CHECKPOINT_FILE):
|
| 296 |
+
try:
|
| 297 |
+
with open(CHECKPOINT_FILE, 'r') as f:
|
| 298 |
+
checkpoint_data = json.load(f)
|
| 299 |
+
print(f"✅ Checkpoint found with {len(checkpoint_data.get('questions', []))} questions and {len(checkpoint_data.get('answers', []))} answers")
|
| 300 |
+
print(f" Created at: {datetime.fromtimestamp(checkpoint_data.get('timestamp', 0)).strftime('%Y-%m-%d %H:%M:%S')}")
|
| 301 |
+
except Exception as e:
|
| 302 |
+
print(f"⚠️ Checkpoint file exists but could not be read: {e}")
|
| 303 |
+
else:
|
| 304 |
+
print("ℹ️ No checkpoint file found. Will start fresh.")
|
| 305 |
+
|
| 306 |
print("-"*(60 + len(" App Starting ")) + "\n")
|
| 307 |
|
| 308 |
print("Launching Gradio Interface for Basic Agent Evaluation...")
|
requirements.txt
CHANGED
|
@@ -1,2 +1,10 @@
|
|
| 1 |
gradio
|
| 2 |
-
requests
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
gradio
|
| 2 |
+
requests
|
| 3 |
+
python-dotenv
|
| 4 |
+
langgraph
|
| 5 |
+
langchain-core
|
| 6 |
+
langchain-anthropic
|
| 7 |
+
anthropic
|
| 8 |
+
python-Levenshtein
|
| 9 |
+
daytona_sdk
|
| 10 |
+
|