# SDR-Arena — benchmark/interface.py
# (Deployed with the SDR-Arena leaderboard; revision f9e2361, verified.)
"""
BaseResearchAgent - The standard interface for DR-Bench agents.
All agents submitted to the benchmark must implement this interface.
The benchmark framework provides standardized LLM and WebSearch clients
to ensure fair comparison (same model, same search provider for all agents).
The agent's job is purely ORCHESTRATION:
- Deciding what to search for
- How to decompose the research topic
- How to synthesize search results into a final report
- How many iterations/turns to use
Example minimal agent:

    class MyAgent(BaseResearchAgent):
        @property
        def name(self) -> str:
            return "my-simple-agent"

        @property
        def description(self) -> str:
            return "Simple single-pass research agent"

        @property
        def author(self) -> str:
            return "Jane Doe"

        async def research(self, topic, llm, websearch, **kwargs):
            # 1. Generate search queries
            queries_response = await llm.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": f"Generate 3 search queries for: {topic}"}],
            )
            queries = queries_response.choices[0].message.content.split("\\n")

            # 2. Search
            results = await websearch.search(queries[:3], **kwargs)

            # 3. Synthesize
            synthesis = await llm.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "system", "content": "Synthesize search results into a report."},
                    {"role": "user", "content": f"Topic: {topic}\\n\\nSearch Results:\\n{results}"},
                ],
            )
            return ResearchOutput(
                report=synthesis.choices[0].message.content,
                searches_made=[{"queries": queries[:3]}],
            )
"""
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any, Optional, TYPE_CHECKING
if TYPE_CHECKING:
from openai import AsyncOpenAI
from benchmark.websearch import BenchmarkWebSearchClient
@dataclass
class ResearchOutput:
    """Output from a research agent."""
    # The final research report text produced by the agent.
    report: str
    # One entry per search the agent performed, e.g. {"queries": [...]}
    # (shape follows the example in the module docstring).
    searches_made: list[dict[str, Any]] = field(default_factory=list)
    # Free-form agent metadata — presumably timings / token counts / notes;
    # no consumer is visible in this file, so the schema is agent-defined.
    metadata: dict[str, Any] = field(default_factory=dict)
class BaseResearchAgent(ABC):
    """
    Abstract base for every DR-Bench research agent.

    The benchmark hands each agent the same standardized pieces:

    - an ``AsyncOpenAI`` client (the benchmark's standardized LLM),
    - a ``BenchmarkWebSearchClient`` (the benchmark's Brightdata-based search),
    - the model name to pass on every LLM call.

    Subclasses supply only the orchestration — how to decompose the topic,
    what to search for, how many turns to take, and how to synthesize the
    final report — and must provide ``name``, ``description``, ``author``,
    and ``research()``. Any Python libraries may be used for that
    orchestration logic (LangGraph, DSPy, custom code, ...) as long as every
    LLM call goes through the provided client and every web search goes
    through the provided websearch client.
    """

    def __init__(self, model_name: str = "gpt-4o"):
        """
        Record the benchmark-assigned model name.

        Args:
            model_name: LLM model identifier supplied by the benchmark;
                every agent runs against the same model for fairness.
        """
        self.model_name = model_name

    @property
    @abstractmethod
    def name(self) -> str:
        """Unique agent identifier (alphanumeric plus hyphens, e.g. 'my-agent-v1')."""
        ...

    @property
    @abstractmethod
    def description(self) -> str:
        """One-line summary of the agent's methodology."""
        ...

    @property
    @abstractmethod
    def author(self) -> str:
        """Author name or team."""
        ...

    @abstractmethod
    async def research(
        self,
        topic: str,
        llm: AsyncOpenAI,
        websearch: BenchmarkWebSearchClient,
        *,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        **kwargs: Any,
    ) -> ResearchOutput:
        """
        Investigate *topic* and produce a research report.

        Args:
            topic: The research prompt / topic to investigate.
            llm: AsyncOpenAI client — route every LLM call through it:
                ``await llm.chat.completions.create(model=self.model_name, ...)``.
            websearch: Web-search client — route every search through it:
                ``await websearch.search(queries, start_date=..., end_date=...)``.
            start_date: Optional search date filter start (YYYY-MM-DD).
            end_date: Optional search date filter end (YYYY-MM-DD).
            **kwargs: Additional benchmark-provided parameters.

        Returns:
            A ResearchOutput holding the report plus search/metadata records.
        """
        ...

    def get_info(self) -> dict[str, str]:
        """Return agent metadata."""
        info = dict(
            name=self.name,
            description=self.description,
            author=self.author,
            model_name=self.model_name,
        )
        return info