# SDR-Arena — benchmark/interface.py
# (Deployed with the SDR-Arena leaderboard; revision f9e2361, verified.)
"""
BaseResearchAgent - The standard interface for DR-Bench agents.
All agents submitted to the benchmark must implement this interface.
The benchmark framework provides standardized LLM and WebSearch clients
to ensure fair comparison (same model, same search provider for all agents).
The agent's job is purely ORCHESTRATION:
- Deciding what to search for
- How to decompose the research topic
- How to synthesize search results into a final report
- How many iterations/turns to use
Example minimal agent:

    class MyAgent(BaseResearchAgent):
        @property
        def name(self) -> str:
            return "my-simple-agent"

        @property
        def description(self) -> str:
            return "Simple single-pass research agent"

        @property
        def author(self) -> str:
            return "Jane Doe"

        async def research(self, topic, llm, websearch, **kwargs):
            # 1. Generate search queries
            queries_response = await llm.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": f"Generate 3 search queries for: {topic}"}],
            )
            queries = queries_response.choices[0].message.content.split("\\n")

            # 2. Search
            results = await websearch.search(queries[:3], **kwargs)

            # 3. Synthesize
            synthesis = await llm.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "system", "content": "Synthesize search results into a report."},
                    {"role": "user", "content": f"Topic: {topic}\\n\\nSearch Results:\\n{results}"},
                ],
            )
            return ResearchOutput(
                report=synthesis.choices[0].message.content,
                searches_made=[{"queries": queries[:3]}],
            )
"""
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any, Optional, TYPE_CHECKING
if TYPE_CHECKING:
from openai import AsyncOpenAI
from benchmark.websearch import BenchmarkWebSearchClient
@dataclass
class ResearchOutput:
    """Output from a research agent."""
    # The final research report text produced by the agent.
    report: str
    # One entry per search the agent performed, e.g. {"queries": [...]}
    # (shape follows the example in the module docstring).
    searches_made: list[dict[str, Any]] = field(default_factory=list)
    # Free-form agent metadata — presumably timings / token counts / notes;
    # no consumer is visible in this file, so the schema is agent-defined.
    metadata: dict[str, Any] = field(default_factory=dict)
class BaseResearchAgent(ABC):
    """
    Abstract base for every DR-Bench research agent.

    The benchmark hands each agent the same standardized pieces:

    - an ``AsyncOpenAI`` client (the benchmark's standardized LLM),
    - a ``BenchmarkWebSearchClient`` (the benchmark's Brightdata-based search),
    - the model name to pass on every LLM call.

    Subclasses supply only the orchestration — how to decompose the topic,
    what to search for, how many turns to take, and how to synthesize the
    final report — and must provide ``name``, ``description``, ``author``,
    and ``research()``. Any Python libraries may be used for that
    orchestration logic (LangGraph, DSPy, custom code, ...) as long as every
    LLM call goes through the provided client and every web search goes
    through the provided websearch client.
    """

    def __init__(self, model_name: str = "gpt-4o"):
        """
        Record the benchmark-assigned model name.

        Args:
            model_name: LLM model identifier supplied by the benchmark;
                every agent runs against the same model for fairness.
        """
        self.model_name = model_name

    @property
    @abstractmethod
    def name(self) -> str:
        """Unique agent identifier (alphanumeric plus hyphens, e.g. 'my-agent-v1')."""
        ...

    @property
    @abstractmethod
    def description(self) -> str:
        """One-line summary of the agent's methodology."""
        ...

    @property
    @abstractmethod
    def author(self) -> str:
        """Author name or team."""
        ...

    @abstractmethod
    async def research(
        self,
        topic: str,
        llm: AsyncOpenAI,
        websearch: BenchmarkWebSearchClient,
        *,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        **kwargs: Any,
    ) -> ResearchOutput:
        """
        Investigate *topic* and produce a research report.

        Args:
            topic: The research prompt / topic to investigate.
            llm: AsyncOpenAI client — route every LLM call through it:
                ``await llm.chat.completions.create(model=self.model_name, ...)``.
            websearch: Web-search client — route every search through it:
                ``await websearch.search(queries, start_date=..., end_date=...)``.
            start_date: Optional search date filter start (YYYY-MM-DD).
            end_date: Optional search date filter end (YYYY-MM-DD).
            **kwargs: Additional benchmark-provided parameters.

        Returns:
            A ResearchOutput holding the report plus search/metadata records.
        """
        ...

    def get_info(self) -> dict[str, str]:
        """Return agent metadata."""
        info = dict(
            name=self.name,
            description=self.description,
            author=self.author,
            model_name=self.model_name,
        )
        return info