iLOVE2D's picture
Upload 2846 files
5374a2d verified
## Dux Distributed Global Search
from .search_base import SearchBase
from .tool import Tool,Toolkit
from ddgs import DDGS
from typing import Dict, Any, List, Optional
import pandas as pd
class SearchDDGS(SearchBase):
    """
    Web search tool built on DDGS (Dux Distributed Global Search).

    DDGS is a metasearch client: the ``backend`` setting selects which engine(s)
    answer the query (DuckDuckGo, Google, Bing, Brave, Yahoo, and others).
    For every hit the tool tries to scrape the linked page and falls back to the
    snippet returned by the search engine when scraping yields nothing.
    """

    def __init__(
        self,
        name: str = "SearchDDGS",
        num_search_pages: Optional[int] = 5,
        max_content_words: Optional[int] = None,
        backend: str = "auto",
        region: str = "us-en",
        **kwargs
    ):
        """
        Set up the DDGS search tool.

        Args:
            name (str): Name of the tool
            num_search_pages (int): Default number of search results to retrieve
            max_content_words (int): Default cap on the words kept per result
            backend (str): Default search backend(s), e.g. "auto", "duckduckgo",
                "google", "bing", "brave", "yahoo"
            region (str): Default search region (e.g., "us-en", "uk-en", "ru-ru")
            **kwargs: Extra keyword arguments forwarded to the parent class
        """
        super().__init__(
            name=name,
            num_search_pages=num_search_pages,
            max_content_words=max_content_words,
            **kwargs
        )
        self.backend = backend
        self.region = region

    def search(self, query: str, num_search_pages: int = None, max_content_words: int = None, backend: str = None, region: str = None) -> Dict[str, Any]:
        """
        Run a DDGS text search and collect page content for each hit.

        Args:
            query (str): The search query.
            num_search_pages (int): Number of search results to retrieve
            max_content_words (int): Maximum number of words kept per result
            backend (str): Search backend to use for this call
            region (str): Search region to use for this call

        Any argument that is falsy (None, 0, "") is replaced by the value
        stored on the instance.

        Returns:
            Dict[str, Any]: ``{"results": [...], "error": ...}``. Each result has
            the keys "title", "content" and "url"; "error" is None on success,
            otherwise a message string (and "results" is empty).
        """
        # Per-call overrides win; falsy values fall back to the instance settings.
        page_limit = num_search_pages or self.num_search_pages
        word_limit = max_content_words or self.max_content_words
        engine = backend or self.backend
        locale = region or self.region

        try:
            # Step 1: query DDGS and materialise the hits
            with DDGS() as client:
                hits = list(
                    client.text(
                        query,
                        max_results=page_limit,
                        backend=engine,
                        region=locale,
                    )
                )

            if not hits:
                return {"results": [], "error": "No search results found."}

            # Step 2: turn each hit into a result entry
            collected = []
            for hit in hits:
                try:
                    entry = self._ddgs_result_to_entry(hit, word_limit)
                except Exception:
                    # Best effort: a hit that cannot be processed is skipped.
                    continue
                if entry is not None:
                    collected.append(entry)

            return {"results": collected, "error": None}
        except Exception as exc:
            return {"results": [], "error": str(exc)}

    def _ddgs_result_to_entry(self, result, max_content_words):
        """Build one result dict from a raw DDGS hit, or return None if it has no content."""
        title = result.get('title', 'No Title')
        # The link may be reported under any of these keys.
        url = result.get('href', '') or result.get('link', '') or result.get('url', '')

        page_text = ''
        if url and url.startswith(('http://', 'https://')):
            # Always try to fetch the real page; any failure means "use the snippet".
            try:
                page_title, page_text = self._scrape_page(url)
                if page_text:
                    title = page_title or title
            except Exception:
                page_text = ''

        # Scraped text when available, otherwise the engine-provided snippet.
        content = page_text if page_text else result.get('body', '')
        if not content:
            return None

        return {
            "title": title,
            # Truncation is delegated to the base class helper.
            "content": self._truncate_content(content, max_content_words),
            "url": url,
        }
class DDGSSearchTool(Tool):
    """Tool wrapper that exposes a SearchDDGS instance under the name "ddgs_search"."""

    name: str = "ddgs_search"
    description: str = "Search using DDGS (Dux Distributed Global Search) which aggregates results from multiple search engines including DuckDuckGo, Google, Bing, and others"
    # Schema of the arguments accepted by __call__.
    inputs: Dict[str, Dict[str, str]] = {
        "query": {
            "type": "string",
            "description": "The search query to execute",
        },
        "num_search_pages": {
            "type": "integer",
            "description": "Number of search results to retrieve. Default: 5",
        },
        "max_content_words": {
            "type": "integer",
            "description": "Maximum number of words to include in content per result. None means no limit. Default: None",
        },
        "backend": {
            "type": "string",
            "description": "Search backend to use. Options: 'auto', 'duckduckgo', 'google', 'bing', 'brave', 'yahoo'. Default: 'auto'",
        },
        "region": {
            "type": "string",
            "description": "Search region (e.g., 'us-en', 'uk-en', 'ru-ru'). Default: 'us-en'",
        },
    }
    required: Optional[List[str]] = ["query"]

    def __init__(self, search_ddgs: SearchDDGS = None):
        """Store the SearchDDGS instance that performs the actual searches."""
        super().__init__()
        self.search_ddgs = search_ddgs

    def __call__(self, query: str, num_search_pages: int = None, max_content_words: int = None, backend: str = None, region: str = None) -> Dict[str, Any]:
        """
        Delegate the search to the wrapped SearchDDGS instance.

        Raises:
            RuntimeError: If no search instance was supplied at construction.

        Returns:
            Dict[str, Any]: The dict returned by ``SearchDDGS.search``; if that
            call raises, a dict with empty "results" and an "error" message.
        """
        searcher = self.search_ddgs
        if not searcher:
            raise RuntimeError("DDGS search instance not initialized")

        try:
            outcome = searcher.search(query, num_search_pages, max_content_words, backend, region)
        except Exception as exc:
            outcome = {"results": [], "error": "Error executing DDGS search: " + str(exc)}
        return outcome
class DDGSSearchToolkit(Toolkit):
    """Toolkit that bundles a single DDGSSearchTool backed by one SearchDDGS instance."""

    def __init__(
        self,
        name: str = "DDGSSearchToolkit",
        num_search_pages: Optional[int] = 5,
        max_content_words: Optional[int] = None,
        backend: str = "auto",
        region: str = "us-en",
        **kwargs
    ):
        """
        Build the search backend and register the tool that uses it.

        Args:
            name (str): Name of the toolkit
            num_search_pages (int): Default number of search results to retrieve
            max_content_words (int): Default cap on the words kept per result
            backend (str): Default search backend
            region (str): Default search region
            **kwargs: Extra keyword arguments forwarded to SearchDDGS
        """
        # One search object, shared by every tool in this toolkit.
        shared_search = SearchDDGS(
            name="DDGSSearch",
            num_search_pages=num_search_pages,
            max_content_words=max_content_words,
            backend=backend,
            region=region,
            **kwargs
        )

        # The parent is initialised first; the attribute is attached afterwards.
        super().__init__(name=name, tools=[DDGSSearchTool(search_ddgs=shared_search)])

        # Keep a handle on the search object for direct access.
        self.search_ddgs = shared_search
class PERTSearchTool(Tool):
    """
    Look up the top-scoring regulators of a gene in a pre-computed
    gene regulatory network (GRN) table.

    The table is a CSV loaded once at construction with its first column as
    the index. Lookups select the column named after the target gene, so
    columns are expected to be target genes and index labels regulator genes
    (NOTE(review): inferred from the report header below -- confirm against
    the CSV producer).
    """

    name: str = "pert_search"
    description: str = "Search gene regulatory network and return the gene-gene pair"
    inputs: Dict[str, Dict[str, str]] = {
        "source_gene_name": {
            "type": "string",
            "description": "name of perturbed gene"
        },
        "target_gene_name": {
            "type": "string",
            "description": "name of targeted gene"
        },
        "cell_line": {
            "type": "string",
            "description": "Name of selected cell line"
        },
    }
    required: Optional[List[str]] = ["source_gene_name", "target_gene_name", "cell_line"]

    def __init__(self, sourcekey='k562', toplist=20, grn_path=None):
        """
        Load the GRN table.

        Args:
            sourcekey (str): Cell-line key; selects the default CSV file and is
                quoted in the report text.
            toplist (int): Number of highest-scoring rows to report per lookup.
            grn_path (str, optional): Path to the GRN CSV file. When None, the
                original per-``sourcekey`` default location is used, so existing
                callers are unaffected.

        Raises:
            Whatever ``pandas.read_csv`` raises when the file cannot be read.
        """
        super().__init__()
        self.toplist = toplist
        self.sourcekey = sourcekey
        if grn_path is None:
            # Historical default location, kept for backward compatibility.
            grn_path = f"/gpfs/radev/home/tl688/pitl688/selfevolve/EvoAgentX/examples/pertqa/{sourcekey}_processed_grn.csv"
        self.filelist = pd.read_csv(grn_path, index_col=0)

    def __call__(self, source_gene_name: str, target_gene_name: str, cell_line: str) -> Dict[str, Any]:
        """
        Report the ``toplist`` highest-scoring entries for ``target_gene_name``.

        Only ``target_gene_name`` drives the lookup. ``source_gene_name`` and
        ``cell_line`` are echoed to stdout but do not filter the result; the
        cell line named in the report is the ``sourcekey`` given at construction.

        Args:
            source_gene_name (str): Name of the perturbed gene (echoed only).
            target_gene_name (str): Column of the GRN table to rank.
            cell_line (str): Requested cell line (echoed only).

        Returns:
            Dict[str, Any]: ``{"results": <report text>}`` on success, or
            ``{"results": [], "error": <message>}`` when the lookup fails
            (for example, the gene is not a column of the table).
        """
        # Diagnostic echo of the request.
        print(source_gene_name, target_gene_name, cell_line)
        try:
            # Rank the target gene's column from strongest to weakest score.
            ranked = self.filelist.loc[:, target_gene_name].sort_values(ascending=False)
            top_hits = ranked.iloc[0:self.toplist]

            header = (
                f'''The detected gene list and gene regulatory strength in cell line {self.sourcekey} is: '''
                '''RegulatorGeneName TargetGeneName Score\n'''
            )
            rows = "".join(
                f'''{regulator} {target_gene_name} {score}\n'''
                for regulator, score in zip(top_hits.index, top_hits.values)
            )
            searchinfo = header + rows

            # Diagnostic echo of the report.
            print(searchinfo)
            return {"results": searchinfo}
        except Exception as e:
            return {"results": [], "error": f"Error executing Perturbation searching: {str(e)}"}
class PertToolkit(Toolkit):
    """Toolkit that exposes a single PERTSearchTool."""

    def __init__(
        self,
        name: str = "PertToolkit",
        sourcekey = "k562",
        toplist = 20,
        **kwargs
    ):
        """
        Create the perturbation search tool and register it with the toolkit.

        Args:
            name (str): Name of the toolkit
            sourcekey: Cell-line key passed to PERTSearchTool
            toplist: Number of top entries passed to PERTSearchTool
            **kwargs: Accepted for signature compatibility; not used here
        """
        # The only tool in this toolkit.
        pert_tool = PERTSearchTool(sourcekey=sourcekey, toplist=toplist)

        # Hand the tool list to the parent toolkit.
        super().__init__(name=name, tools=[pert_tool])