# evoagentx/tools/request_arxiv.py
import xml.etree.ElementTree as ET
from typing import Dict, Any, Optional, List
import re
from .tool import Tool, Toolkit
from .storage_handler import FileStorageHandler
from .request_base import RequestBase


class ArxivBase(RequestBase):
    """
    Extended RequestBase class for arXiv API interactions.

    Provides specialized methods for working with arXiv's Atom XML API.
    """

    def __init__(self, **kwargs):
super().__init__(**kwargs)
self.base_url = "http://export.arxiv.org/api/query"
self.atom_namespace = "http://www.w3.org/2005/Atom"
self.arxiv_namespace = "http://arxiv.org/schemas/atom"
self.opensearch_namespace = "http://a9.com/-/spec/opensearch/1.1/"

    def search_arxiv(self, search_query: Optional[str] = None, id_list: Optional[List[str]] = None,
                     start: int = 0, max_results: int = 10) -> Dict[str, Any]:
        """
        Search arXiv using the API and return structured results.

        Args:
            search_query: Search query string (e.g., "all:electron", "cat:cs.AI")
            id_list: List of arXiv IDs to retrieve
            start: Starting index for results
            max_results: Maximum number of results to return

        Returns:
Dictionary containing parsed search results
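
        Example:
            A minimal usage sketch, assuming network access to the arXiv API:

                base = ArxivBase()
                results = base.search_arxiv(search_query="cat:cs.AI", max_results=5)
                for paper in results.get("papers", []):
                    print(paper["title"], paper["url"])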
"""
# Build query parameters
params = {
'start': start,
'max_results': max_results
}
if search_query:
params['search_query'] = search_query
if id_list:
params['id_list'] = ','.join(id_list)
try:
# Make the HTTP request
response = self.request(
url=self.base_url,
method='GET',
params=params
)
# Parse the XML response
return self._parse_atom_response(response.text)
except Exception as e:
return {
'success': False,
'error': str(e),
'query': search_query or str(id_list)
}

    def _parse_atom_response(self, xml_content: str) -> Dict[str, Any]:
        """
        Parse the Atom XML response from the arXiv API.

        Args:
            xml_content: Raw XML content from the API response

        Returns:
            Dictionary with parsed paper information
        """
try:
# Parse XML
root = ET.fromstring(xml_content)
            # Namespace prefixes used by find()/findall() below
namespaces = {
'atom': self.atom_namespace,
'arxiv': self.arxiv_namespace,
'opensearch': self.opensearch_namespace
}
# Extract metadata
total_results = root.find('.//opensearch:totalResults', namespaces)
start_index = root.find('.//opensearch:startIndex', namespaces)
items_per_page = root.find('.//opensearch:itemsPerPage', namespaces)
result = {
'success': True,
'total_results': int(total_results.text) if total_results is not None else 0,
'start_index': int(start_index.text) if start_index is not None else 0,
'items_per_page': int(items_per_page.text) if items_per_page is not None else 0,
'papers': []
}
# Extract paper entries
entries = root.findall('.//atom:entry', namespaces)
for entry in entries:
paper = self._parse_paper_entry(entry, namespaces)
result['papers'].append(paper)
return result
except ET.ParseError as e:
return {
'success': False,
'error': f'XML parsing error: {str(e)}',
'raw_content': xml_content[:500] + '...' if len(xml_content) > 500 else xml_content
}
except Exception as e:
return {
'success': False,
'error': str(e)
}

    def _parse_paper_entry(self, entry: ET.Element, namespaces: Dict[str, str]) -> Dict[str, Any]:
        """
        Parse a single paper entry from the XML.

        Args:
            entry: XML element for a paper entry
            namespaces: Namespace mappings

        Returns:
            Dictionary with paper information
        """
paper = {}
# Basic information
paper['id'] = self._get_text(entry, 'atom:id', namespaces)
paper['title'] = self._get_text(entry, 'atom:title', namespaces, clean=True)
paper['summary'] = self._get_text(entry, 'atom:summary', namespaces, clean=True)
paper['published'] = self._get_text(entry, 'atom:published', namespaces)
paper['updated'] = self._get_text(entry, 'atom:updated', namespaces)
        # Extract the arXiv ID from the full ID URL (e.g., http://arxiv.org/abs/1706.03762v5)
        # Note: pre-2007 IDs such as math.GT/0309136 keep only the numeric part here
        if paper['id']:
            paper['arxiv_id'] = paper['id'].split('/')[-1]
# Authors
authors = entry.findall('.//atom:author', namespaces)
paper['authors'] = []
for author in authors:
name = self._get_text(author, 'atom:name', namespaces)
if name:
paper['authors'].append(name)
# Categories
categories = entry.findall('.//atom:category', namespaces)
paper['categories'] = []
for category in categories:
term = category.get('term')
if term:
paper['categories'].append(term)
# Primary category
primary_cat = entry.find('.//arxiv:primary_category', namespaces)
if primary_cat is not None:
paper['primary_category'] = primary_cat.get('term')
# Links (PDF, HTML)
links = entry.findall('.//atom:link', namespaces)
paper['links'] = {}
for link in links:
rel = link.get('rel')
href = link.get('href')
title = link.get('title')
if rel == 'alternate':
paper['links']['html'] = href
elif title == 'pdf':
paper['links']['pdf'] = href
# arXiv-specific fields
paper['comment'] = self._get_text(entry, 'arxiv:comment', namespaces)
paper['journal_ref'] = self._get_text(entry, 'arxiv:journal_ref', namespaces)
paper['doi'] = self._get_text(entry, 'arxiv:doi', namespaces)
        # Normalize field names for a friendlier API surface:
        # use the HTML link as the main URL, falling back to one built from arxiv_id
if paper.get('links', {}).get('html'):
paper['url'] = paper['links']['html']
elif paper.get('arxiv_id'):
paper['url'] = f"https://arxiv.org/abs/{paper['arxiv_id']}"
else:
paper['url'] = ''
paper['published_date'] = paper.pop('published', '')
paper['updated_date'] = paper.pop('updated', '')
# Remove the old id field since we're replacing it with url
paper.pop('id', None)
return paper

    def _get_text(self, element: ET.Element, xpath: str, namespaces: Dict[str, str], clean: bool = False) -> str:
        """
        Helper method to extract text from an XML element.

        Args:
            element: XML element to search in
            xpath: XPath expression
            namespaces: Namespace mappings
            clean: Whether to collapse whitespace and newlines

        Returns:
            Text content, or an empty string if the element is missing
        """
found = element.find(xpath, namespaces)
if found is not None:
text = found.text or ''
if clean:
# Clean up whitespace and newlines
text = re.sub(r'\s+', ' ', text.strip())
return text
return ''

    def download_pdf(self, pdf_url: str, save_path: str, storage_handler: Optional[FileStorageHandler] = None) -> Dict[str, Any]:
        """
        Download a PDF from arXiv.

        Args:
            pdf_url: URL of the PDF to download
            save_path: Local path to save the PDF
            storage_handler: Storage handler for file operations; required to persist the file

        Returns:
Dictionary with download status
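
        Example:
            A minimal sketch, assuming a LocalStorageHandler that writes to local disk:

                base = ArxivBase()
                handler = LocalStorageHandler()
                status = base.download_pdf(
                    "https://arxiv.org/pdf/1706.03762", "attention.pdf", handler
                )
                print(status["success"], status.get("size"))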
"""
        if storage_handler is None:
            return {
                'success': False,
                'error': 'A storage_handler is required to save the PDF',
                'url': pdf_url
            }
        try:
            response = self.request(url=pdf_url, method='GET')
            # Get the binary PDF content
            pdf_content = response.content
            # Save the PDF content using the storage handler
            result = storage_handler.save(save_path, pdf_content)
if result["success"]:
return {
'success': True,
'file_path': save_path,
'size': len(pdf_content),
'url': pdf_url,
'storage_handler': type(storage_handler).__name__
}
else:
return {
'success': False,
'error': f"Failed to save PDF: {result.get('error', 'Unknown error')}",
'url': pdf_url,
'save_path': save_path
}
except Exception as e:
return {
'success': False,
'error': str(e),
'url': pdf_url
}


class ArxivSearchTool(Tool):
"""Tool for searching papers on arXiv."""
name: str = "arxiv_search"
description: str = "Search for academic papers on arXiv using queries or paper IDs"
inputs: Dict[str, Dict[str, str]] = {
"search_query": {
"type": "string",
"description": "Search query (e.g., 'all:machine learning', 'cat:cs.AI', 'au:smith')"
},
"id_list": {
"type": "array",
"description": "List of arXiv IDs to retrieve (e.g., ['1706.03762', '1810.04805'])"
},
"max_results": {
"type": "integer",
"description": "Maximum number of results to return (default: 10)"
},
"start": {
"type": "integer",
"description": "Starting index for pagination (default: 0)"
}
}
required: Optional[List[str]] = []

    def __init__(self, arxiv_base: Optional[ArxivBase] = None):
        super().__init__()
        # Fall back to a fresh ArxivBase so the tool is usable standalone
        self.arxiv_base = arxiv_base if arxiv_base is not None else ArxivBase()

    def __call__(self, search_query: Optional[str] = None, id_list: Optional[List[str]] = None,
                 max_results: int = 10, start: int = 0) -> Dict[str, Any]:
        """
        Search arXiv for papers.

        Args:
            search_query: Search query string
            id_list: List of arXiv IDs
            max_results: Maximum results to return
            start: Starting index for pagination

        Returns:
Dictionary with search results
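
        Example:
            A minimal sketch of invoking the tool directly (hypothetical query):

                tool = ArxivSearchTool(arxiv_base=ArxivBase())
                results = tool(search_query="ti:transformer", max_results=3)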
"""
if not search_query and not id_list:
return {
'success': False,
'error': 'Either search_query or id_list must be provided'
}
return self.arxiv_base.search_arxiv(
search_query=search_query,
id_list=id_list,
start=start,
max_results=max_results
)


class ArxivDownloadTool(Tool):
"""Tool for downloading papers from arXiv."""
name: str = "arxiv_download"
description: str = "Download PDF papers from arXiv"
inputs: Dict[str, Dict[str, str]] = {
"pdf_url": {
"type": "string",
"description": "URL of the PDF to download"
},
"save_path": {
"type": "string",
"description": "Local path to save the PDF file"
}
}
required: Optional[List[str]] = ["pdf_url", "save_path"]

    def __init__(self, arxiv_base: Optional[ArxivBase] = None, storage_handler: Optional[FileStorageHandler] = None):
        super().__init__()
        self.arxiv_base = arxiv_base if arxiv_base is not None else ArxivBase()
        self.storage_handler = storage_handler

    def __call__(self, pdf_url: str, save_path: str) -> Dict[str, Any]:
        """
        Download a PDF from arXiv.

        Args:
            pdf_url: URL of the PDF
            save_path: Where to save the file

        Returns:
Dictionary with download status
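
        Example:
            A minimal sketch (hypothetical URL and save path):

                tool = ArxivDownloadTool(storage_handler=LocalStorageHandler())
                status = tool("https://arxiv.org/pdf/1706.03762", "attention.pdf")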
"""
return self.arxiv_base.download_pdf(pdf_url, save_path, self.storage_handler)


class ArxivToolkit(Toolkit):
    """Toolkit bundling the arXiv search and download tools around a shared ArxivBase."""

    def __init__(self, name: str = "ArxivToolkit", storage_handler: Optional[FileStorageHandler] = None):
# Initialize storage handler if not provided
if storage_handler is None:
from .storage_handler import LocalStorageHandler
storage_handler = LocalStorageHandler()
# Create the shared arxiv base instance
arxiv_base = ArxivBase()
# Create tools with the shared base and storage handler
tools = [
ArxivSearchTool(arxiv_base=arxiv_base),
ArxivDownloadTool(arxiv_base=arxiv_base, storage_handler=storage_handler)
]
# Initialize parent with tools
super().__init__(name=name, tools=tools)
# Store arxiv_base as instance variable
self.arxiv_base = arxiv_base
self.storage_handler = storage_handler
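

if __name__ == "__main__":
    # Minimal smoke test; a sketch that assumes network access to the arXiv API
    # and a default LocalStorageHandler writing to the working directory.
    toolkit = ArxivToolkit()
    search = toolkit.arxiv_base.search_arxiv(search_query="cat:cs.AI", max_results=2)
    if search.get("success"):
        for paper in search["papers"]:
            print(paper["title"], "->", paper["url"])
    else:
        print("search failed:", search.get("error"))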