import re
import xml.etree.ElementTree as ET
from typing import Any, Dict, List, Optional

from .tool import Tool, Toolkit
from .storage_handler import FileStorageHandler
from .request_base import RequestBase


class ArxivBase(RequestBase):
    """
    Extended RequestBase class for arXiv API interactions.

    Provides specialized methods for working with arXiv's Atom XML API:
    querying the export endpoint, parsing the Atom feed into plain dicts,
    and downloading paper PDFs through a storage handler.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # arXiv export API endpoint and the XML namespaces that appear in
        # its Atom responses (Atom core, arXiv extensions, OpenSearch).
        self.base_url = "http://export.arxiv.org/api/query"
        self.atom_namespace = "http://www.w3.org/2005/Atom"
        self.arxiv_namespace = "http://arxiv.org/schemas/atom"
        self.opensearch_namespace = "http://a9.com/-/spec/opensearch/1.1/"

    def search_arxiv(self, search_query: Optional[str] = None,
                     id_list: Optional[List[str]] = None,
                     start: int = 0,
                     max_results: int = 10) -> Dict[str, Any]:
        """
        Search arXiv using the API and return structured results.

        Args:
            search_query: Search query string (e.g., "all:electron", "cat:cs.AI")
            id_list: List of arXiv IDs to retrieve
            start: Starting index for results
            max_results: Maximum number of results to return

        Returns:
            Dictionary containing parsed search results; on failure a dict
            with 'success': False and an 'error' message (never raises).
        """
        # Build query parameters; search_query and id_list are optional and
        # may be combined (the API treats them as an AND filter).
        params: Dict[str, Any] = {
            'start': start,
            'max_results': max_results
        }

        if search_query:
            params['search_query'] = search_query
        if id_list:
            params['id_list'] = ','.join(id_list)

        try:
            # Make the HTTP request via the RequestBase helper.
            response = self.request(
                url=self.base_url,
                method='GET',
                params=params
            )

            # Parse the Atom XML response into plain dictionaries.
            return self._parse_atom_response(response.text)

        except Exception as e:
            # Surface failures as a structured error payload instead of raising,
            # consistent with the rest of this module's error handling.
            return {
                'success': False,
                'error': str(e),
                'query': search_query or str(id_list)
            }

    def _parse_atom_response(self, xml_content: str) -> Dict[str, Any]:
        """
        Parse the Atom XML response from arXiv API.

        Args:
            xml_content: Raw XML content from the API response

        Returns:
            Dictionary with parsed paper information; on a malformed feed,
            a failure dict carrying the first 500 chars of the raw content.
        """
        try:
            # Parse XML
            root = ET.fromstring(xml_content)

            # Namespace prefixes used by find()/findall() below.
            namespaces = {
                'atom': self.atom_namespace,
                'arxiv': self.arxiv_namespace,
                'opensearch': self.opensearch_namespace
            }

            # Extract OpenSearch pagination metadata (may be absent).
            total_results = root.find('.//opensearch:totalResults', namespaces)
            start_index = root.find('.//opensearch:startIndex', namespaces)
            items_per_page = root.find('.//opensearch:itemsPerPage', namespaces)

            result: Dict[str, Any] = {
                'success': True,
                'total_results': int(total_results.text) if total_results is not None else 0,
                'start_index': int(start_index.text) if start_index is not None else 0,
                'items_per_page': int(items_per_page.text) if items_per_page is not None else 0,
                'papers': []
            }

            # Each <entry> element is one paper.
            entries = root.findall('.//atom:entry', namespaces)
            for entry in entries:
                paper = self._parse_paper_entry(entry, namespaces)
                result['papers'].append(paper)

            return result

        except ET.ParseError as e:
            # Malformed XML: include a truncated sample to aid debugging.
            return {
                'success': False,
                'error': f'XML parsing error: {str(e)}',
                'raw_content': xml_content[:500] + '...' if len(xml_content) > 500 else xml_content
            }
        except Exception as e:
            return {
                'success': False,
                'error': str(e)
            }

    def _parse_paper_entry(self, entry: ET.Element,
                           namespaces: Dict[str, str]) -> Dict[str, Any]:
        """
        Parse a single paper entry from the XML.

        Args:
            entry: XML element for a paper entry
            namespaces: Namespace mappings

        Returns:
            Dictionary with paper information (title, authors, categories,
            links, dates, and arXiv-specific fields).
        """
        paper: Dict[str, Any] = {}

        # Basic information; title/summary get whitespace normalization.
        paper['id'] = self._get_text(entry, 'atom:id', namespaces)
        paper['title'] = self._get_text(entry, 'atom:title', namespaces, clean=True)
        paper['summary'] = self._get_text(entry, 'atom:summary', namespaces, clean=True)
        paper['published'] = self._get_text(entry, 'atom:published', namespaces)
        paper['updated'] = self._get_text(entry, 'atom:updated', namespaces)

        # Extract arXiv ID from the full ID URL.
        # NOTE(review): old-style IDs (e.g. "math.GT/0309136v1") contain a
        # slash, so split('/')[-1] drops the category prefix — confirm this
        # is acceptable for the callers of 'arxiv_id'.
        if paper['id']:
            paper['arxiv_id'] = paper['id'].split('/')[-1]

        # Authors: one <author><name> per author.
        authors = entry.findall('.//atom:author', namespaces)
        paper['authors'] = []
        for author in authors:
            name = self._get_text(author, 'atom:name', namespaces)
            if name:
                paper['authors'].append(name)

        # Categories come from the 'term' attribute of <category> elements.
        categories = entry.findall('.//atom:category', namespaces)
        paper['categories'] = []
        for category in categories:
            term = category.get('term')
            if term:
                paper['categories'].append(term)

        # Primary category (arXiv extension element).
        primary_cat = entry.find('.//arxiv:primary_category', namespaces)
        if primary_cat is not None:
            paper['primary_category'] = primary_cat.get('term')

        # Links: rel="alternate" is the abstract page, title="pdf" the PDF.
        links = entry.findall('.//atom:link', namespaces)
        paper['links'] = {}
        for link in links:
            rel = link.get('rel')
            href = link.get('href')
            title = link.get('title')

            if rel == 'alternate':
                paper['links']['html'] = href
            elif title == 'pdf':
                paper['links']['pdf'] = href

        # arXiv-specific fields
        paper['comment'] = self._get_text(entry, 'arxiv:comment', namespaces)
        paper['journal_ref'] = self._get_text(entry, 'arxiv:journal_ref', namespaces)
        paper['doi'] = self._get_text(entry, 'arxiv:doi', namespaces)

        # Map field names for better API.
        # Use the HTML link as the main URL, fallback to constructing from arxiv_id.
        if paper.get('links', {}).get('html'):
            paper['url'] = paper['links']['html']
        elif paper.get('arxiv_id'):
            paper['url'] = f"https://arxiv.org/abs/{paper['arxiv_id']}"
        else:
            paper['url'] = ''

        paper['published_date'] = paper.pop('published', '')
        paper['updated_date'] = paper.pop('updated', '')

        # Remove the old id field since we're replacing it with url.
        paper.pop('id', None)

        return paper

    def _get_text(self, element: ET.Element, xpath: str,
                  namespaces: Dict[str, str], clean: bool = False) -> str:
        """
        Helper method to extract text from XML elements.

        Args:
            element: XML element to search in
            xpath: XPath expression
            namespaces: Namespace mappings
            clean: Whether to clean whitespace

        Returns:
            Text content or empty string
        """
        found = element.find(xpath, namespaces)
        if found is not None:
            text = found.text or ''
            if clean:
                # Collapse runs of whitespace/newlines into single spaces.
                text = re.sub(r'\s+', ' ', text.strip())
            return text
        return ''

    def download_pdf(self, pdf_url: str, save_path: str,
                     storage_handler: Optional[FileStorageHandler] = None) -> Dict[str, Any]:
        """
        Download a PDF from arXiv.

        Args:
            pdf_url: URL of the PDF to download
            save_path: Local path to save the PDF
            storage_handler: Storage handler for file operations (required;
                a clear error dict is returned when omitted)

        Returns:
            Dictionary with download status
        """
        # Fail fast with a clear message: the parameter defaults to None but
        # is required — previously this fell through to an opaque
        # "'NoneType' object has no attribute 'save'" error.
        if storage_handler is None:
            return {
                'success': False,
                'error': 'No storage handler provided',
                'url': pdf_url
            }

        try:
            response = self.request(url=pdf_url, method='GET')

            # Get the PDF content as raw bytes.
            pdf_content = response.content

            # Save the PDF content using the storage handler.
            result = storage_handler.save(save_path, pdf_content)

            if result["success"]:
                return {
                    'success': True,
                    'file_path': save_path,
                    'size': len(pdf_content),
                    'url': pdf_url,
                    'storage_handler': type(storage_handler).__name__
                }
            else:
                return {
                    'success': False,
                    'error': f"Failed to save PDF: {result.get('error', 'Unknown error')}",
                    'url': pdf_url,
                    'save_path': save_path
                }

        except Exception as e:
            return {
                'success': False,
                'error': str(e),
                'url': pdf_url
            }


class ArxivSearchTool(Tool):
    """Tool for searching papers on arXiv."""

    name: str = "arxiv_search"
    description: str = "Search for academic papers on arXiv using queries or paper IDs"
    inputs: Dict[str, Dict[str, str]] = {
        "search_query": {
            "type": "string",
            "description": "Search query (e.g., 'all:machine learning', 'cat:cs.AI', 'au:smith')"
        },
        "id_list": {
            "type": "array",
            "description": "List of arXiv IDs to retrieve (e.g., ['1706.03762', '1810.04805'])"
        },
        "max_results": {
            "type": "integer",
            "description": "Maximum number of results to return (default: 10)"
        },
        "start": {
            "type": "integer",
            "description": "Starting index for pagination (default: 0)"
        }
    }
    # No individual field is required: either search_query or id_list must be
    # given, which is validated at call time below.
    required: Optional[List[str]] = []

    def __init__(self, arxiv_base: Optional[ArxivBase] = None):
        super().__init__()
        self.arxiv_base = arxiv_base

    def __call__(self, search_query: Optional[str] = None,
                 id_list: Optional[List[str]] = None,
                 max_results: int = 10, start: int = 0) -> Dict[str, Any]:
        """
        Search arXiv for papers.

        Args:
            search_query: Search query string
            id_list: List of arXiv IDs
            max_results: Maximum results to return
            start: Starting index for pagination

        Returns:
            Dictionary with search results
        """
        # The arXiv API needs at least one selector.
        if not search_query and not id_list:
            return {
                'success': False,
                'error': 'Either search_query or id_list must be provided'
            }

        return self.arxiv_base.search_arxiv(
            search_query=search_query,
            id_list=id_list,
            start=start,
            max_results=max_results
        )


class ArxivDownloadTool(Tool):
    """Tool for downloading papers from arXiv."""

    name: str = "arxiv_download"
    description: str = "Download PDF papers from arXiv"
    inputs: Dict[str, Dict[str, str]] = {
        "pdf_url": {
            "type": "string",
            "description": "URL of the PDF to download"
        },
        "save_path": {
            "type": "string",
            "description": "Local path to save the PDF file"
        }
    }
    required: Optional[List[str]] = ["pdf_url", "save_path"]

    def __init__(self, arxiv_base: Optional[ArxivBase] = None,
                 storage_handler: Optional[FileStorageHandler] = None):
        super().__init__()
        self.arxiv_base = arxiv_base
        self.storage_handler = storage_handler

    def __call__(self, pdf_url: str, save_path: str) -> Dict[str, Any]:
        """
        Download a PDF from arXiv.

        Args:
            pdf_url: URL of the PDF
            save_path: Where to save the file

        Returns:
            Dictionary with download status
        """
        return self.arxiv_base.download_pdf(pdf_url, save_path, self.storage_handler)


class ArxivToolkit(Toolkit):
    """Toolkit bundling the arXiv search and download tools around one
    shared ArxivBase instance and storage handler."""

    def __init__(self, name: str = "ArxivToolkit",
                 storage_handler: Optional[FileStorageHandler] = None):
        # Initialize storage handler if not provided; imported lazily to
        # avoid a hard dependency when a custom handler is injected.
        if storage_handler is None:
            from .storage_handler import LocalStorageHandler
            storage_handler = LocalStorageHandler()

        # Create the shared arxiv base instance
        arxiv_base = ArxivBase()

        # Create tools with the shared base and storage handler
        tools = [
            ArxivSearchTool(arxiv_base=arxiv_base),
            ArxivDownloadTool(arxiv_base=arxiv_base, storage_handler=storage_handler)
        ]

        # Initialize parent with tools
        super().__init__(name=name, tools=tools)

        # Keep references so callers can reach the shared collaborators.
        self.arxiv_base = arxiv_base
        self.storage_handler = storage_handler