YuITC
Add application file
c8e875f
import arxiv
import urllib.request
from pathlib import Path
from dateutil import parser
from typing import List, Dict, Any, Optional
from utils.setup_logger import setup_logger
from src.config import TEMP_DIR
# Configure logging
# Module-level logger; used by ArxivFetcher for query/info and error reporting.
logger = setup_logger(__name__)
class ArxivFetcher:
    """Thin wrapper around the arXiv API client for searching and downloading papers."""

    def __init__(self):
        # One shared client, reused across requests.
        self.client = arxiv.Client()

    def fetch_papers(self,
                     subject_tags: Optional[List[str]] = None,
                     start_date: Optional[str] = None,
                     end_date: Optional[str] = None,
                     max_results: int = 10,
                     query: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Fetches papers from arXiv based on subject tags, date range and free text.

        Args:
            subject_tags (list): List of subject tags to filter papers by;
                defaults to all CS categories ("cat:cs.*") when empty/None.
            start_date (str): Inclusive start date in YYYY-MM-DD format.
            end_date (str): Inclusive end date in YYYY-MM-DD format.
            max_results (int): Maximum number of results to return.
            query (str): Search query; every word must match title or abstract.

        Returns:
            list: List of paper dictionaries with metadata. Empty list if the
            fetch fails for any reason (error is logged, not raised).
        """
        # Category filter: default to every CS category when no tags are given.
        if not subject_tags:
            filter_query = 'cat:cs.*'
        else:
            filter_query = ' OR '.join(f"cat:{tag}" for tag in subject_tags)
        # Text filter: each word must appear in the title (ti) or abstract (abs).
        if not query:
            search_query = ''
        else:
            search_query = ' AND (' + ' AND '.join(f"(ti:{q} OR abs:{q})" for q in query.split()) + ')'
        final_query = f"({filter_query}){search_query}"
        logger.info(f"Fetching papers with query: {final_query}")

        search = arxiv.Search(
            query=final_query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )
        try:
            results = list(self.client.results(search))
            # The search API sorts by date but cannot bound it, so filter client-side.
            if start_date or end_date:
                start_date_obj = parser.parse(start_date).date() if start_date else None
                end_date_obj = parser.parse(end_date).date() if end_date else None
                results = [
                    paper for paper in results
                    if (start_date_obj is None or paper.published.date() >= start_date_obj)
                    and (end_date_obj is None or paper.published.date() <= end_date_obj)
                ]
            # Flatten the arxiv.Result objects into plain dicts for callers.
            return [{
                'title': paper.title,
                'authors': [author.name for author in paper.authors],
                'published': paper.published.strftime('%Y-%m-%d'),
                'updated': paper.updated.strftime('%Y-%m-%d') if paper.updated else None,
                'arxiv_id': paper.get_short_id(),
                'pdf_url': paper.pdf_url,
                'entry_id': paper.entry_id,
                'abstract': paper.summary,
                'categories': paper.categories,
                'primary_category': paper.primary_category
            } for paper in results]
        except Exception as e:
            # FIX: was print(); route through the module logger so failures are recorded.
            logger.error("Error fetching papers: %s", e)
            return []

    def download_paper(self, paper_id: str) -> Optional[Path]:
        """
        Downloads a paper's PDF from arXiv into TEMP_DIR, reusing cached copies.

        Args:
            paper_id (str): The arXiv ID of the paper (old-style IDs may contain '/').

        Returns:
            Optional[Path]: Path to the downloaded PDF file, or None if download failed.
        """
        # Old-style IDs contain '/', which is not valid in a filename.
        filename = f"{paper_id.replace('/', '_')}.pdf"
        filepath = TEMP_DIR / filename
        try:
            # Reuse a previously downloaded copy.
            if filepath.exists():
                return filepath
            # FIX: make sure the target directory exists before writing into it.
            TEMP_DIR.mkdir(parents=True, exist_ok=True)
            pdf_url = f"https://arxiv.org/pdf/{paper_id}"
            urllib.request.urlretrieve(pdf_url, filepath)
            return filepath
        except Exception as e:
            # FIX: was print(); log the failure, and remove any partially written
            # file so a later call's exists() fast-path cannot return a truncated PDF.
            logger.error("Error downloading paper %s: %s", paper_id, e)
            if filepath.exists():
                filepath.unlink()
            return None