Spaces:

coderpotter
/

research-assistant

Runtime error

App Files Files Community

research-assistant / src /research_assistant /components /arxiv_search_api.py

coderpotter

Upload folder using huggingface_hub

7b2e5db verified about 1 year ago

raw

history blame contribute delete

3.67 kB

	from datetime import datetime
	from pathlib import Path

	import requests
	import xmltodict

	from research_assistant.app_logging import app_logger
	from research_assistant.constants import ARXIV_API_ACCESS_POINT
	from research_assistant.entity import ArticleSearchConfig


	class ArxivApiWrap:
	    """Query the arXiv search API using the settings held in a search config.

	    The config object is read for ``search_terms``, ``date_range.start_date``,
	    ``date_range.end_date``, ``num_results``, ``sort_by`` and ``sort_order``.
	    """

	    # Seconds to wait on the network before giving up; without a timeout
	    # ``requests`` can block indefinitely on a stalled connection.
	    REQUEST_TIMEOUT_SECONDS = 30

	    # Directory (relative to the working directory) that receives downloads.
	    DOWNLOAD_DIR = "data"

	    def __init__(self, config: "ArticleSearchConfig"):
	        """Store the search configuration used by every request."""
	        self.config = config

	    def convert_link_to_pdflink(self, link):
	        """Turn an arXiv abstract link (``.../abs/<id>``) into its PDF link."""
	        return link.replace("/abs/", "/pdf/") + ".pdf"

	    def convert_date(self, date):
	        """Reformat a ``YYYY-MM-DD`` string as ``YYYYMMDD``.

	        Raises:
	            ValueError: If ``date`` is not in ``YYYY-MM-DD`` form.
	        """
	        return datetime.strptime(date, "%Y-%m-%d").strftime("%Y%m%d")

	    @staticmethod
	    def _as_list(value):
	        """Normalise an xmltodict node to a list (``None`` becomes ``[]``)."""
	        if value is None:
	            return []
	        if isinstance(value, list):
	            return value
	        return [value]

	    @staticmethod
	    def _pdf_file_stem(pdf_url):
	        """Return the last path segment of ``pdf_url`` without a ``.pdf`` suffix."""
	        stem = pdf_url.rstrip("/").rsplit("/", 1)[-1]
	        if stem.endswith(".pdf"):
	            stem = stem[: -len(".pdf")]
	        return stem

	    def get_arxiv_api_response(self):
	        """Send the configured search to the arXiv API.

	        The query ANDs together one ``all:`` clause per configured search
	        term and, when a start date is configured, a ``submittedDate`` range
	        built from the configured start and end dates.

	        Returns:
	            requests.Response: The HTTP response returned by the arXiv API.
	        """
	        keyword_query = " AND all:".join(
	            [f"'{kw}'" for kw in self.config.search_terms]
	        )
	        date_range = self.config.date_range
	        if date_range.start_date:
	            start = self.convert_date(date_range.start_date)
	            end = self.convert_date(date_range.end_date)
	            query = f" all:{keyword_query} AND submittedDate:[{start} TO {end}]"
	        else:
	            query = f" all:{keyword_query}"
	        params = {
	            "search_query": query,
	            "start": 0,  # Offset of the first result to return
	            "max_results": self.config.num_results,
	            "sortBy": self.config.sort_by,  # Field to sort on, from config
	            "sortOrder": self.config.sort_order,  # Sort direction, from config
	        }
	        return requests.get(
	            ARXIV_API_ACCESS_POINT,
	            params=params,
	            timeout=self.REQUEST_TIMEOUT_SECONDS,
	        )

	    def get_article_search_result(self):
	        """Fetch search results, log each article, and return the article links.

	        The XML response is parsed with ``xmltodict``; for every entry the
	        title, authors, abstract, page link, PDF link and paper id are logged.

	        Returns:
	            list of str: The ``id`` link of each entry. Empty when the request
	            did not return HTTP 200 or the feed contains no entries.
	        """
	        response = self.get_arxiv_api_response()
	        article_links = []
	        if response.status_code != 200:
	            app_logger.info(f"Failed to retrieve papers: {response.status_code}")
	            return article_links

	        # arXiv answers with an Atom XML feed.
	        data = xmltodict.parse(response.content)
	        feed = data.get("feed") or {}
	        # xmltodict yields a dict (not a list) for a single child element and
	        # omits the key entirely when there are none, so normalise both the
	        # entries and each entry's authors before iterating.
	        for entry in self._as_list(feed.get("entry")):
	            title = entry["title"]
	            summary = entry["summary"]
	            link = entry["id"]
	            authors = [
	                author["name"] for author in self._as_list(entry.get("author"))
	            ]
	            pdf_link = self.convert_link_to_pdflink(link)
	            paper_id = link.split("/")[-1]
	            app_logger.info(
	                f"Title: {title}\n Authors: {authors} \n,Abstract: {summary}\n"
	                f" Page Link: {link}\n PDF Link: {pdf_link}\n"
	                f" Paper Id: {paper_id}\n {'-' * 80}"
	            )
	            article_links.append(link)
	        return article_links

	    def download_pdf(self, pdf_url):
	        """Download ``pdf_url`` into ``DOWNLOAD_DIR`` as ``<paper id>.pdf``.

	        The file name is taken from the last path segment of the URL. Nothing
	        is written when the server does not answer with HTTP 200.
	        """
	        response = requests.get(pdf_url, timeout=self.REQUEST_TIMEOUT_SECONDS)
	        if response.status_code != 200:
	            # Avoid saving an error page under a .pdf name.
	            app_logger.info(
	                f"Failed to download {pdf_url}: {response.status_code}"
	            )
	            return
	        target_dir = Path(self.DOWNLOAD_DIR)
	        target_dir.mkdir(parents=True, exist_ok=True)
	        target = target_dir / f"{self._pdf_file_stem(pdf_url)}.pdf"
	        target.write_bytes(response.content)
	        app_logger.info(f"Downloaded: {target.name}")