Spaces:

thinkwee
/

BibGuard

Running

BibGuard / src /fetchers /semantic_scholar_fetcher.py

thinkwee

init

46df5f0 3 months ago

5.39 kB

	"""
	Semantic Scholar API fetcher.
	Official API with high-quality metadata and generous rate limits.
	"""
	import time
	from dataclasses import dataclass
	from typing import Optional
	from urllib.parse import quote

	import requests


	@dataclass
	class SemanticScholarResult:
	    """Metadata for a single paper returned by the Semantic Scholar API."""

	    # Paper title.
	    title: str
	    # Author display names, in the order the API listed them.
	    authors: list[str]
	    # Publication year as text; empty string when the API gave no year.
	    year: str
	    # Abstract text.
	    abstract: str
	    # Semantic Scholar's own identifier for the paper (API field "paperId").
	    paper_id: str
	    # Citation count reported by the API (API field "citationCount").
	    citation_count: int
	    # URL of the paper as reported by the API.
	    url: str


	class SemanticScholarFetcher:
	    """
	    Fetcher using Semantic Scholar's official API.

	    API Docs: https://api.semanticscholar.org/
	    Rate Limits:
	        - Without API key: 100 requests per 5 minutes
	        - With API key: 5,000 requests per 5 minutes (free)

	    Every public lookup is best-effort: it returns ``None`` instead of
	    raising when the identifier is empty, the request fails, the server
	    answers with an error status, or the payload cannot be parsed.
	    """

	    BASE_URL = "https://api.semanticscholar.org/graph/v1"

	    # Minimum spacing between two requests, in seconds (120 req/min max).
	    # NOTE(review): this is faster than the keyless quota quoted in the
	    # class docstring (100 requests per 5 minutes), so sustained keyless use
	    # may still be throttled by the server - confirm against current limits.
	    RATE_LIMIT_DELAY = 0.5

	    # Per-request timeout in seconds.
	    REQUEST_TIMEOUT = 10

	    # Fields requested from the API for every paper lookup.
	    PAPER_FIELDS = 'title,authors,year,abstract,paperId,citationCount,url'

	    # Prefixes commonly found in front of a bare DOI (compared lowercase).
	    _DOI_PREFIXES = (
	        'https://doi.org/',
	        'http://doi.org/',
	        'https://dx.doi.org/',
	        'http://dx.doi.org/',
	        'doi:',
	    )

	    # Characters left untouched when an identifier is placed in the URL path:
	    # the ones that are legal inside a path, plus '%' so that input which is
	    # already percent-encoded is not encoded twice. Everything else - notably
	    # '#' and '?', which would otherwise cut the path short - gets escaped.
	    _PATH_SAFE = "/:@!$&'()*+,;=%"

	    def __init__(self, api_key: Optional[str] = None):
	        """
	        Initialize Semantic Scholar fetcher.

	        Args:
	            api_key: Optional API key for higher rate limits
	                (free from semanticscholar.org). When given, it is sent
	                as the ``x-api-key`` header on every request.
	        """
	        self.api_key = api_key
	        self._last_request_time = 0.0
	        self._session = requests.Session()

	        if api_key:
	            self._session.headers.update({'x-api-key': api_key})

	    def _rate_limit(self):
	        """Sleep as needed so requests are at least RATE_LIMIT_DELAY apart."""
	        # A monotonic clock keeps the wait bounded by RATE_LIMIT_DELAY even
	        # if the system wall clock is adjusted between two requests.
	        elapsed = time.monotonic() - self._last_request_time
	        if elapsed < self.RATE_LIMIT_DELAY:
	            time.sleep(self.RATE_LIMIT_DELAY - elapsed)
	        self._last_request_time = time.monotonic()

	    def _get_json(self, url: str, params: dict) -> Optional[dict]:
	        """
	        Perform a rate-limited GET and decode the JSON body.

	        Args:
	            url: Full endpoint URL.
	            params: Query-string parameters.

	        Returns:
	            The decoded JSON object, or None on a transport error, an error
	            status, an undecodable body, or a body that is not a JSON object.
	        """
	        self._rate_limit()
	        try:
	            response = self._session.get(
	                url, params=params, timeout=self.REQUEST_TIMEOUT
	            )
	            response.raise_for_status()
	            payload = response.json()
	        except (requests.RequestException, ValueError):
	            # ValueError is listed because, depending on the installed
	            # requests version, a non-JSON body may surface as a JSON decode
	            # error that is not a RequestException.
	            return None
	        return payload if isinstance(payload, dict) else None

	    def _fetch_paper(self, id_prefix: str, identifier: str) -> Optional["SemanticScholarResult"]:
	        """
	        Fetch one paper through the ``/paper/{PREFIX}:{id}`` endpoint.

	        Args:
	            id_prefix: Identifier namespace placed before the colon
	                (``DOI`` or ``ARXIV`` in this class).
	            identifier: Already-normalized identifier value.

	        Returns:
	            SemanticScholarResult if found, None otherwise. An empty
	            identifier returns None without issuing a request.
	        """
	        if not identifier:
	            return None
	        escaped = quote(identifier, safe=self._PATH_SAFE)
	        url = f"{self.BASE_URL}/paper/{id_prefix}:{escaped}"
	        return self._parse_paper(self._get_json(url, {'fields': self.PAPER_FIELDS}))

	    @classmethod
	    def _normalize_doi(cls, doi: str) -> str:
	        """Trim whitespace and drop a leading ``doi:`` / doi.org URL prefix."""
	        cleaned = (doi or '').strip()
	        lowered = cleaned.lower()
	        for prefix in cls._DOI_PREFIXES:
	            if lowered.startswith(prefix):
	                return cleaned[len(prefix):].strip()
	        return cleaned

	    @staticmethod
	    def _normalize_arxiv_id(arxiv_id: str) -> str:
	        """Trim whitespace and drop a leading ``arXiv:`` prefix (any case)."""
	        cleaned = (arxiv_id or '').strip()
	        if cleaned.lower().startswith('arxiv:'):
	            cleaned = cleaned[len('arxiv:'):].strip()
	        return cleaned

	    def search_by_title(self, title: str, max_results: int = 5) -> Optional["SemanticScholarResult"]:
	        """
	        Search for a paper by title.

	        Args:
	            title: Paper title to search for
	            max_results: Maximum number of results to fetch (default: 5).
	                Only the first result of the response is returned.

	        Returns:
	            SemanticScholarResult if found, None otherwise
	        """
	        # A blank query cannot match anything; skip the request entirely.
	        if not title or not title.strip():
	            return None

	        payload = self._get_json(
	            f"{self.BASE_URL}/paper/search",
	            {
	                'query': title,
	                'limit': max_results,
	                'fields': self.PAPER_FIELDS,
	            },
	        )
	        if payload is None:
	            return None

	        papers = payload.get('data')
	        if not isinstance(papers, list) or not papers:
	            return None

	        # Return the first (most relevant) result
	        return self._parse_paper(papers[0])

	    def fetch_by_doi(self, doi: str) -> Optional["SemanticScholarResult"]:
	        """
	        Fetch paper metadata by DOI.

	        Args:
	            doi: DOI of the paper. Surrounding whitespace and a leading
	                ``doi:`` or ``http(s)://(dx.)doi.org/`` prefix are ignored.

	        Returns:
	            SemanticScholarResult if found, None otherwise
	        """
	        return self._fetch_paper('DOI', self._normalize_doi(doi))

	    def fetch_by_arxiv_id(self, arxiv_id: str) -> Optional["SemanticScholarResult"]:
	        """
	        Fetch paper metadata by arXiv ID.

	        Args:
	            arxiv_id: arXiv ID (e.g., "2301.12345" or "arXiv:2301.12345")

	        Returns:
	            SemanticScholarResult if found, None otherwise
	        """
	        return self._fetch_paper('ARXIV', self._normalize_arxiv_id(arxiv_id))

	    @staticmethod
	    def _extract_fields(paper_data: dict) -> dict:
	        """
	        Map one API paper object to SemanticScholarResult keyword arguments.

	        Fields that are missing or JSON ``null`` fall back to an empty
	        string, an empty list, or 0, so the result never carries ``None``.
	        """
	        # `or []` covers both a missing key and an explicit null; entries
	        # that are not objects or have no usable name are skipped.
	        authors = [
	            author['name']
	            for author in (paper_data.get('authors') or [])
	            if isinstance(author, dict) and author.get('name')
	        ]

	        # Get year (may be None)
	        year = paper_data.get('year')

	        return {
	            'title': paper_data.get('title') or '',
	            'authors': authors,
	            'year': str(year) if year else '',
	            'abstract': paper_data.get('abstract') or '',
	            'paper_id': paper_data.get('paperId') or '',
	            'citation_count': paper_data.get('citationCount') or 0,
	            'url': paper_data.get('url') or '',
	        }

	    def _parse_paper(self, paper_data: dict) -> Optional["SemanticScholarResult"]:
	        """
	        Parse paper data from API response.

	        Returns:
	            SemanticScholarResult, or None when ``paper_data`` is not a JSON
	            object or its contents have an unexpected shape.
	        """
	        if not isinstance(paper_data, dict):
	            return None
	        try:
	            return SemanticScholarResult(**self._extract_fields(paper_data))
	        except (KeyError, TypeError):
	            return None