|
|
""" |
|
|
CrossRef API fetcher for bibliography metadata. |
|
|
|
|
|
CrossRef provides free, reliable access to metadata for academic publications. |
|
|
No API key required, no rate limiting for reasonable use. |
|
|
""" |
|
|
import time
from dataclasses import dataclass
from typing import Optional, List
from urllib.parse import quote, unquote

import requests
|
|
|
|
|
|
|
|
@dataclass
class CrossRefResult:
    """
    Bibliographic record describing one work returned by CrossRef.

    Attributes:
        title: Title of the work.
        authors: One display name per author.
        year: Publication year as a string ("" when unknown).
        doi: DOI of the work.
        publisher: Publisher name.
        container_title: Title of the containing publication
            (taken from CrossRef's ``container-title`` field).
        abstract: Abstract text; defaults to "".
        url: Link to the work; defaults to "".
    """

    # Required fields (positional order is part of the constructor signature).
    title: str
    authors: List[str]
    year: str
    doi: str
    publisher: str
    container_title: str

    # Optional fields.
    abstract: str = ""
    url: str = ""
|
|
|
|
|
|
|
|
class CrossRefFetcher:
    """
    Fetcher for the CrossRef REST API (``/works`` endpoint).

    Looks up bibliographic metadata either by title search or by DOI and
    converts the JSON response into a ``CrossRefResult``. Every lookup is
    best-effort: network errors, HTTP errors, malformed responses and
    records without a title all yield ``None`` rather than an exception.
    """

    BASE_URL = "https://api.crossref.org/works"
    # Minimum number of seconds between two consecutive requests.
    RATE_LIMIT_DELAY = 1.0
    # Seconds to wait for a response before giving up on a request.
    REQUEST_TIMEOUT = 30

    # Fields requested for title searches. 'created' is requested because
    # year extraction falls back to it when no publication date is present.
    _SELECT_FIELDS = (
        'title,author,published-print,published-online,created,'
        'DOI,publisher,container-title,abstract'
    )
    # Date fields consulted for the year, in order of preference.
    _DATE_FIELDS = ('published-print', 'published-online', 'created')
    # Resolver/scheme prefixes that may precede a bare DOI (lower-case).
    _DOI_PREFIXES = (
        'https://doi.org/',
        'http://doi.org/',
        'https://dx.doi.org/',
        'http://dx.doi.org/',
        'doi:',
    )

    def __init__(self, mailto: str = "bibguard@example.com"):
        """
        Initialize CrossRef fetcher.

        Args:
            mailto: Contact email sent in the User-Agent header
                (CrossRef's "polite pool" convention).
        """
        self.mailto = mailto
        # -inf means "no request made yet", so the first call never waits.
        self._last_request_time = float("-inf")
        self._session = requests.Session()

    def _rate_limit(self):
        """Sleep as needed so requests are at least RATE_LIMIT_DELAY apart."""
        # A monotonic clock is used so that system clock adjustments can
        # never produce a negative elapsed time (and thus a huge sleep).
        remaining = self.RATE_LIMIT_DELAY - (
            time.monotonic() - self._last_request_time
        )
        if remaining > 0:
            time.sleep(remaining)
        self._last_request_time = time.monotonic()

    def _get_headers(self) -> dict:
        """Get request headers with mailto for polite pool."""
        return {
            'User-Agent': f'BibGuard/1.0 (mailto:{self.mailto})',
            'Accept': 'application/json',
        }

    def _request_message(self, url: str, params: Optional[dict] = None) -> Optional[dict]:
        """
        Perform one rate-limited GET and return the response's ``message``.

        Args:
            url: Full URL to request.
            params: Optional query-string parameters.

        Returns:
            The ``message`` object of a response whose ``status`` is ``ok``,
            or None on any network/HTTP error, undecodable JSON, or an
            unexpected response shape.
        """
        self._rate_limit()
        try:
            response = self._session.get(
                url,
                params=params,
                headers=self._get_headers(),
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError):
            # ValueError covers JSON decoding failures on requests versions
            # whose JSONDecodeError is not a RequestException subclass.
            return None

        if not isinstance(data, dict) or data.get('status') != 'ok':
            return None
        message = data.get('message')
        return message if isinstance(message, dict) else None

    def search_by_title(self, title: str, max_results: int = 5) -> Optional["CrossRefResult"]:
        """
        Search for a paper by title.

        Args:
            title: Paper title to search for
            max_results: Maximum number of results to request from the API
                (only the top-ranked result is returned)

        Returns:
            Best matching CrossRefResult, or None if the title is blank,
            the request fails, or nothing usable is found
        """
        # A blank query would match arbitrary works; treat it as "not found".
        if not title or not title.strip():
            return None

        message = self._request_message(
            self.BASE_URL,
            params={
                'query.title': title,
                'rows': max_results,
                'select': self._SELECT_FIELDS,
            },
        )
        if message is None:
            return None

        items = message.get('items') or []
        if not items:
            return None
        return self._parse_item(items[0])

    def search_by_doi(self, doi: str) -> Optional["CrossRefResult"]:
        """
        Fetch metadata by DOI.

        Args:
            doi: DOI of the paper; may be a bare DOI, a ``doi:``-prefixed
                string, or a doi.org / dx.doi.org URL

        Returns:
            CrossRefResult, or None if the DOI is blank, the request fails,
            or the record cannot be parsed
        """
        bare_doi = self._normalize_doi(doi)
        if not bare_doi:
            # Without a DOI the URL would address the list endpoint instead.
            return None

        # Percent-encode so characters such as '#' or '?' inside the DOI
        # are not interpreted as URL fragment/query delimiters.
        encoded_doi = quote(bare_doi, safe='/')
        message = self._request_message(f"{self.BASE_URL}/{encoded_doi}")
        if message is None:
            return None
        return self._parse_item(message)

    @classmethod
    def _normalize_doi(cls, doi: str) -> str:
        """
        Reduce a DOI reference to the bare DOI.

        Strips surrounding whitespace and one leading resolver prefix
        (matched case-insensitively), and percent-decodes the result.
        Returns "" for blank or non-string input.
        """
        if not isinstance(doi, str):
            return ""
        cleaned = doi.strip()
        lowered = cleaned.lower()
        for prefix in cls._DOI_PREFIXES:
            if lowered.startswith(prefix):
                cleaned = cleaned[len(prefix):].strip()
                break
        return unquote(cleaned)

    @classmethod
    def _extract_year(cls, item: dict) -> str:
        """
        Return the year from the first usable date field, or "".

        Fields are tried in ``_DATE_FIELDS`` order; a field is skipped when
        it is missing, null, or its first date-part is empty or null.
        """
        for field_name in cls._DATE_FIELDS:
            date_info = item.get(field_name)
            if not isinstance(date_info, dict):
                continue
            date_parts = date_info.get('date-parts') or []
            if not date_parts:
                continue
            first = date_parts[0]
            if isinstance(first, (list, tuple)) and first and first[0] is not None:
                return str(first[0])
        return ""

    @staticmethod
    def _extract_authors(raw_authors) -> List[str]:
        """
        Build display names from CrossRef author entries.

        Produces "Given Family" (or just the family name when no given
        name is present); entries without a family name are skipped.
        """
        names = []
        for author in raw_authors or []:
            if not isinstance(author, dict):
                continue
            given = author.get('given') or ''
            family = author.get('family') or ''
            if family:
                names.append(f"{given} {family}" if given else family)
        return names

    def _parse_item(self, item: dict) -> Optional["CrossRefResult"]:
        """
        Parse a CrossRef API item into CrossRefResult.

        Args:
            item: One work record as decoded from the API's JSON.

        Returns:
            CrossRefResult, or None when the record has no title or has an
            unexpected structure.
        """
        if not isinstance(item, dict):
            return None

        try:
            # CrossRef delivers titles as a list; the first entry is used.
            titles = item.get('title') or []
            title = titles[0] if titles else ""
            if not title:
                return None

            authors = self._extract_authors(item.get('author'))
            year = self._extract_year(item)
            doi = item.get('DOI') or ''
            publisher = item.get('publisher') or ''

            container_titles = item.get('container-title') or []
            container_title = container_titles[0] if container_titles else ""

            abstract = item.get('abstract') or ''
            url = f"https://doi.org/{doi}" if doi else ""

            return CrossRefResult(
                title=title,
                authors=authors,
                year=year,
                doi=doi,
                publisher=publisher,
                container_title=container_title,
                abstract=abstract,
                url=url,
            )
        except (AttributeError, KeyError, IndexError, TypeError):
            # Structurally unexpected record: report as "not found".
            return None
|
|
|