ResearchRadar / app /fetcher /arxiv_client.py
ak0601's picture
Upload 63 files
fdcd9e5 verified
"""
ResearchRadar β€” arXiv Atom API client.
Fetches papers submitted/updated within the last N days for given arXiv
categories. Uses xml.etree.ElementTree (stdlib) β€” no lxml needed.
"""
from __future__ import annotations
import logging
import xml.etree.ElementTree as ET
from datetime import date, datetime, timedelta
from typing import List
from app.core.config import ARXIV_BASE_URL, ARXIV_MAX_RESULTS
from app.core.models import Paper
from app.fetcher.http_session import FetchError, RetrySession
logger = logging.getLogger(__name__)
# arXiv Atom namespace
_NS = {
'atom': 'http://www.w3.org/2005/Atom',
'arxiv': 'http://arxiv.org/schemas/atom',
}
def fetch_papers(
category_slug: str,
arxiv_cats: List[str],
session: RetrySession,
days_back: int = 7,
) -> List[Paper]:
"""
Fetch papers submitted/updated within *days_back* days across all
arXiv categories in *arxiv_cats*.
Returns a list of Paper instances. Never raises β€” returns [] on error.
"""
today = date.today()
start = today - timedelta(days=days_back)
end = today
query = '(' + ' OR '.join(f'cat:{c}' for c in arxiv_cats) + ')'
params = {
'search_query': query,
'start': 0,
'max_results': ARXIV_MAX_RESULTS,
'sortBy': 'submittedDate',
'sortOrder': 'descending',
}
try:
response = session.get(ARXIV_BASE_URL, params=params)
except FetchError as exc:
logger.error('arXiv fetch failed for %s: %s', category_slug, exc)
return []
try:
root = ET.fromstring(response.text)
except ET.ParseError as exc:
logger.error(
'arXiv XML parse error: %s β€” snippet: %s',
exc, response.text[:300],
)
return []
papers: List[Paper] = []
for entry in root.findall('atom:entry', _NS):
try:
paper = _parse_entry(entry, category_slug, start, end)
if paper is not None:
papers.append(paper)
except Exception:
logger.debug('Skipping malformed arXiv entry', exc_info=True)
logger.info(
'arXiv: fetched %d papers for [%s] (%s)',
len(papers), category_slug, ', '.join(arxiv_cats),
)
return papers
def _parse_entry(
entry: ET.Element,
category_slug: str,
start: date,
end: date,
) -> Paper | None:
"""Parse a single <entry> element into a Paper, or return None."""
title_el = entry.find('atom:title', _NS)
abstract_el = entry.find('atom:summary', _NS)
if title_el is None or abstract_el is None:
return None
title = ' '.join((title_el.text or '').split())
abstract = ' '.join((abstract_el.text or '').split())
if not title or not abstract:
logger.debug('Skipping entry with empty title/abstract')
return None
# arXiv ID
id_el = entry.find('atom:id', _NS)
raw_id = (id_el.text or '') if id_el is not None else ''
arxiv_id = raw_id.replace('http://arxiv.org/abs/', '').strip()
if not arxiv_id:
return None
paper_id = f'arxiv:{arxiv_id}'
# Authors
authors = []
for author_el in entry.findall('atom:author', _NS):
name_el = author_el.find('atom:name', _NS)
if name_el is not None and name_el.text:
authors.append(name_el.text.strip())
# Published date
pub_el = entry.find('atom:published', _NS)
pub_text = (pub_el.text or '') if pub_el is not None else ''
try:
published = datetime.fromisoformat(
pub_text.replace('Z', '+00:00')
).date()
except (ValueError, TypeError):
published = date.today()
# Categories
categories = []
for cat_el in entry.findall('atom:category', _NS):
term = cat_el.get('term', '')
if term:
categories.append(term)
# PDF link
pdf_url = None
for link_el in entry.findall('atom:link', _NS):
if link_el.get('title') == 'pdf':
pdf_url = link_el.get('href')
break
if pdf_url is None and arxiv_id:
pdf_url = f'https://arxiv.org/pdf/{arxiv_id}'
abstract_url = f'https://arxiv.org/abs/{arxiv_id}'
return Paper(
paper_id=paper_id,
source='arxiv',
title=title,
abstract=abstract,
authors=authors,
published_date=published,
categories=categories,
app_category=category_slug,
pdf_url=pdf_url,
abstract_url=abstract_url,
)