# scicoqa / core /arxiv2md_demo.py
# timbmg's picture
# initial commit
# 4caa453 unverified
"""Standalone arxiv2md integration for converting arXiv papers to markdown."""
import hashlib
import logging
import os
import re
from pathlib import Path
from urllib.parse import urlparse
import requests
logger = logging.getLogger(__name__)
class Arxiv2MD:
"""Convert arXiv papers to markdown using arxiv2md API."""
API_BASE = "https://arxiv2md.org/api/markdown"
RATE_LIMIT_RPM = 30 # 30 requests per minute per IP
def __init__(self, output_dir: Path = Path("data") / "papers"):
self.output_dir = output_dir
self.output_dir.mkdir(parents=True, exist_ok=True)
def _extract_paper_id(self, arxiv_url: str) -> str:
"""Extract paper ID from arXiv URL."""
logger.info(f"Extracting paper ID from URL: {arxiv_url}")
# Handle different arXiv URL formats
if "arxiv.org" in arxiv_url:
# Remove version suffix if present (e.g., v1, v2)
arxiv_url = re.sub(r"v\d+$", "", arxiv_url)
# Extract ID from URL
parts = arxiv_url.split("/")
paper_id = parts[-1].replace(".pdf", "").replace(".html", "")
logger.info(f"Extracted arXiv ID: {paper_id}")
return paper_id
else:
# Assume it's already an ID
paper_id = arxiv_url.replace(".pdf", "").replace(".html", "")
return paper_id
def _get_paper_path(self, paper_id: str) -> Path:
"""Get the file path for a cached paper."""
return self.output_dir / f"{paper_id}.md"
def _load_cached_paper(self, paper_id: str) -> str | None:
"""Load cached paper if available."""
paper_path = self._get_paper_path(paper_id)
if paper_path.exists():
with open(paper_path, "r", encoding="utf-8") as f:
text = f.read()
logger.info(f"Loaded cached paper {paper_id} from {paper_path}")
return text
return None
def _save_paper(self, paper_id: str, markdown: str):
"""Save processed paper to cache."""
paper_path = self._get_paper_path(paper_id)
with open(paper_path, "w", encoding="utf-8") as f:
f.write(markdown)
logger.info(f"Saved paper {paper_id} to {paper_path}")
def _fetch_markdown(self, arxiv_url: str) -> str:
"""Fetch markdown from arxiv2md API."""
logger.info(f"Fetching markdown from arxiv2md API for {arxiv_url}")
# Prepare API parameters
params = {
"url": arxiv_url,
"remove_refs": "true", # Remove references section (required)
"remove_toc": "true", # Remove table of contents
"remove_citations": "true", # Remove inline citations
}
try:
response = requests.get(self.API_BASE, params=params, timeout=60)
response.raise_for_status()
markdown = response.text
logger.info(f"Successfully fetched markdown ({len(markdown)} chars)")
return markdown
except requests.exceptions.RequestException as e:
logger.error(f"Error fetching from arxiv2md API: {e}")
raise Exception(f"Failed to fetch paper from arxiv2md: {e}")
def __call__(self, arxiv_url: str) -> str:
"""Process an arXiv URL and return its markdown content.
Args:
arxiv_url: URL to the arXiv paper (e.g., https://arxiv.org/abs/2006.12834)
Returns:
Markdown text of the paper with references removed
"""
logger.debug(f"Arxiv2MD({arxiv_url})")
# Extract paper ID
paper_id = self._extract_paper_id(arxiv_url)
# Check cache first
cached_text = self._load_cached_paper(paper_id)
if cached_text is not None:
return cached_text
# Fetch from API
markdown = self._fetch_markdown(arxiv_url)
# Save to cache
self._save_paper(paper_id, markdown)
return markdown