"""Standalone arxiv2md integration for converting arXiv papers to markdown."""
|
|
| import hashlib |
| import logging |
| import os |
| import re |
| from pathlib import Path |
| from urllib.parse import urlparse |
|
|
| import requests |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
| class Arxiv2MD: |
| """Convert arXiv papers to markdown using arxiv2md API.""" |
|
|
| API_BASE = "https://arxiv2md.org/api/markdown" |
| RATE_LIMIT_RPM = 30 |
|
|
| def __init__(self, output_dir: Path = Path("data") / "papers"): |
| self.output_dir = output_dir |
| self.output_dir.mkdir(parents=True, exist_ok=True) |
|
|
| def _extract_paper_id(self, arxiv_url: str) -> str: |
| """Extract paper ID from arXiv URL.""" |
| logger.info(f"Extracting paper ID from URL: {arxiv_url}") |
|
|
| |
| if "arxiv.org" in arxiv_url: |
| |
| arxiv_url = re.sub(r"v\d+$", "", arxiv_url) |
| |
| parts = arxiv_url.split("/") |
| paper_id = parts[-1].replace(".pdf", "").replace(".html", "") |
| logger.info(f"Extracted arXiv ID: {paper_id}") |
| return paper_id |
| else: |
| |
| paper_id = arxiv_url.replace(".pdf", "").replace(".html", "") |
| return paper_id |
|
|
| def _get_paper_path(self, paper_id: str) -> Path: |
| """Get the file path for a cached paper.""" |
| return self.output_dir / f"{paper_id}.md" |
|
|
| def _load_cached_paper(self, paper_id: str) -> str | None: |
| """Load cached paper if available.""" |
| paper_path = self._get_paper_path(paper_id) |
| if paper_path.exists(): |
| with open(paper_path, "r", encoding="utf-8") as f: |
| text = f.read() |
| logger.info(f"Loaded cached paper {paper_id} from {paper_path}") |
| return text |
| return None |
|
|
| def _save_paper(self, paper_id: str, markdown: str): |
| """Save processed paper to cache.""" |
| paper_path = self._get_paper_path(paper_id) |
| with open(paper_path, "w", encoding="utf-8") as f: |
| f.write(markdown) |
| logger.info(f"Saved paper {paper_id} to {paper_path}") |
|
|
| def _fetch_markdown(self, arxiv_url: str) -> str: |
| """Fetch markdown from arxiv2md API.""" |
| logger.info(f"Fetching markdown from arxiv2md API for {arxiv_url}") |
|
|
| |
| params = { |
| "url": arxiv_url, |
| "remove_refs": "true", |
| "remove_toc": "true", |
| "remove_citations": "true", |
| } |
|
|
| try: |
| response = requests.get(self.API_BASE, params=params, timeout=60) |
| response.raise_for_status() |
| markdown = response.text |
| logger.info(f"Successfully fetched markdown ({len(markdown)} chars)") |
| return markdown |
| except requests.exceptions.RequestException as e: |
| logger.error(f"Error fetching from arxiv2md API: {e}") |
| raise Exception(f"Failed to fetch paper from arxiv2md: {e}") |
|
|
| def __call__(self, arxiv_url: str) -> str: |
| """Process an arXiv URL and return its markdown content. |
| |
| Args: |
| arxiv_url: URL to the arXiv paper (e.g., https://arxiv.org/abs/2006.12834) |
| |
| Returns: |
| Markdown text of the paper with references removed |
| """ |
| logger.debug(f"Arxiv2MD({arxiv_url})") |
|
|
| |
| paper_id = self._extract_paper_id(arxiv_url) |
|
|
| |
| cached_text = self._load_cached_paper(paper_id) |
| if cached_text is not None: |
| return cached_text |
|
|
| |
| markdown = self._fetch_markdown(arxiv_url) |
|
|
| |
| self._save_paper(paper_id, markdown) |
|
|
| return markdown |
|
|
|
|
|
|