Spaces:

UKPLab
/

scicoqa

Running

App Files Files Community

scicoqa / core /arxiv2md_demo.py

timbmg

initial commit

4caa453 unverified 3 months ago

raw

history blame contribute delete

3.95 kB

	"""Standalone arxiv2md integration for converting arXiv papers to markdown."""

	import hashlib
	import logging
	import os
	import re
	from pathlib import Path
	from urllib.parse import urlparse

	import requests

	logger = logging.getLogger(__name__)


	class Arxiv2MD:
	"""Convert arXiv papers to markdown using arxiv2md API."""

	API_BASE = "https://arxiv2md.org/api/markdown"
	RATE_LIMIT_RPM = 30 # 30 requests per minute per IP

	def __init__(self, output_dir: Path = Path("data") / "papers"):
	self.output_dir = output_dir
	self.output_dir.mkdir(parents=True, exist_ok=True)

	def _extract_paper_id(self, arxiv_url: str) -> str:
	"""Extract paper ID from arXiv URL."""
	logger.info(f"Extracting paper ID from URL: {arxiv_url}")

	# Handle different arXiv URL formats
	if "arxiv.org" in arxiv_url:
	# Remove version suffix if present (e.g., v1, v2)
	arxiv_url = re.sub(r"v\d+$", "", arxiv_url)
	# Extract ID from URL
	parts = arxiv_url.split("/")
	paper_id = parts[-1].replace(".pdf", "").replace(".html", "")
	logger.info(f"Extracted arXiv ID: {paper_id}")
	return paper_id
	else:
	# Assume it's already an ID
	paper_id = arxiv_url.replace(".pdf", "").replace(".html", "")
	return paper_id

	def _get_paper_path(self, paper_id: str) -> Path:
	"""Get the file path for a cached paper."""
	return self.output_dir / f"{paper_id}.md"

	def _load_cached_paper(self, paper_id: str) -> str \| None:
	"""Load cached paper if available."""
	paper_path = self._get_paper_path(paper_id)
	if paper_path.exists():
	with open(paper_path, "r", encoding="utf-8") as f:
	text = f.read()
	logger.info(f"Loaded cached paper {paper_id} from {paper_path}")
	return text
	return None

	def _save_paper(self, paper_id: str, markdown: str):
	"""Save processed paper to cache."""
	paper_path = self._get_paper_path(paper_id)
	with open(paper_path, "w", encoding="utf-8") as f:
	f.write(markdown)
	logger.info(f"Saved paper {paper_id} to {paper_path}")

	def _fetch_markdown(self, arxiv_url: str) -> str:
	"""Fetch markdown from arxiv2md API."""
	logger.info(f"Fetching markdown from arxiv2md API for {arxiv_url}")

	# Prepare API parameters
	params = {
	"url": arxiv_url,
	"remove_refs": "true", # Remove references section (required)
	"remove_toc": "true", # Remove table of contents
	"remove_citations": "true", # Remove inline citations
	}

	try:
	response = requests.get(self.API_BASE, params=params, timeout=60)
	response.raise_for_status()
	markdown = response.text
	logger.info(f"Successfully fetched markdown ({len(markdown)} chars)")
	return markdown
	except requests.exceptions.RequestException as e:
	logger.error(f"Error fetching from arxiv2md API: {e}")
	raise Exception(f"Failed to fetch paper from arxiv2md: {e}")

	def __call__(self, arxiv_url: str) -> str:
	"""Process an arXiv URL and return its markdown content.

	Args:
	arxiv_url: URL to the arXiv paper (e.g., https://arxiv.org/abs/2006.12834)

	Returns:
	Markdown text of the paper with references removed
	"""
	logger.debug(f"Arxiv2MD({arxiv_url})")

	# Extract paper ID
	paper_id = self._extract_paper_id(arxiv_url)

	# Check cache first
	cached_text = self._load_cached_paper(paper_id)
	if cached_text is not None:
	return cached_text

	# Fetch from API
	markdown = self._fetch_markdown(arxiv_url)

	# Save to cache
	self._save_paper(paper_id, markdown)

	return markdown