# fetch_papers.py — from sen-1/references ("Add references folder and research skills", commit ef06968)
"""
Fetch papers from arXiv using only stdlib + requests.
Downloads PDF and generates paper.md with YAML front matter.
"""
import os
import re
import sys
import json
import time
import unicodedata
import xml.etree.ElementTree as ET
import requests
# Directory that holds the reference folders (the directory of this script).
# abspath guards against dirname(__file__) being '' when the script is run
# by a relative path; the original os.path.join(...) with one argument was a no-op.
REFS_DIR = os.path.dirname(os.path.abspath(__file__))
def normalize_name(name):
    """Return the lowercased, accent-stripped last word of *name*.

    Used to build folder names like "2020.arxiv.smith" from an author string.
    """
    tokens = name.strip().split()
    surname = tokens[-1] if tokens else name
    # NFD decomposition splits accented letters into base char + combining
    # marks (category 'Mn'); dropping the marks removes the accents.
    decomposed = unicodedata.normalize('NFD', surname)
    plain = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(plain).lower()
def fetch_arxiv_metadata(arxiv_id):
    """Fetch metadata for *arxiv_id* from the arXiv Atom API.

    Returns a dict with keys title/authors/year/abstract/arxiv_id/url,
    or None when the API returns no matching entry.
    """
    # Use HTTPS; the plain-HTTP endpoint is an unnecessary downgrade.
    url = f"https://export.arxiv.org/api/query?id_list={arxiv_id}"
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    ns = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}
    root = ET.fromstring(resp.text)
    entry = root.find('atom:entry', ns)
    if entry is None:
        return None
    # findtext with a default avoids AttributeError on a malformed entry
    # that lacks one of the expected child elements.
    title = entry.findtext('atom:title', default='', namespaces=ns).strip().replace('\n', ' ')
    title = re.sub(r'\s+', ' ', title)
    authors = [
        a.findtext('atom:name', default='', namespaces=ns).strip()
        for a in entry.findall('atom:author', ns)
    ]
    published = entry.findtext('atom:published', default='0000', namespaces=ns)
    year = int(published[:4])
    summary = entry.findtext('atom:summary', default='', namespaces=ns).strip()
    return {
        'title': title,
        'authors': authors,
        'year': year,
        'abstract': summary,
        'arxiv_id': arxiv_id,
        'url': f'https://arxiv.org/abs/{arxiv_id}'
    }
def download_pdf(arxiv_id, dest_path):
    """Download the arXiv PDF for *arxiv_id* to *dest_path*. Returns True."""
    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    response = requests.get(pdf_url, timeout=60, allow_redirects=True)
    response.raise_for_status()
    pdf_bytes = response.content
    with open(dest_path, 'wb') as out:
        out.write(pdf_bytes)
    return True
def download_source(arxiv_id, folder):
    """Try to download and unpack the LaTeX source for *arxiv_id*.

    Best-effort: saves the main .tex file as <folder>/paper.tex and returns
    its text, or returns None when no usable source is available.
    """
    import tarfile
    import gzip
    from io import BytesIO
    url = f"https://arxiv.org/e-print/{arxiv_id}"
    try:
        resp = requests.get(url, timeout=60, allow_redirects=True)
        resp.raise_for_status()
        content = resp.content
        # Most e-print payloads are gzipped tarballs of the LaTeX project.
        try:
            with tarfile.open(fileobj=BytesIO(content), mode='r:gz') as tar:
                tex_files = [m.name for m in tar.getmembers() if m.name.endswith('.tex')]
                source_dir = os.path.join(folder, 'source')
                os.makedirs(source_dir, exist_ok=True)
                # NOTE(review): extractall on an untrusted archive can write
                # outside source_dir on older Pythons; consider
                # tar.extractall(path=source_dir, filter='data') once the
                # minimum supported Python allows it (3.12+ default).
                tar.extractall(path=source_dir)
                # Prefer a file named like "main.tex"; else take the first .tex.
                main_tex = next((n for n in tex_files if 'main' in n.lower()), None)
                if main_tex is None and tex_files:
                    main_tex = tex_files[0]
                if main_tex:
                    with open(os.path.join(source_dir, main_tex), 'r', errors='ignore') as f:
                        tex_content = f.read()
                    # Save a copy as paper.tex next to paper.pdf.
                    with open(os.path.join(folder, 'paper.tex'), 'w') as f:
                        f.write(tex_content)
                    return tex_content
        except tarfile.TarError:
            pass
        # Single-file submissions arrive as a bare gzipped .tex file.
        try:
            tex_content = gzip.decompress(content).decode('utf-8', errors='ignore')
            if '\\documentclass' in tex_content or '\\begin{document}' in tex_content:
                with open(os.path.join(folder, 'paper.tex'), 'w') as f:
                    f.write(tex_content)
                return tex_content
        except Exception:
            # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
            # still propagate. Any decompression failure means "no source".
            pass
    except Exception as e:
        print(f" Source download failed: {e}")
    return None
def tex_to_md(tex_content):
    """Basic LaTeX-to-Markdown conversion (headings, emphasis, lists, links).

    Best-effort regex rewriting; unrecognized LaTeX commands are left in place.
    """
    md = tex_content
    # Keep only the body: drop the preamble and everything after \end{document}.
    doc_match = re.search(r'\\begin\{document\}', md)
    if doc_match:
        md = md[doc_match.end():]
    md = re.sub(r'\\end\{document\}.*', '', md, flags=re.DOTALL)
    # Strip LaTeX comments. The lookbehind keeps escaped percent signs (\%),
    # which the previous pattern (r'%.*$') wrongly treated as comment starts.
    md = re.sub(r'(?<!\\)%.*$', '', md, flags=re.MULTILINE)
    # Headings
    md = re.sub(r'\\section\*?\{([^}]+)\}', r'# \1', md)
    md = re.sub(r'\\subsection\*?\{([^}]+)\}', r'## \1', md)
    md = re.sub(r'\\subsubsection\*?\{([^}]+)\}', r'### \1', md)
    # Inline formatting
    md = re.sub(r'\\textbf\{([^}]+)\}', r'**\1**', md)
    md = re.sub(r'\\textit\{([^}]+)\}', r'*\1*', md)
    md = re.sub(r'\\emph\{([^}]+)\}', r'*\1*', md)
    md = re.sub(r'\\texttt\{([^}]+)\}', r'`\1`', md)
    # Citations and links
    md = re.sub(r'\\cite\w*\{([^}]+)\}', r'[\1]', md)
    md = re.sub(r'\\url\{([^}]+)\}', r'\1', md)
    md = re.sub(r'\\href\{([^}]+)\}\{([^}]+)\}', r'[\2](\1)', md)
    # Lists: drop environment markers, turn \item into a bullet.
    md = re.sub(r'\\begin\{itemize\}', '', md)
    md = re.sub(r'\\end\{itemize\}', '', md)
    md = re.sub(r'\\begin\{enumerate\}', '', md)
    md = re.sub(r'\\end\{enumerate\}', '', md)
    md = re.sub(r'\\item\s*', '- ', md)
    # Collapse runs of blank lines.
    md = re.sub(r'\n{3,}', '\n\n', md)
    return md.strip()
def fetch_paper(arxiv_id):
    """Fetch a single arXiv paper: metadata, PDF, and generated paper.md.

    Accepts a bare id, an "arxiv:" prefix, or an abs/pdf URL.
    Returns the reference folder name, or None when the id is unknown.
    """
    # Normalize the id: drop URL/prefix forms, any trailing slash, and a
    # trailing ".pdf". (The original str.rstrip('.pdf') strips a *character
    # set*, not the suffix, and could eat legitimate trailing id characters.)
    arxiv_id = re.sub(r'^(arxiv:|https?://arxiv\.org/(abs|pdf)/)', '', arxiv_id)
    arxiv_id = arxiv_id.rstrip('/')
    if arxiv_id.endswith('.pdf'):
        arxiv_id = arxiv_id[:-len('.pdf')]
    print(f"\n{'='*60}")
    print(f"Fetching arXiv:{arxiv_id}")
    print(f"{'='*60}")
    # 1. Get metadata
    meta = fetch_arxiv_metadata(arxiv_id)
    if not meta:
        print(f" ERROR: Paper not found: {arxiv_id}")
        return None
    print(f" Title: {meta['title'][:70]}...")
    print(f" Authors: {', '.join(meta['authors'][:3])}")
    print(f" Year: {meta['year']}")
    # 2. Create folder named <year>.arxiv.<first-author-surname>
    author = normalize_name(meta['authors'][0]) if meta['authors'] else 'unknown'
    folder_name = f"{meta['year']}.arxiv.{author}"
    folder = os.path.join(REFS_DIR, folder_name)
    os.makedirs(folder, exist_ok=True)
    print(f" Folder: {folder_name}/")
    # 3. Build YAML front matter for paper.md
    authors_yaml = '\n'.join(f' - "{a}"' for a in meta['authors'])
    front_matter = f'''---
title: "{meta['title']}"
authors:
{authors_yaml}
year: {meta['year']}
venue: "arXiv"
url: "{meta['url']}"
arxiv: "{arxiv_id}"
---
'''
    # 4. Download PDF (skipped when it already exists)
    pdf_path = os.path.join(folder, 'paper.pdf')
    if not os.path.exists(pdf_path):
        print(f" Downloading PDF...")
        download_pdf(arxiv_id, pdf_path)
        print(f" Saved: paper.pdf")
    else:
        print(f" PDF already exists")
    # 5. Prefer LaTeX source for paper.md; fall back to metadata + abstract.
    md_path = os.path.join(folder, 'paper.md')
    if not os.path.exists(md_path):
        print(f" Downloading LaTeX source...")
        tex_content = download_source(arxiv_id, folder)
        if tex_content:
            md_text = tex_to_md(tex_content)
            with open(md_path, 'w') as f:
                f.write(front_matter + md_text)
            print(f" Generated: paper.md (from LaTeX)")
        else:
            # Fallback: write front matter + abstract
            with open(md_path, 'w') as f:
                f.write(front_matter)
                f.write(f"# {meta['title']}\n\n")
                f.write(f"## Abstract\n\n{meta['abstract']}\n\n")
                f.write(f"*Full text available in paper.pdf*\n")
            print(f" Generated: paper.md (metadata + abstract only)")
    else:
        print(f" paper.md already exists")
    # 6. Record the paper in paper_db.json
    update_paper_db(folder_name, meta)
    print(f" Done: {folder_name}/")
    return folder_name
def fetch_acl(acl_id):
    """Fetch a paper from the ACL Anthology: BibTeX metadata, PDF, paper.md.

    Accepts a bare anthology id or a full aclanthology.org URL.
    Returns the reference folder name.
    """
    # Normalize the id: drop the URL prefix, any trailing slash, and a
    # trailing ".pdf". (The original str.rstrip('.pdf') strips a *character
    # set*, not the suffix, and could eat legitimate trailing id characters.)
    acl_id = re.sub(r'^https?://aclanthology\.org/', '', acl_id)
    acl_id = acl_id.rstrip('/')
    if acl_id.endswith('.pdf'):
        acl_id = acl_id[:-len('.pdf')]
    print(f"\n{'='*60}")
    print(f"Fetching ACL:{acl_id}")
    print(f"{'='*60}")
    # Get BibTeX for metadata; fall back to placeholders on any failure
    # (network error or unparseable response) — this is best-effort.
    bib_url = f"https://aclanthology.org/{acl_id}.bib"
    try:
        resp = requests.get(bib_url, timeout=15)
        bib_text = resp.text
        title_match = re.search(r'title\s*=\s*["{]([^"}]+)', bib_text)
        title = title_match.group(1) if title_match else acl_id
        author_match = re.search(r'author\s*=\s*["{]([^"}]+)', bib_text)
        authors = []
        if author_match:
            authors = [a.strip() for a in author_match.group(1).split(' and ')]
        year_match = re.search(r'year\s*=\s*["{]?(\d{4})', bib_text)
        year = int(year_match.group(1)) if year_match else 2020
        venue_match = re.search(r'booktitle\s*=\s*["{]([^"}]+)', bib_text)
        venue = venue_match.group(1) if venue_match else "ACL"
    except Exception:
        # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
        # still propagate while keeping the best-effort defaults.
        title = acl_id
        authors = ["unknown"]
        year = 2020
        venue = "ACL"
    print(f" Title: {title[:70]}...")
    print(f" Authors: {', '.join(authors[:3])}")
    # Derive a short venue tag from the id: new-style "2020.acl-main.1"
    # carries the venue; old-style single-letter prefixes are mapped below.
    venue_short = "acl"
    m = re.match(r'(\d{4})\.([a-z\-]+)', acl_id)
    if m:
        venue_short = m.group(2).split('-')[0]
    else:
        prefix_map = {'P': 'acl', 'N': 'naacl', 'E': 'eacl', 'D': 'emnlp', 'C': 'coling', 'W': 'workshop'}
        m2 = re.match(r'([A-Z])(\d{2})', acl_id)
        if m2:
            venue_short = prefix_map.get(m2.group(1), 'acl')
    author_name = normalize_name(authors[0]) if authors else 'unknown'
    folder_name = f"{year}.{venue_short}.{author_name}"
    folder = os.path.join(REFS_DIR, folder_name)
    os.makedirs(folder, exist_ok=True)
    print(f" Folder: {folder_name}/")
    # Download PDF (skipped when it already exists)
    pdf_path = os.path.join(folder, 'paper.pdf')
    if not os.path.exists(pdf_path):
        pdf_url = f"https://aclanthology.org/{acl_id}.pdf"
        print(f" Downloading PDF...")
        resp = requests.get(pdf_url, timeout=60, allow_redirects=True)
        resp.raise_for_status()
        with open(pdf_path, 'wb') as f:
            f.write(resp.content)
        print(f" Saved: paper.pdf")
    # Create paper.md with YAML front matter (metadata only — no full text)
    md_path = os.path.join(folder, 'paper.md')
    if not os.path.exists(md_path):
        authors_yaml = '\n'.join(f' - "{a}"' for a in authors)
        front_matter = f'''---
title: "{title}"
authors:
{authors_yaml}
year: {year}
venue: "{venue}"
url: "https://aclanthology.org/{acl_id}"
---
'''
        with open(md_path, 'w') as f:
            f.write(front_matter)
            f.write(f"# {title}\n\n")
            f.write(f"*Full text available in paper.pdf*\n")
        print(f" Generated: paper.md")
    # Record the paper in paper_db.json
    meta = {
        'title': title,
        'authors': authors,
        'year': year,
        'abstract': '',
        'arxiv_id': None,
        'url': f'https://aclanthology.org/{acl_id}'
    }
    update_paper_db(folder_name, meta)
    print(f" Done: {folder_name}/")
    return folder_name
def update_paper_db(folder_name, meta):
    """Insert or overwrite the entry for *folder_name* in paper_db.json."""
    db_path = os.path.join(REFS_DIR, 'paper_db.json')
    # Load the existing database, or start a fresh one on first use.
    try:
        with open(db_path, 'r') as fh:
            db = json.load(fh)
    except FileNotFoundError:
        db = {"papers": {}, "s2_cache": {}}
    entry = {
        "id": folder_name,
        "title": meta['title'],
        "authors": meta['authors'],
        "year": meta['year'],
        # Only arXiv papers carry an arxiv_id; everything else came from ACL.
        "venue": "arXiv" if meta.get('arxiv_id') else "ACL",
        "url": meta['url'],
        "arxiv_id": meta.get('arxiv_id'),
        "s2_id": None,
        "doi": None,
        "citation_count": 0,
        "abstract": meta.get('abstract', ''),
        "tldr": "",
        "keywords": [],
        "references": [],
        "cited_by": [],
        "local_path": f"references/{folder_name}",
        "fetched": True,
    }
    db['papers'][folder_name] = entry
    with open(db_path, 'w') as fh:
        json.dump(db, fh, indent=2, ensure_ascii=False)
if __name__ == "__main__":
    # Require at least one paper id on the command line.
    if len(sys.argv) < 2:
        print("Usage: python3 fetch_papers.py <arxiv_id_or_acl_id> [...]")
        sys.exit(1)
    for raw_id in sys.argv[1:]:
        paper_id = raw_id.strip()
        if not paper_id:
            continue
        try:
            # Route to the right fetcher: arXiv ids look like "2103.00020"
            # (or carry an "arxiv:" prefix); ACL ids are "P19-1234" or
            # "2020.acl-..."; anything else is assumed to be arXiv.
            looks_arxiv = bool(re.match(r'^\d{4}\.\d{4,5}', paper_id)) or paper_id.startswith('arxiv:')
            looks_acl = bool(re.match(r'^[A-Z]\d{2}-\d+$', paper_id)) or bool(re.match(r'^\d{4}\.[a-z]', paper_id))
            if looks_arxiv:
                fetch_paper(paper_id)
            elif looks_acl:
                fetch_acl(paper_id)
            else:
                fetch_paper(paper_id)
            time.sleep(3)  # Rate limiting
        except Exception as e:
            print(f" ERROR fetching {paper_id}: {e}")