# fetch_papers.py — from sen-1/references ("Add references folder and research skills", commit ef06968)
"""
Fetch papers from arXiv using only stdlib + requests.
Downloads PDF and generates paper.md with YAML front matter.
"""
import os
import re
import sys
import json
import time
import unicodedata
import xml.etree.ElementTree as ET
import requests
# Directory that holds the reference folders (the directory of this script).
# abspath guards against dirname(__file__) being '' when the script is run
# by a relative path; the original os.path.join(...) with one argument was a no-op.
REFS_DIR = os.path.dirname(os.path.abspath(__file__))
def normalize_name(name):
    """Return the lowercased, accent-stripped last word of *name*.

    Used to build folder names like "2020.arxiv.smith" from an author string.
    """
    tokens = name.strip().split()
    surname = tokens[-1] if tokens else name
    # NFD decomposition splits accented letters into base char + combining
    # marks (category 'Mn'); dropping the marks removes the accents.
    decomposed = unicodedata.normalize('NFD', surname)
    plain = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(plain).lower()
def fetch_arxiv_metadata(arxiv_id):
    """Fetch metadata for *arxiv_id* from the arXiv Atom API.

    Returns a dict with keys title/authors/year/abstract/arxiv_id/url,
    or None when the API returns no matching entry.
    """
    # Use HTTPS; the plain-HTTP endpoint is an unnecessary downgrade.
    url = f"https://export.arxiv.org/api/query?id_list={arxiv_id}"
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    ns = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}
    root = ET.fromstring(resp.text)
    entry = root.find('atom:entry', ns)
    if entry is None:
        return None
    # findtext with a default avoids AttributeError on a malformed entry
    # that lacks one of the expected child elements.
    title = entry.findtext('atom:title', default='', namespaces=ns).strip().replace('\n', ' ')
    title = re.sub(r'\s+', ' ', title)
    authors = [
        a.findtext('atom:name', default='', namespaces=ns).strip()
        for a in entry.findall('atom:author', ns)
    ]
    published = entry.findtext('atom:published', default='0000', namespaces=ns)
    year = int(published[:4])
    summary = entry.findtext('atom:summary', default='', namespaces=ns).strip()
    return {
        'title': title,
        'authors': authors,
        'year': year,
        'abstract': summary,
        'arxiv_id': arxiv_id,
        'url': f'https://arxiv.org/abs/{arxiv_id}'
    }
def download_pdf(arxiv_id, dest_path):
    """Download the arXiv PDF for *arxiv_id* to *dest_path*. Returns True."""
    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    response = requests.get(pdf_url, timeout=60, allow_redirects=True)
    response.raise_for_status()
    pdf_bytes = response.content
    with open(dest_path, 'wb') as out:
        out.write(pdf_bytes)
    return True
def download_source(arxiv_id, folder):
    """Try to download and unpack the LaTeX source for *arxiv_id*.

    Best-effort: saves the main .tex file as <folder>/paper.tex and returns
    its text, or returns None when no usable source is available.
    """
    import tarfile
    import gzip
    from io import BytesIO
    url = f"https://arxiv.org/e-print/{arxiv_id}"
    try:
        resp = requests.get(url, timeout=60, allow_redirects=True)
        resp.raise_for_status()
        content = resp.content
        # Most e-print payloads are gzipped tarballs of the LaTeX project.
        try:
            with tarfile.open(fileobj=BytesIO(content), mode='r:gz') as tar:
                tex_files = [m.name for m in tar.getmembers() if m.name.endswith('.tex')]
                source_dir = os.path.join(folder, 'source')
                os.makedirs(source_dir, exist_ok=True)
                # NOTE(review): extractall on an untrusted archive can write
                # outside source_dir on older Pythons; consider
                # tar.extractall(path=source_dir, filter='data') once the
                # minimum supported Python allows it (3.12+ default).
                tar.extractall(path=source_dir)
                # Prefer a file named like "main.tex"; else take the first .tex.
                main_tex = next((n for n in tex_files if 'main' in n.lower()), None)
                if main_tex is None and tex_files:
                    main_tex = tex_files[0]
                if main_tex:
                    with open(os.path.join(source_dir, main_tex), 'r', errors='ignore') as f:
                        tex_content = f.read()
                    # Save a copy as paper.tex next to paper.pdf.
                    with open(os.path.join(folder, 'paper.tex'), 'w') as f:
                        f.write(tex_content)
                    return tex_content
        except tarfile.TarError:
            pass
        # Single-file submissions arrive as a bare gzipped .tex file.
        try:
            tex_content = gzip.decompress(content).decode('utf-8', errors='ignore')
            if '\\documentclass' in tex_content or '\\begin{document}' in tex_content:
                with open(os.path.join(folder, 'paper.tex'), 'w') as f:
                    f.write(tex_content)
                return tex_content
        except Exception:
            # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
            # still propagate. Any decompression failure means "no source".
            pass
    except Exception as e:
        print(f" Source download failed: {e}")
    return None
def tex_to_md(tex_content):
    """Basic LaTeX-to-Markdown conversion (headings, emphasis, lists, links).

    Best-effort regex rewriting; unrecognized LaTeX commands are left in place.
    """
    md = tex_content
    # Keep only the body: drop the preamble and everything after \end{document}.
    doc_match = re.search(r'\\begin\{document\}', md)
    if doc_match:
        md = md[doc_match.end():]
    md = re.sub(r'\\end\{document\}.*', '', md, flags=re.DOTALL)
    # Strip LaTeX comments. The lookbehind keeps escaped percent signs (\%),
    # which the previous pattern (r'%.*$') wrongly treated as comment starts.
    md = re.sub(r'(?<!\\)%.*$', '', md, flags=re.MULTILINE)
    # Headings
    md = re.sub(r'\\section\*?\{([^}]+)\}', r'# \1', md)
    md = re.sub(r'\\subsection\*?\{([^}]+)\}', r'## \1', md)
    md = re.sub(r'\\subsubsection\*?\{([^}]+)\}', r'### \1', md)
    # Inline formatting
    md = re.sub(r'\\textbf\{([^}]+)\}', r'**\1**', md)
    md = re.sub(r'\\textit\{([^}]+)\}', r'*\1*', md)
    md = re.sub(r'\\emph\{([^}]+)\}', r'*\1*', md)
    md = re.sub(r'\\texttt\{([^}]+)\}', r'`\1`', md)
    # Citations and links
    md = re.sub(r'\\cite\w*\{([^}]+)\}', r'[\1]', md)
    md = re.sub(r'\\url\{([^}]+)\}', r'\1', md)
    md = re.sub(r'\\href\{([^}]+)\}\{([^}]+)\}', r'[\2](\1)', md)
    # Lists: drop environment markers, turn \item into a bullet.
    md = re.sub(r'\\begin\{itemize\}', '', md)
    md = re.sub(r'\\end\{itemize\}', '', md)
    md = re.sub(r'\\begin\{enumerate\}', '', md)
    md = re.sub(r'\\end\{enumerate\}', '', md)
    md = re.sub(r'\\item\s*', '- ', md)
    # Collapse runs of blank lines.
    md = re.sub(r'\n{3,}', '\n\n', md)
    return md.strip()
def fetch_paper(arxiv_id):
    """Fetch a single arXiv paper: metadata, PDF, and generated paper.md.

    Accepts a bare id, an "arxiv:" prefix, or an abs/pdf URL.
    Returns the reference folder name, or None when the id is unknown.
    """
    # Normalize the id: drop URL/prefix forms, any trailing slash, and a
    # trailing ".pdf". (The original str.rstrip('.pdf') strips a *character
    # set*, not the suffix, and could eat legitimate trailing id characters.)
    arxiv_id = re.sub(r'^(arxiv:|https?://arxiv\.org/(abs|pdf)/)', '', arxiv_id)
    arxiv_id = arxiv_id.rstrip('/')
    if arxiv_id.endswith('.pdf'):
        arxiv_id = arxiv_id[:-len('.pdf')]
    print(f"\n{'='*60}")
    print(f"Fetching arXiv:{arxiv_id}")
    print(f"{'='*60}")
    # 1. Get metadata
    meta = fetch_arxiv_metadata(arxiv_id)
    if not meta:
        print(f" ERROR: Paper not found: {arxiv_id}")
        return None
    print(f" Title: {meta['title'][:70]}...")
    print(f" Authors: {', '.join(meta['authors'][:3])}")
    print(f" Year: {meta['year']}")
    # 2. Create folder named <year>.arxiv.<first-author-surname>
    author = normalize_name(meta['authors'][0]) if meta['authors'] else 'unknown'
    folder_name = f"{meta['year']}.arxiv.{author}"
    folder = os.path.join(REFS_DIR, folder_name)
    os.makedirs(folder, exist_ok=True)
    print(f" Folder: {folder_name}/")
    # 3. Build YAML front matter for paper.md
    authors_yaml = '\n'.join(f' - "{a}"' for a in meta['authors'])
    front_matter = f'''---
title: "{meta['title']}"
authors:
{authors_yaml}
year: {meta['year']}
venue: "arXiv"
url: "{meta['url']}"
arxiv: "{arxiv_id}"
---
'''
    # 4. Download PDF (skipped when it already exists)
    pdf_path = os.path.join(folder, 'paper.pdf')
    if not os.path.exists(pdf_path):
        print(f" Downloading PDF...")
        download_pdf(arxiv_id, pdf_path)
        print(f" Saved: paper.pdf")
    else:
        print(f" PDF already exists")
    # 5. Prefer LaTeX source for paper.md; fall back to metadata + abstract.
    md_path = os.path.join(folder, 'paper.md')
    if not os.path.exists(md_path):
        print(f" Downloading LaTeX source...")
        tex_content = download_source(arxiv_id, folder)
        if tex_content:
            md_text = tex_to_md(tex_content)
            with open(md_path, 'w') as f:
                f.write(front_matter + md_text)
            print(f" Generated: paper.md (from LaTeX)")
        else:
            # Fallback: write front matter + abstract
            with open(md_path, 'w') as f:
                f.write(front_matter)
                f.write(f"# {meta['title']}\n\n")
                f.write(f"## Abstract\n\n{meta['abstract']}\n\n")
                f.write(f"*Full text available in paper.pdf*\n")
            print(f" Generated: paper.md (metadata + abstract only)")
    else:
        print(f" paper.md already exists")
    # 6. Record the paper in paper_db.json
    update_paper_db(folder_name, meta)
    print(f" Done: {folder_name}/")
    return folder_name
def fetch_acl(acl_id):
    """Fetch a paper from the ACL Anthology: BibTeX metadata, PDF, paper.md.

    Accepts a bare anthology id or a full aclanthology.org URL.
    Returns the reference folder name.
    """
    # Normalize the id: drop the URL prefix, any trailing slash, and a
    # trailing ".pdf". (The original str.rstrip('.pdf') strips a *character
    # set*, not the suffix, and could eat legitimate trailing id characters.)
    acl_id = re.sub(r'^https?://aclanthology\.org/', '', acl_id)
    acl_id = acl_id.rstrip('/')
    if acl_id.endswith('.pdf'):
        acl_id = acl_id[:-len('.pdf')]
    print(f"\n{'='*60}")
    print(f"Fetching ACL:{acl_id}")
    print(f"{'='*60}")
    # Get BibTeX for metadata; fall back to placeholders on any failure
    # (network error or unparseable response) — this is best-effort.
    bib_url = f"https://aclanthology.org/{acl_id}.bib"
    try:
        resp = requests.get(bib_url, timeout=15)
        bib_text = resp.text
        title_match = re.search(r'title\s*=\s*["{]([^"}]+)', bib_text)
        title = title_match.group(1) if title_match else acl_id
        author_match = re.search(r'author\s*=\s*["{]([^"}]+)', bib_text)
        authors = []
        if author_match:
            authors = [a.strip() for a in author_match.group(1).split(' and ')]
        year_match = re.search(r'year\s*=\s*["{]?(\d{4})', bib_text)
        year = int(year_match.group(1)) if year_match else 2020
        venue_match = re.search(r'booktitle\s*=\s*["{]([^"}]+)', bib_text)
        venue = venue_match.group(1) if venue_match else "ACL"
    except Exception:
        # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
        # still propagate while keeping the best-effort defaults.
        title = acl_id
        authors = ["unknown"]
        year = 2020
        venue = "ACL"
    print(f" Title: {title[:70]}...")
    print(f" Authors: {', '.join(authors[:3])}")
    # Derive a short venue tag from the id: new-style "2020.acl-main.1"
    # carries the venue; old-style single-letter prefixes are mapped below.
    venue_short = "acl"
    m = re.match(r'(\d{4})\.([a-z\-]+)', acl_id)
    if m:
        venue_short = m.group(2).split('-')[0]
    else:
        prefix_map = {'P': 'acl', 'N': 'naacl', 'E': 'eacl', 'D': 'emnlp', 'C': 'coling', 'W': 'workshop'}
        m2 = re.match(r'([A-Z])(\d{2})', acl_id)
        if m2:
            venue_short = prefix_map.get(m2.group(1), 'acl')
    author_name = normalize_name(authors[0]) if authors else 'unknown'
    folder_name = f"{year}.{venue_short}.{author_name}"
    folder = os.path.join(REFS_DIR, folder_name)
    os.makedirs(folder, exist_ok=True)
    print(f" Folder: {folder_name}/")
    # Download PDF (skipped when it already exists)
    pdf_path = os.path.join(folder, 'paper.pdf')
    if not os.path.exists(pdf_path):
        pdf_url = f"https://aclanthology.org/{acl_id}.pdf"
        print(f" Downloading PDF...")
        resp = requests.get(pdf_url, timeout=60, allow_redirects=True)
        resp.raise_for_status()
        with open(pdf_path, 'wb') as f:
            f.write(resp.content)
        print(f" Saved: paper.pdf")
    # Create paper.md with YAML front matter (metadata only — no full text)
    md_path = os.path.join(folder, 'paper.md')
    if not os.path.exists(md_path):
        authors_yaml = '\n'.join(f' - "{a}"' for a in authors)
        front_matter = f'''---
title: "{title}"
authors:
{authors_yaml}
year: {year}
venue: "{venue}"
url: "https://aclanthology.org/{acl_id}"
---
'''
        with open(md_path, 'w') as f:
            f.write(front_matter)
            f.write(f"# {title}\n\n")
            f.write(f"*Full text available in paper.pdf*\n")
        print(f" Generated: paper.md")
    # Record the paper in paper_db.json
    meta = {
        'title': title,
        'authors': authors,
        'year': year,
        'abstract': '',
        'arxiv_id': None,
        'url': f'https://aclanthology.org/{acl_id}'
    }
    update_paper_db(folder_name, meta)
    print(f" Done: {folder_name}/")
    return folder_name
def update_paper_db(folder_name, meta):
    """Insert or overwrite the entry for *folder_name* in paper_db.json."""
    db_path = os.path.join(REFS_DIR, 'paper_db.json')
    # Load the existing database, or start a fresh one on first use.
    try:
        with open(db_path, 'r') as fh:
            db = json.load(fh)
    except FileNotFoundError:
        db = {"papers": {}, "s2_cache": {}}
    entry = {
        "id": folder_name,
        "title": meta['title'],
        "authors": meta['authors'],
        "year": meta['year'],
        # Only arXiv papers carry an arxiv_id; everything else came from ACL.
        "venue": "arXiv" if meta.get('arxiv_id') else "ACL",
        "url": meta['url'],
        "arxiv_id": meta.get('arxiv_id'),
        "s2_id": None,
        "doi": None,
        "citation_count": 0,
        "abstract": meta.get('abstract', ''),
        "tldr": "",
        "keywords": [],
        "references": [],
        "cited_by": [],
        "local_path": f"references/{folder_name}",
        "fetched": True,
    }
    db['papers'][folder_name] = entry
    with open(db_path, 'w') as fh:
        json.dump(db, fh, indent=2, ensure_ascii=False)
if __name__ == "__main__":
    # Require at least one paper id on the command line.
    if len(sys.argv) < 2:
        print("Usage: python3 fetch_papers.py <arxiv_id_or_acl_id> [...]")
        sys.exit(1)
    for raw_id in sys.argv[1:]:
        paper_id = raw_id.strip()
        if not paper_id:
            continue
        try:
            # Route to the right fetcher: arXiv ids look like "2103.00020"
            # (or carry an "arxiv:" prefix); ACL ids are "P19-1234" or
            # "2020.acl-..."; anything else is assumed to be arXiv.
            looks_arxiv = bool(re.match(r'^\d{4}\.\d{4,5}', paper_id)) or paper_id.startswith('arxiv:')
            looks_acl = bool(re.match(r'^[A-Z]\d{2}-\d+$', paper_id)) or bool(re.match(r'^\d{4}\.[a-z]', paper_id))
            if looks_arxiv:
                fetch_paper(paper_id)
            elif looks_acl:
                fetch_acl(paper_id)
            else:
                fetch_paper(paper_id)
            time.sleep(3)  # Rate limiting
        except Exception as e:
            print(f" ERROR fetching {paper_id}: {e}")