|
|
""" |
|
|
Fetch papers from arXiv using only stdlib + requests. |
|
|
Downloads PDF and generates paper.md with YAML front matter. |
|
|
""" |
|
|
import os |
|
|
import re |
|
|
import sys |
|
|
import json |
|
|
import time |
|
|
import unicodedata |
|
|
import xml.etree.ElementTree as ET |
|
|
import requests |
|
|
|
|
|
# Directory containing this script. Paper folders and paper_db.json are
# created directly inside it.
REFS_DIR = os.path.dirname(__file__)
|
|
|
|
|
|
|
|
def normalize_name(name):
    """Return the author's surname, lowercased and with accents removed.

    Accepts both "First Last" and BibTeX-style "Last, First" names. For the
    comma form the surname is taken from the text before the first comma;
    otherwise the last whitespace-separated token is used.

    Args:
        name: Author name as a string.

    Returns:
        The surname with combining marks stripped and lowercased. If the
        name contains no tokens at all, the input is processed as-is.
    """
    # "Last, First": everything before the first comma is the surname part.
    surname_part = name.split(',', 1)[0] if ',' in name else name
    tokens = surname_part.split()
    if not tokens:
        # Nothing usable before the comma (e.g. ", Jacob"): fall back to any
        # token in the whole string.
        tokens = name.replace(',', ' ').split()
    lastname = tokens[-1] if tokens else name

    # NFD splits accented letters into base letter + combining mark ("Mn"),
    # so dropping the marks leaves the plain base letters.
    decomposed = unicodedata.normalize('NFD', lastname)
    return ''.join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    ).lower()
|
|
|
|
|
|
|
|
def fetch_arxiv_metadata(arxiv_id):
    """Fetch metadata for one paper from the arXiv Atom API.

    Args:
        arxiv_id: arXiv identifier, interpolated into the ``id_list`` query
            parameter as-is.

    Returns:
        A dict with keys ``title``, ``authors`` (list of names), ``year``
        (int), ``abstract``, ``arxiv_id`` and ``url``; or ``None`` when the
        feed has no entry, or the entry has no title or no publication date
        starting with a four-digit year.

    Raises:
        requests.RequestException: On network failure or a non-2xx response.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}"
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    ns = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}
    # Parse the raw bytes so the XML parser applies the encoding declared in
    # the document instead of the encoding requests guessed for .text.
    root = ET.fromstring(resp.content)
    entry = root.find('atom:entry', ns)
    if entry is None:
        return None

    # findtext() returns the default for a missing element and '' for an
    # empty one, so none of the lookups below can raise AttributeError.
    raw_title = entry.findtext('atom:title', default='', namespaces=ns)
    title = re.sub(r'\s+', ' ', raw_title.strip())

    published = entry.findtext('atom:published', default='', namespaces=ns)
    year_match = re.match(r'\d{4}', published.strip())

    # NOTE(review): presumably the API's error entries are the ones lacking a
    # published date - confirm against the arXiv API manual.
    if not title or year_match is None:
        return None

    authors = []
    for author_node in entry.findall('atom:author', ns):
        name = author_node.findtext('atom:name', default='', namespaces=ns).strip()
        if name:
            authors.append(name)

    summary = entry.findtext('atom:summary', default='', namespaces=ns).strip()

    return {
        'title': title,
        'authors': authors,
        'year': int(year_match.group(0)),
        'abstract': summary,
        'arxiv_id': arxiv_id,
        'url': f'https://arxiv.org/abs/{arxiv_id}',
    }
|
|
|
|
|
|
|
|
def download_pdf(arxiv_id, dest_path):
    """Save the arXiv PDF for *arxiv_id* to *dest_path*.

    The response body is written to disk in binary mode. Always returns
    ``True``; a non-2xx response raises via ``raise_for_status()``.
    """
    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    response = requests.get(pdf_url, allow_redirects=True, timeout=60)
    response.raise_for_status()

    with open(dest_path, 'wb') as pdf_file:
        pdf_file.write(response.content)

    return True
|
|
|
|
|
|
|
|
def _safe_tar_members(tar, dest_dir):
    """Yield regular files and directories of *tar* that stay inside *dest_dir*."""
    root = os.path.realpath(dest_dir)
    for member in tar.getmembers():
        # Links and special files are skipped: they are not needed to read the
        # .tex sources and links can point outside the extraction directory.
        if not (member.isfile() or member.isdir()):
            continue
        target = os.path.realpath(os.path.join(root, member.name))
        # Reject absolute names and "../" escapes.
        if target != root and not target.startswith(root + os.sep):
            continue
        yield member


def download_source(arxiv_id, folder):
    """Try to download the LaTeX source of an arXiv paper.

    The e-print payload is first treated as a gzipped tarball: its safe
    members are extracted to ``<folder>/source`` and the main ``.tex`` file
    (the first whose name contains "main", else the first ``.tex`` member) is
    copied to ``<folder>/paper.tex``. If that fails, the payload is treated as
    a single gzipped ``.tex`` file.

    Args:
        arxiv_id: arXiv identifier used to build the e-print URL.
        folder: Existing directory that receives ``source/`` and ``paper.tex``.

    Returns:
        The LaTeX text written to ``paper.tex``, or ``None`` when no LaTeX
        source could be obtained. Download and I/O errors are reported on
        stdout rather than raised (best-effort).
    """
    import gzip
    import tarfile
    import zlib
    from io import BytesIO

    url = f"https://arxiv.org/e-print/{arxiv_id}"
    try:
        resp = requests.get(url, timeout=60, allow_redirects=True)
        resp.raise_for_status()
        content = resp.content

        # Attempt 1: gzipped tar archive containing one or more .tex files.
        try:
            with tarfile.open(fileobj=BytesIO(content), mode='r:gz') as tar:
                source_dir = os.path.join(folder, 'source')
                # The archive is untrusted input: extract only members that
                # resolve inside source_dir.
                members = list(_safe_tar_members(tar, source_dir))
                tex_files = [
                    m.name for m in members
                    if m.isfile() and m.name.endswith('.tex')
                ]
                os.makedirs(source_dir, exist_ok=True)

                extract_kwargs = {'path': source_dir, 'members': members}
                # Use the stdlib "data" extraction filter where this Python
                # version provides extraction filters.
                if hasattr(tarfile, 'data_filter'):
                    extract_kwargs['filter'] = 'data'
                tar.extractall(**extract_kwargs)

                main_tex = next(
                    (name for name in tex_files if 'main' in name.lower()),
                    None,
                )
                if main_tex is None and tex_files:
                    main_tex = tex_files[0]

                if main_tex:
                    main_path = os.path.join(source_dir, main_tex)
                    with open(main_path, 'r', encoding='utf-8', errors='ignore') as f:
                        tex_content = f.read()

                    with open(os.path.join(folder, 'paper.tex'), 'w', encoding='utf-8') as f:
                        f.write(tex_content)
                    return tex_content
        except tarfile.TarError:
            # Not a tarball (or a rejected member): try the single-file form.
            pass

        # Attempt 2: a single gzipped .tex file.
        try:
            tex_content = gzip.decompress(content).decode('utf-8', errors='ignore')
        except (OSError, EOFError, zlib.error):
            # Payload is not gzip data, so there is no LaTeX to recover.
            tex_content = None

        if tex_content and (
            '\\documentclass' in tex_content or '\\begin{document}' in tex_content
        ):
            with open(os.path.join(folder, 'paper.tex'), 'w', encoding='utf-8') as f:
                f.write(tex_content)
            return tex_content

    except Exception as e:
        # Best-effort: the caller falls back to metadata + abstract.
        print(f" Source download failed: {e}")

    return None
|
|
|
|
|
|
|
|
# (pattern, replacement) pairs applied in order by tex_to_md().
_TEX_TO_MD_RULES = (
    (r'\\section\*?\{([^}]+)\}', r'# \1'),
    (r'\\subsection\*?\{([^}]+)\}', r'## \1'),
    (r'\\subsubsection\*?\{([^}]+)\}', r'### \1'),
    (r'\\textbf\{([^}]+)\}', r'**\1**'),
    (r'\\textit\{([^}]+)\}', r'*\1*'),
    (r'\\emph\{([^}]+)\}', r'*\1*'),
    (r'\\texttt\{([^}]+)\}', r'`\1`'),
    (r'\\cite\w*\{([^}]+)\}', r'[\1]'),
    (r'\\url\{([^}]+)\}', r'\1'),
    (r'\\href\{([^}]+)\}\{([^}]+)\}', r'[\2](\1)'),
    (r'\\begin\{itemize\}', ''),
    (r'\\end\{itemize\}', ''),
    (r'\\begin\{enumerate\}', ''),
    (r'\\end\{enumerate\}', ''),
    (r'\\item\s*', '- '),
)


def tex_to_md(tex_content):
    """Convert LaTeX source to Markdown with a small set of regex rewrites.

    Keeps only the text between ``\\begin{document}`` and ``\\end{document}``
    (when present), removes ``%`` comments, maps sectioning commands to
    Markdown headings, maps bold/italic/monospace/cite/url/href commands to
    their Markdown forms, turns ``\\item`` into ``- `` and collapses runs of
    three or more newlines. Any other LaTeX is left untouched.

    Args:
        tex_content: LaTeX source as a string.

    Returns:
        The converted text with leading and trailing whitespace stripped.
    """
    md = tex_content

    doc_match = re.search(r'\\begin\{document\}', md)
    if doc_match:
        md = md[doc_match.end():]
    md = re.sub(r'\\end\{document\}.*', '', md, flags=re.DOTALL)

    # Strip comments, but only when the % is unescaped: "\%" is a literal
    # percent sign and must not truncate the line. An even run of backslashes
    # before % (e.g. the line break "\\") does not escape it, so that run is
    # kept and the comment after it is removed.
    md = re.sub(r'(?<!\\)((?:\\\\)*)%.*$', r'\1', md, flags=re.MULTILINE)

    for pattern, replacement in _TEX_TO_MD_RULES:
        md = re.sub(pattern, replacement, md)

    md = re.sub(r'\n{3,}', '\n\n', md)
    return md.strip()
|
|
|
|
|
|
|
|
def fetch_paper(arxiv_id):
    """Fetch a single paper from arXiv into its own folder under REFS_DIR.

    Steps: normalise the identifier, fetch metadata, create the folder
    ``<year>.arxiv.<surname>``, download ``paper.pdf`` if missing, generate
    ``paper.md`` if missing (from the LaTeX source when available, otherwise
    metadata + abstract), and record the paper in the paper database.

    Args:
        arxiv_id: A bare arXiv ID, an ``arxiv:<id>`` string, or an
            ``arxiv.org/abs/`` or ``arxiv.org/pdf/`` URL.

    Returns:
        The folder name, or ``None`` when no metadata was found for the ID.

    Raises:
        requests.RequestException: If the metadata or PDF request fails.
    """
    arxiv_id = re.sub(
        r'^(arxiv:|https?://arxiv\.org/(abs|pdf)/)', '', arxiv_id,
        flags=re.IGNORECASE,
    )
    # str.rstrip('.pdf') strips any trailing '.', 'p', 'd', 'f' characters,
    # not the suffix, so remove the extension explicitly. The slash goes
    # first so that "<id>.pdf/" is handled as well.
    arxiv_id = arxiv_id.rstrip('/')
    if arxiv_id.endswith('.pdf'):
        arxiv_id = arxiv_id[:-len('.pdf')]

    print(f"\n{'='*60}")
    print(f"Fetching arXiv:{arxiv_id}")
    print(f"{'='*60}")

    meta = fetch_arxiv_metadata(arxiv_id)
    if not meta:
        print(f" ERROR: Paper not found: {arxiv_id}")
        return None

    print(f" Title: {meta['title'][:70]}...")
    print(f" Authors: {', '.join(meta['authors'][:3])}")
    print(f" Year: {meta['year']}")

    # NOTE: two papers with the same first-author surname and year map to the
    # same folder, and the existing files are then kept as they are.
    author = normalize_name(meta['authors'][0]) if meta['authors'] else 'unknown'
    folder_name = f"{meta['year']}.arxiv.{author}"
    folder = os.path.join(REFS_DIR, folder_name)
    os.makedirs(folder, exist_ok=True)
    print(f" Folder: {folder_name}/")

    def yaml_str(value):
        # A JSON string literal is also a valid YAML double-quoted scalar, so
        # quotes and backslashes in titles/names cannot break the front matter.
        return json.dumps(value, ensure_ascii=False)

    authors_yaml = '\n'.join(f' - {yaml_str(a)}' for a in meta['authors'])
    front_matter = (
        '---\n'
        f'title: {yaml_str(meta["title"])}\n'
        'authors:\n'
        f'{authors_yaml}\n'
        f'year: {meta["year"]}\n'
        'venue: "arXiv"\n'
        f'url: {yaml_str(meta["url"])}\n'
        f'arxiv: {yaml_str(arxiv_id)}\n'
        '---\n'
        '\n'
    )

    pdf_path = os.path.join(folder, 'paper.pdf')
    if not os.path.exists(pdf_path):
        print(f" Downloading PDF...")
        download_pdf(arxiv_id, pdf_path)
        print(f" Saved: paper.pdf")
    else:
        print(f" PDF already exists")

    md_path = os.path.join(folder, 'paper.md')
    if not os.path.exists(md_path):
        print(f" Downloading LaTeX source...")
        tex_content = download_source(arxiv_id, folder)

        if tex_content:
            md_text = tex_to_md(tex_content)
            # Explicit UTF-8: titles and bodies routinely contain non-ASCII text.
            with open(md_path, 'w', encoding='utf-8') as f:
                f.write(front_matter + md_text)
            print(f" Generated: paper.md (from LaTeX)")
        else:
            # No LaTeX source available: fall back to title + abstract.
            with open(md_path, 'w', encoding='utf-8') as f:
                f.write(front_matter)
                f.write(f"# {meta['title']}\n\n")
                f.write(f"## Abstract\n\n{meta['abstract']}\n\n")
                f.write(f"*Full text available in paper.pdf*\n")
            print(f" Generated: paper.md (metadata + abstract only)")
    else:
        print(f" paper.md already exists")

    update_paper_db(folder_name, meta)

    print(f" Done: {folder_name}/")
    return folder_name
|
|
|
|
|
|
|
|
def _bib_field(bib_text, field):
    """Return the cleaned value of a BibTeX *field*, or None if it is absent."""
    # The look-behind keeps "title" from matching inside "booktitle". The
    # value is either "quoted" or {braced} with one level of nested braces.
    pattern = (
        r'(?<![A-Za-z])' + re.escape(field)
        + r'\s*=\s*(?:"([^"]*)"|\{((?:[^{}]|\{[^{}]*\})*)\})'
    )
    match = re.search(pattern, bib_text)
    if not match:
        return None
    raw = match.group(1) if match.group(1) is not None else match.group(2)
    # Drop case-protection braces and fold multi-line values onto one line.
    return re.sub(r'\s+', ' ', raw.replace('{', '').replace('}', '')).strip()


def _parse_acl_bib(bib_text, acl_id):
    """Extract (title, authors, year, venue) from a BibTeX record."""
    title = _bib_field(bib_text, 'title') or acl_id

    authors = []
    for person in re.split(r'\s+and\s+', _bib_field(bib_text, 'author') or ''):
        person = person.strip()
        if not person:
            continue
        if ',' in person:
            # "Last, First" -> "First Last", the form the arXiv path stores.
            last, first = person.split(',', 1)
            person = f"{first.strip()} {last.strip()}".strip()
        authors.append(person)

    year_match = re.search(r'(?<![A-Za-z])year\s*=\s*["{]?(\d{4})', bib_text)
    year = int(year_match.group(1)) if year_match else 2020

    venue = _bib_field(bib_text, 'booktitle') or "ACL"
    return title, authors, year, venue


def fetch_acl(acl_id):
    """Fetch a paper from the ACL Anthology into its own folder under REFS_DIR.

    Metadata comes from the paper's ``.bib`` record. If that request fails,
    placeholder metadata is used (ID as title, author "unknown", year 2020,
    venue "ACL") and the PDF download is still attempted. ``paper.pdf`` and
    ``paper.md`` are only written when missing.

    Args:
        acl_id: An Anthology ID (e.g. ``P19-1001`` or ``2020.acl-main.1``) or
            an ``aclanthology.org`` URL, optionally ending in ``.pdf``.

    Returns:
        The folder name ``<year>.<venue>.<surname>``.

    Raises:
        requests.RequestException: If the PDF download fails.
    """
    acl_id = re.sub(r'^https?://aclanthology\.org/', '', acl_id)
    # str.rstrip('.pdf') strips a character set, not the suffix; remove the
    # extension explicitly instead.
    acl_id = acl_id.rstrip('/')
    if acl_id.endswith('.pdf'):
        acl_id = acl_id[:-len('.pdf')]

    print(f"\n{'='*60}")
    print(f"Fetching ACL:{acl_id}")
    print(f"{'='*60}")

    bib_url = f"https://aclanthology.org/{acl_id}.bib"
    try:
        resp = requests.get(bib_url, timeout=15)
        # Without this, an error page would be parsed as if it were BibTeX.
        resp.raise_for_status()
        title, authors, year, venue = _parse_acl_bib(resp.text, acl_id)
    except requests.RequestException as e:
        # Best-effort: continue with placeholders so the PDF is still fetched.
        print(f" Metadata fetch failed: {e}")
        title = acl_id
        authors = ["unknown"]
        year = 2020
        venue = "ACL"

    print(f" Title: {title[:70]}...")
    print(f" Authors: {', '.join(authors[:3])}")

    # Short venue tag for the folder name: taken from new-style IDs
    # ("2020.acl-main.1" -> "acl") or from the letter prefix of old-style IDs.
    venue_short = "acl"
    m = re.match(r'(\d{4})\.([a-z\-]+)', acl_id)
    if m:
        venue_short = m.group(2).split('-')[0]
    else:
        prefix_map = {'P': 'acl', 'N': 'naacl', 'E': 'eacl', 'D': 'emnlp', 'C': 'coling', 'W': 'workshop'}
        m2 = re.match(r'([A-Z])(\d{2})', acl_id)
        if m2:
            venue_short = prefix_map.get(m2.group(1), 'acl')

    author_name = normalize_name(authors[0]) if authors else 'unknown'
    folder_name = f"{year}.{venue_short}.{author_name}"
    folder = os.path.join(REFS_DIR, folder_name)
    os.makedirs(folder, exist_ok=True)
    print(f" Folder: {folder_name}/")

    pdf_path = os.path.join(folder, 'paper.pdf')
    if not os.path.exists(pdf_path):
        pdf_url = f"https://aclanthology.org/{acl_id}.pdf"
        print(f" Downloading PDF...")
        resp = requests.get(pdf_url, timeout=60, allow_redirects=True)
        resp.raise_for_status()
        with open(pdf_path, 'wb') as f:
            f.write(resp.content)
        print(f" Saved: paper.pdf")

    md_path = os.path.join(folder, 'paper.md')
    if not os.path.exists(md_path):
        def yaml_str(value):
            # A JSON string literal is also a valid YAML double-quoted scalar,
            # so quotes/backslashes in the metadata cannot break the YAML.
            return json.dumps(value, ensure_ascii=False)

        authors_yaml = '\n'.join(f' - {yaml_str(a)}' for a in authors)
        front_matter = (
            '---\n'
            f'title: {yaml_str(title)}\n'
            'authors:\n'
            f'{authors_yaml}\n'
            f'year: {year}\n'
            f'venue: {yaml_str(venue)}\n'
            f'url: {yaml_str("https://aclanthology.org/" + acl_id)}\n'
            '---\n'
            '\n'
        )
        with open(md_path, 'w', encoding='utf-8') as f:
            f.write(front_matter)
            f.write(f"# {title}\n\n")
            f.write(f"*Full text available in paper.pdf*\n")
        print(f" Generated: paper.md")

    meta = {
        'title': title,
        'authors': authors,
        'year': year,
        'abstract': '',
        'arxiv_id': None,
        'url': f'https://aclanthology.org/{acl_id}',
        'venue': venue,
    }
    update_paper_db(folder_name, meta)

    print(f" Done: {folder_name}/")
    return folder_name
|
|
|
|
|
|
|
|
def update_paper_db(folder_name, meta):
    """Insert or refresh a paper entry in ``paper_db.json`` under REFS_DIR.

    Core metadata (title, authors, year, venue, url, arxiv_id, abstract) is
    always taken from *meta*. If an entry for *folder_name* already exists,
    its ``s2_id``, ``doi``, ``citation_count``, ``tldr``, ``keywords``,
    ``references`` and ``cited_by`` values are kept, so re-fetching a paper
    does not reset them to their empty defaults.

    Args:
        folder_name: Paper folder name; used as the entry key and ``id``.
        meta: Dict with ``title``, ``authors``, ``year`` and ``url``;
            ``arxiv_id``, ``abstract`` and ``venue`` are optional.

    Raises:
        json.JSONDecodeError: If the existing database file is not valid JSON.
        OSError: If the database cannot be read or written.
    """
    db_path = os.path.join(REFS_DIR, 'paper_db.json')

    if os.path.exists(db_path):
        with open(db_path, 'r', encoding='utf-8') as f:
            db = json.load(f)
    else:
        db = {"papers": {}, "s2_cache": {}}

    # Tolerate a database file that has no "papers" section yet.
    papers = db.setdefault('papers', {})

    default_venue = "arXiv" if meta.get('arxiv_id') else "ACL"
    entry = {
        "id": folder_name,
        "title": meta['title'],
        "authors": meta['authors'],
        "year": meta['year'],
        "venue": meta.get('venue') or default_venue,
        "url": meta['url'],
        "arxiv_id": meta.get('arxiv_id'),
        "s2_id": None,
        "doi": None,
        "citation_count": 0,
        "abstract": meta.get('abstract', ''),
        "tldr": "",
        "keywords": [],
        "references": [],
        "cited_by": [],
        "local_path": f"references/{folder_name}",
        "fetched": True,
    }

    # Carry over values this function only ever initialises to empty defaults.
    previous = papers.get(folder_name)
    if isinstance(previous, dict):
        for key in ("s2_id", "doi", "citation_count", "tldr",
                    "keywords", "references", "cited_by"):
            if key in previous:
                entry[key] = previous[key]

    papers[folder_name] = entry

    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be pinned. Write to a temp file and swap it in so an interrupted
    # run cannot leave a truncated database behind.
    tmp_path = db_path + '.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(db, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, db_path)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # CLI entry point: each argument is an arXiv or ACL Anthology ID/URL.
    if len(sys.argv) < 2:
        print("Usage: python3 fetch_papers.py <arxiv_id_or_acl_id> [...]")
        sys.exit(1)

    paper_ids = [arg.strip() for arg in sys.argv[1:] if arg.strip()]

    for position, paper_id in enumerate(paper_ids):
        # Pause between consecutive papers, including after a failed one, but
        # not after the last.
        if position:
            time.sleep(3)

        try:
            looks_like_arxiv = (
                re.match(r'^\d{4}\.\d{4,5}', paper_id)
                or paper_id.lower().startswith('arxiv:')
            )
            # fetch_acl() accepts aclanthology.org URLs, so route them to it
            # instead of letting them fall through to the arXiv default.
            looks_like_acl = (
                'aclanthology.org' in paper_id.lower()
                or re.match(r'^[A-Z]\d{2}-\d+$', paper_id)
                or re.match(r'^\d{4}\.[a-z]', paper_id)
            )

            if looks_like_arxiv:
                fetch_paper(paper_id)
            elif looks_like_acl:
                fetch_acl(paper_id)
            else:
                # Anything else (e.g. arxiv.org URLs, old-style IDs) is tried
                # as an arXiv paper.
                fetch_paper(paper_id)
        except Exception as e:
            # Top-level boundary: report and continue with the next paper.
            print(f" ERROR fetching {paper_id}: {e}")
|
|
|