"""
Fetch papers from arXiv (and the ACL Anthology) using only stdlib + requests.

Downloads the PDF and generates paper.md with YAML front matter, then records
the paper in paper_db.json next to this script.
"""

import gzip
import json
import os
import re
import sys
import tarfile
import time
import unicodedata
import xml.etree.ElementTree as ET
import zlib
from io import BytesIO
from urllib.parse import quote

try:
    import requests
except ImportError:
    # Keep the offline helpers (normalize_name, tex_to_md, update_paper_db)
    # importable without the HTTP client; _http_get raises on first use.
    requests = None

# Directory that holds the per-paper folders and paper_db.json.
REFS_DIR = os.path.dirname(os.path.abspath(__file__))

_ATOM_NS = {
    'atom': 'http://www.w3.org/2005/Atom',
    'arxiv': 'http://arxiv.org/schemas/atom',
}

# (pattern, replacement) pairs applied in order by tex_to_md.
_TEX_RULES = [
    (r'\\section\*?\{([^}]+)\}', r'# \1'),
    (r'\\subsection\*?\{([^}]+)\}', r'## \1'),
    (r'\\subsubsection\*?\{([^}]+)\}', r'### \1'),
    (r'\\textbf\{([^}]+)\}', r'**\1**'),
    (r'\\textit\{([^}]+)\}', r'*\1*'),
    (r'\\emph\{([^}]+)\}', r'*\1*'),
    (r'\\texttt\{([^}]+)\}', r'`\1`'),
    (r'\\cite\w*\{([^}]+)\}', r'[\1]'),
    (r'\\url\{([^}]+)\}', r'\1'),
    (r'\\href\{([^}]+)\}\{([^}]+)\}', r'[\2](\1)'),
    (r'\\begin\{itemize\}', ''),
    (r'\\end\{itemize\}', ''),
    (r'\\begin\{enumerate\}', ''),
    (r'\\end\{enumerate\}', ''),
    (r'\\item\s*', '- '),
]

# Old-style ACL Anthology ID prefixes mapped to a short venue name.
_ACL_PREFIX_VENUES = {
    'P': 'acl',
    'N': 'naacl',
    'E': 'eacl',
    'D': 'emnlp',
    'C': 'coling',
    'W': 'workshop',
}


def _http_get(url, timeout):
    """GET *url* following redirects; raise on HTTP errors or missing requests."""
    if requests is None:
        raise ImportError("the 'requests' package is required for network access")
    resp = requests.get(url, timeout=timeout, allow_redirects=True)
    resp.raise_for_status()
    return resp


def _strip_suffix(text, suffix):
    """Remove *suffix* from the end of *text* if present (unlike str.rstrip)."""
    if suffix and text.endswith(suffix):
        return text[:-len(suffix)]
    return text


def _folder_slug(text):
    """Make *text* safe to use as a single path component."""
    slug = re.sub(r'[\\/\x00]', '', text)
    if slug in ('', '.', '..'):
        return 'unknown'
    return slug


def _yaml_str(value):
    """Quote *value* as a YAML double-quoted scalar (JSON string syntax)."""
    return json.dumps(str(value), ensure_ascii=False)


def _build_front_matter(title, authors, year, venue, url, arxiv_id=None):
    """Return the YAML front matter block (with trailing blank line) for paper.md."""
    lines = ['---', f'title: {_yaml_str(title)}']
    if authors:
        lines.append('authors:')
        lines.extend(f' - {_yaml_str(a)}' for a in authors)
    else:
        lines.append('authors: []')
    lines.append(f'year: {year}')
    lines.append(f'venue: {_yaml_str(venue)}')
    lines.append(f'url: {_yaml_str(url)}')
    if arxiv_id is not None:
        lines.append(f'arxiv: {_yaml_str(arxiv_id)}')
    lines.append('---')
    return '\n'.join(lines) + '\n\n'


def normalize_name(name):
    """Return the accent-free, lower-cased family name of an author.

    Handles both "First Last" and "Last, First" (BibTeX style): the family
    name is the last token before the first comma.
    """
    tokens = name.split(',', 1)[0].split() or name.strip().split()
    lastname = tokens[-1] if tokens else name
    decomposed = unicodedata.normalize('NFD', lastname)
    # Drop combining marks (category Mn) left behind by the decomposition.
    return ''.join(
        c for c in decomposed if unicodedata.category(c) != 'Mn'
    ).lower()


def _parse_arxiv_feed(xml_data, arxiv_id):
    """Extract the metadata dict from an arXiv Atom feed, or None if incomplete."""
    root = ET.fromstring(xml_data)
    entry = root.find('atom:entry', _ATOM_NS)
    if entry is None:
        return None

    raw_title = entry.findtext('atom:title', default='', namespaces=_ATOM_NS)
    title = re.sub(r'\s+', ' ', raw_title or '').strip()
    published = entry.findtext('atom:published', default='', namespaces=_ATOM_NS)
    year_match = re.match(r'\d{4}', (published or '').strip())
    # An entry without a title or publication date is not usable as a paper.
    if not title or not year_match:
        return None

    authors = []
    for author in entry.findall('atom:author', _ATOM_NS):
        name = author.findtext('atom:name', default='', namespaces=_ATOM_NS)
        name = (name or '').strip()
        if name:
            authors.append(name)

    summary = entry.findtext('atom:summary', default='', namespaces=_ATOM_NS)
    return {
        'title': title,
        'authors': authors,
        'year': int(year_match.group(0)),
        'abstract': (summary or '').strip(),
        'arxiv_id': arxiv_id,
        'url': f'https://arxiv.org/abs/{arxiv_id}',
    }


def fetch_arxiv_metadata(arxiv_id):
    """Fetch metadata from arXiv API.

    Returns a dict with title, authors, year, abstract, arxiv_id and url, or
    None when the feed has no usable entry. Raises on HTTP/network errors.
    """
    url = (
        "https://export.arxiv.org/api/query?id_list="
        + quote(arxiv_id, safe='/')
    )
    resp = _http_get(url, timeout=30)
    # Parse bytes so the XML declaration, not a guess, decides the encoding.
    return _parse_arxiv_feed(resp.content, arxiv_id)


def download_pdf(arxiv_id, dest_path):
    """Download PDF from arXiv to *dest_path*; return True, raise on failure."""
    resp = _http_get(f"https://arxiv.org/pdf/{arxiv_id}.pdf", timeout=60)
    with open(dest_path, 'wb') as f:
        f.write(resp.content)
    return True


def _safe_tar_members(tar, dest_dir):
    """Yield regular files/dirs of *tar* whose paths stay inside *dest_dir*."""
    root = os.path.realpath(dest_dir)
    for member in tar.getmembers():
        # Links and special files could point outside the destination.
        if not (member.isfile() or member.isdir()):
            continue
        target = os.path.realpath(os.path.join(root, member.name))
        if target != root and not target.startswith(root + os.sep):
            continue
        yield member


def _tex_from_tarball(content, source_dir):
    """Extract a .tar.gz into *source_dir*; return the main .tex text or None."""
    try:
        with tarfile.open(fileobj=BytesIO(content), mode='r:gz') as tar:
            members = list(_safe_tar_members(tar, source_dir))
            os.makedirs(source_dir, exist_ok=True)
            # Use the stdlib 'data' extraction filter where it exists.
            extra = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}
            tar.extractall(path=source_dir, members=members, **extra)
    except tarfile.TarError:
        return None

    tex_files = [
        m.name for m in members if m.isfile() and m.name.endswith('.tex')
    ]
    # Stable sort: names containing 'main' first, archive order otherwise.
    ordered = sorted(tex_files, key=lambda n: 'main' not in n.lower())
    fallback = None
    for name in ordered:
        path = os.path.join(source_dir, name)
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()
        if fallback is None:
            fallback = text
        # Prefer the file that declares the document over an included fragment.
        if '\\documentclass' in text:
            return text
    return fallback


def _tex_from_gzip(content):
    """Return the text of a single gzipped .tex file, or None if it is not one."""
    try:
        text = gzip.decompress(content).decode('utf-8', errors='ignore')
    except (OSError, EOFError, zlib.error):
        return None
    if '\\documentclass' in text or '\\begin{document}' in text:
        return text
    return None


def download_source(arxiv_id, folder):
    """Try to download LaTeX source from arXiv.

    Best effort: on success the main .tex text is written to
    <folder>/paper.tex and returned; on any failure a message is printed and
    None is returned.
    """
    url = f"https://arxiv.org/e-print/{arxiv_id}"
    try:
        content = _http_get(url, timeout=60).content
        tex_content = _tex_from_tarball(content, os.path.join(folder, 'source'))
        if tex_content is None:
            tex_content = _tex_from_gzip(content)
        if tex_content is None:
            return None
        paper_tex = os.path.join(folder, 'paper.tex')
        with open(paper_tex, 'w', encoding='utf-8') as f:
            f.write(tex_content)
        return tex_content
    except Exception as e:
        print(f" Source download failed: {e}")
        return None


def tex_to_md(tex_content):
    """Basic LaTeX to Markdown conversion.

    Keeps only the document body, strips comments, and rewrites a handful of
    common commands (sections, emphasis, citations, links, list items).
    """
    md = tex_content
    doc_match = re.search(r'\\begin\{document\}', md)
    if doc_match:
        md = md[doc_match.end():]
    md = re.sub(r'\\end\{document\}.*', '', md, flags=re.DOTALL)
    # Strip comments, but not an escaped percent sign (\%): a '%' starts a
    # comment only when preceded by an even number of backslashes.
    md = re.sub(r'(?<!\\)((?:\\\\)*)%.*$', r'\1', md, flags=re.MULTILINE)
    for pattern, replacement in _TEX_RULES:
        md = re.sub(pattern, replacement, md)
    md = re.sub(r'\n{3,}', '\n\n', md)
    return md.strip()


def _clean_arxiv_id(raw):
    """Reduce an arXiv URL / 'arxiv:' reference to the bare identifier."""
    arxiv_id = re.sub(
        r'^(arxiv:|https?://arxiv\.org/(abs|pdf)/)', '', raw.strip(),
        flags=re.IGNORECASE,
    )
    return _strip_suffix(arxiv_id.rstrip('/'), '.pdf')


def fetch_paper(arxiv_id):
    """Fetch a single paper from arXiv.

    Creates <year>.arxiv.<author>/ under REFS_DIR with paper.pdf and paper.md
    and records the paper in paper_db.json. Returns the folder name, or None
    when the paper is not found.
    """
    arxiv_id = _clean_arxiv_id(arxiv_id)
    print(f"\n{'='*60}")
    print(f"Fetching arXiv:{arxiv_id}")
    print(f"{'='*60}")

    # 1. Get metadata
    meta = fetch_arxiv_metadata(arxiv_id)
    if not meta:
        print(f" ERROR: Paper not found: {arxiv_id}")
        return None
    print(f" Title: {meta['title'][:70]}...")
    print(f" Authors: {', '.join(meta['authors'][:3])}")
    print(f" Year: {meta['year']}")

    # 2. Create folder
    # NOTE: two papers with the same year and first author share a folder.
    if meta['authors']:
        author = _folder_slug(normalize_name(meta['authors'][0]))
    else:
        author = 'unknown'
    folder_name = f"{meta['year']}.arxiv.{author}"
    folder = os.path.join(REFS_DIR, folder_name)
    os.makedirs(folder, exist_ok=True)
    print(f" Folder: {folder_name}/")

    # 3. Build front matter
    front_matter = _build_front_matter(
        meta['title'], meta['authors'], meta['year'], 'arXiv', meta['url'],
        arxiv_id=arxiv_id,
    )

    # 4. Download PDF
    pdf_path = os.path.join(folder, 'paper.pdf')
    if not os.path.exists(pdf_path):
        print(" Downloading PDF...")
        download_pdf(arxiv_id, pdf_path)
        print(" Saved: paper.pdf")
    else:
        print(" PDF already exists")

    # 5. Try to get LaTeX source
    md_path = os.path.join(folder, 'paper.md')
    if not os.path.exists(md_path):
        print(" Downloading LaTeX source...")
        tex_content = download_source(arxiv_id, folder)
        if tex_content:
            with open(md_path, 'w', encoding='utf-8') as f:
                f.write(front_matter + tex_to_md(tex_content))
            print(" Generated: paper.md (from LaTeX)")
        else:
            # Fallback: write front matter + abstract
            with open(md_path, 'w', encoding='utf-8') as f:
                f.write(front_matter)
                f.write(f"# {meta['title']}\n\n")
                f.write(f"## Abstract\n\n{meta['abstract']}\n\n")
                f.write("*Full text available in paper.pdf*\n")
            print(" Generated: paper.md (metadata + abstract only)")
    else:
        print(" paper.md already exists")

    # 6. Update paper_db.json
    update_paper_db(folder_name, meta)
    print(f" Done: {folder_name}/")
    return folder_name


def _bib_field(bib_text, field):
    """Return the value of a BibTeX field with braces removed, or None.

    Accepts "quoted", {braced} and bare values, and tolerates nested braces
    such as "{BERT}: Pre-training ...".
    """
    match = re.search(
        r'(?<!\w)' + re.escape(field) + r'\s*=\s*', bib_text,
        flags=re.IGNORECASE,
    )
    if not match or match.end() >= len(bib_text):
        return None
    pos = match.end()
    opener = bib_text[pos]
    if opener not in '"{':
        bare = re.match(r'[^,\s}]+', bib_text[pos:])
        return bare.group(0) if bare else None

    depth = 0
    chars = []
    for ch in bib_text[pos + 1:]:
        if ch == '{':
            depth += 1
            continue
        if ch == '}':
            if depth == 0:
                if opener == '{':
                    break  # closing brace of the value itself
                continue
            depth -= 1
            continue
        if ch == '"' and opener == '"' and depth == 0:
            break
        chars.append(ch)
    value = re.sub(r'\s+', ' ', ''.join(chars)).strip()
    return value or None


def _bib_authors(raw):
    """Split a BibTeX author field into a list of "First Last" names."""
    authors = []
    for person in re.split(r'\s+and\s+', raw or ''):
        person = person.strip()
        if not person:
            continue
        if ',' in person:
            last, _, first = person.partition(',')
            person = f"{first.strip()} {last.strip()}".strip()
        authors.append(person)
    return authors


def _acl_year_from_id(acl_id):
    """Derive the publication year from an ACL Anthology ID, or None."""
    new_style = re.match(r'(\d{4})\.', acl_id)
    if new_style:
        return int(new_style.group(1))
    old_style = re.match(r'[A-Z](\d{2})-', acl_id)
    if old_style:
        yy = int(old_style.group(1))
        # Two-digit years: 60-99 are read as 19xx, 00-59 as 20xx.
        return 1900 + yy if yy >= 60 else 2000 + yy
    return None


def _acl_venue_short(acl_id):
    """Return the short venue name used in the folder name for *acl_id*."""
    new_style = re.match(r'(\d{4})\.([a-z\-]+)', acl_id)
    if new_style:
        return new_style.group(2).split('-')[0] or 'acl'
    old_style = re.match(r'([A-Z])(\d{2})', acl_id)
    if old_style:
        return _ACL_PREFIX_VENUES.get(old_style.group(1), 'acl')
    return 'acl'


def fetch_acl(acl_id):
    """Fetch a paper from ACL Anthology.

    Metadata comes from the paper's BibTeX record (best effort, with
    placeholder values if that lookup fails). Creates
    <year>.<venue>.<author>/ under REFS_DIR with paper.pdf and paper.md and
    records the paper in paper_db.json. Returns the folder name.
    """
    acl_id = re.sub(r'^https?://aclanthology\.org/', '', acl_id.strip())
    acl_id = _strip_suffix(acl_id.rstrip('/'), '.pdf')
    print(f"\n{'='*60}")
    print(f"Fetching ACL:{acl_id}")
    print(f"{'='*60}")

    # Get BibTeX for metadata
    id_year = _acl_year_from_id(acl_id)
    try:
        bib_url = f"https://aclanthology.org/{acl_id}.bib"
        bib_text = _http_get(bib_url, timeout=15).text
        title = _bib_field(bib_text, 'title') or acl_id
        authors = _bib_authors(_bib_field(bib_text, 'author'))
        year_match = re.match(r'\d{4}', _bib_field(bib_text, 'year') or '')
        year = int(year_match.group(0)) if year_match else (id_year or 2020)
        venue = _bib_field(bib_text, 'booktitle') or "ACL"
    except Exception as e:
        # Best effort: continue with placeholder metadata.
        print(f" BibTeX lookup failed: {e}")
        title = acl_id
        authors = ["unknown"]
        year = id_year or 2020
        venue = "ACL"

    print(f" Title: {title[:70]}...")
    print(f" Authors: {', '.join(authors[:3])}")

    # Parse venue from ID
    venue_short = _acl_venue_short(acl_id)
    if authors:
        author_name = _folder_slug(normalize_name(authors[0]))
    else:
        author_name = 'unknown'
    folder_name = f"{year}.{venue_short}.{author_name}"
    folder = os.path.join(REFS_DIR, folder_name)
    os.makedirs(folder, exist_ok=True)
    print(f" Folder: {folder_name}/")

    # Download PDF
    pdf_path = os.path.join(folder, 'paper.pdf')
    if not os.path.exists(pdf_path):
        pdf_url = f"https://aclanthology.org/{acl_id}.pdf"
        print(" Downloading PDF...")
        resp = _http_get(pdf_url, timeout=60)
        with open(pdf_path, 'wb') as f:
            f.write(resp.content)
        print(" Saved: paper.pdf")

    # Create paper.md with metadata
    paper_url = f'https://aclanthology.org/{acl_id}'
    md_path = os.path.join(folder, 'paper.md')
    if not os.path.exists(md_path):
        front_matter = _build_front_matter(title, authors, year, venue, paper_url)
        with open(md_path, 'w', encoding='utf-8') as f:
            f.write(front_matter)
            f.write(f"# {title}\n\n")
            f.write("*Full text available in paper.pdf*\n")
        print(" Generated: paper.md")

    meta = {
        'title': title,
        'authors': authors,
        'year': year,
        'abstract': '',
        'arxiv_id': None,
        'url': paper_url,
        'venue': venue,
    }
    update_paper_db(folder_name, meta)
    print(f" Done: {folder_name}/")
    return folder_name


def update_paper_db(folder_name, meta):
    """Update paper_db.json with new paper entry.

    *meta* needs 'title', 'authors', 'year' and 'url'; 'arxiv_id', 'abstract'
    and 'venue' are optional. Without 'venue' the entry is labelled "arXiv"
    when an arxiv_id is present and "ACL" otherwise. An existing entry for
    *folder_name* is replaced.
    """
    db_path = os.path.join(REFS_DIR, 'paper_db.json')
    if os.path.exists(db_path):
        with open(db_path, 'r', encoding='utf-8') as f:
            db = json.load(f)
    else:
        db = {"papers": {}, "s2_cache": {}}

    venue = meta.get('venue') or ("arXiv" if meta.get('arxiv_id') else "ACL")
    db.setdefault('papers', {})[folder_name] = {
        "id": folder_name,
        "title": meta['title'],
        "authors": meta['authors'],
        "year": meta['year'],
        "venue": venue,
        "url": meta['url'],
        "arxiv_id": meta.get('arxiv_id'),
        "s2_id": None,
        "doi": None,
        "citation_count": 0,
        "abstract": meta.get('abstract', ''),
        "tldr": "",
        "keywords": [],
        "references": [],
        "cited_by": [],
        "local_path": f"references/{folder_name}",
        "fetched": True,
    }

    # Write to a sibling file and swap it in, so an interrupted write cannot
    # leave a truncated database behind.
    tmp_path = db_path + '.tmp'
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(db, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, db_path)
    except BaseException:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise


def _is_acl_reference(paper_id):
    """Return True if *paper_id* looks like an ACL Anthology ID or URL."""
    return bool(
        re.match(r'^https?://aclanthology\.org/', paper_id)
        or re.match(r'^[A-Z]\d{2}-\d+$', paper_id)
        or re.match(r'^\d{4}\.[a-z]', paper_id)
    )


def main(argv=None):
    """Fetch every paper ID given on the command line (arXiv by default)."""
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        print("Usage: python3 fetch_papers.py <paper_id> [<paper_id> ...]")
        sys.exit(1)

    paper_ids = [a.strip() for a in args if a.strip()]
    for index, paper_id in enumerate(paper_ids):
        if index:
            time.sleep(3)  # Rate limiting between requests, even after errors
        try:
            # Detect if arXiv or ACL; anything unrecognised is tried as arXiv.
            if _is_acl_reference(paper_id):
                fetch_acl(paper_id)
            else:
                fetch_paper(paper_id)
        except Exception as e:
            print(f" ERROR fetching {paper_id}: {e}")


if __name__ == "__main__":
    main()