sen-1

File size: 12,316 Bytes

ef06968

"""
Fetch papers from arXiv using only stdlib + requests.
Downloads PDF and generates paper.md with YAML front matter.
"""
import os
import re
import sys
import json
import time
import unicodedata
import xml.etree.ElementTree as ET
import requests

REFS_DIR = os.path.join(os.path.dirname(__file__))


def normalize_name(name):
    parts = name.strip().split()
    lastname = parts[-1] if parts else name
    normalized = unicodedata.normalize('NFD', lastname)
    return ''.join(c for c in normalized if unicodedata.category(c) != 'Mn').lower()


def fetch_arxiv_metadata(arxiv_id):
    """Fetch metadata from arXiv API."""
    url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}"
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    ns = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}
    root = ET.fromstring(resp.text)
    entry = root.find('atom:entry', ns)
    if entry is None:
        return None

    title = entry.find('atom:title', ns).text.strip().replace('\n', ' ')
    title = re.sub(r'\s+', ' ', title)

    authors = []
    for a in entry.findall('atom:author', ns):
        name = a.find('atom:name', ns).text.strip()
        authors.append(name)

    published = entry.find('atom:published', ns).text
    year = int(published[:4])

    summary = entry.find('atom:summary', ns).text.strip()

    return {
        'title': title,
        'authors': authors,
        'year': year,
        'abstract': summary,
        'arxiv_id': arxiv_id,
        'url': f'https://arxiv.org/abs/{arxiv_id}'
    }


def download_pdf(arxiv_id, dest_path):
    """Download PDF from arXiv."""
    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    resp = requests.get(url, timeout=60, allow_redirects=True)
    resp.raise_for_status()
    with open(dest_path, 'wb') as f:
        f.write(resp.content)
    return True


def download_source(arxiv_id, folder):
    """Try to download LaTeX source from arXiv."""
    import tarfile
    import gzip
    from io import BytesIO

    url = f"https://arxiv.org/e-print/{arxiv_id}"
    try:
        resp = requests.get(url, timeout=60, allow_redirects=True)
        resp.raise_for_status()
        content = resp.content

        # Try tar.gz
        try:
            with tarfile.open(fileobj=BytesIO(content), mode='r:gz') as tar:
                tex_files = [m.name for m in tar.getmembers() if m.name.endswith('.tex')]
                source_dir = os.path.join(folder, 'source')
                os.makedirs(source_dir, exist_ok=True)
                tar.extractall(path=source_dir)

                main_tex = None
                for name in tex_files:
                    if 'main' in name.lower():
                        main_tex = name
                        break
                if not main_tex and tex_files:
                    main_tex = tex_files[0]

                if main_tex:
                    with open(os.path.join(source_dir, main_tex), 'r', errors='ignore') as f:
                        tex_content = f.read()
                    # Save paper.tex
                    with open(os.path.join(folder, 'paper.tex'), 'w') as f:
                        f.write(tex_content)
                    return tex_content
        except tarfile.TarError:
            pass

        # Try plain gzip
        try:
            tex_content = gzip.decompress(content).decode('utf-8', errors='ignore')
            if '\\documentclass' in tex_content or '\\begin{document}' in tex_content:
                with open(os.path.join(folder, 'paper.tex'), 'w') as f:
                    f.write(tex_content)
                return tex_content
        except:
            pass

    except Exception as e:
        print(f"  Source download failed: {e}")
    return None


def tex_to_md(tex_content):
    """Basic LaTeX to Markdown conversion."""
    md = tex_content

    doc_match = re.search(r'\\begin\{document\}', md)
    if doc_match:
        md = md[doc_match.end():]
    md = re.sub(r'\\end\{document\}.*', '', md, flags=re.DOTALL)
    md = re.sub(r'%.*$', '', md, flags=re.MULTILINE)
    md = re.sub(r'\\section\*?\{([^}]+)\}', r'# \1', md)
    md = re.sub(r'\\subsection\*?\{([^}]+)\}', r'## \1', md)
    md = re.sub(r'\\subsubsection\*?\{([^}]+)\}', r'### \1', md)
    md = re.sub(r'\\textbf\{([^}]+)\}', r'**\1**', md)
    md = re.sub(r'\\textit\{([^}]+)\}', r'*\1*', md)
    md = re.sub(r'\\emph\{([^}]+)\}', r'*\1*', md)
    md = re.sub(r'\\texttt\{([^}]+)\}', r'`\1`', md)
    md = re.sub(r'\\cite\w*\{([^}]+)\}', r'[\1]', md)
    md = re.sub(r'\\url\{([^}]+)\}', r'\1', md)
    md = re.sub(r'\\href\{([^}]+)\}\{([^}]+)\}', r'[\2](\1)', md)
    md = re.sub(r'\\begin\{itemize\}', '', md)
    md = re.sub(r'\\end\{itemize\}', '', md)
    md = re.sub(r'\\begin\{enumerate\}', '', md)
    md = re.sub(r'\\end\{enumerate\}', '', md)
    md = re.sub(r'\\item\s*', '- ', md)
    md = re.sub(r'\n{3,}', '\n\n', md)
    return md.strip()


def fetch_paper(arxiv_id):
    """Fetch a single paper from arXiv."""
    arxiv_id = re.sub(r'^(arxiv:|https?://arxiv\.org/(abs|pdf)/)', '', arxiv_id)
    arxiv_id = arxiv_id.rstrip('.pdf').rstrip('/')

    print(f"\n{'='*60}")
    print(f"Fetching arXiv:{arxiv_id}")
    print(f"{'='*60}")

    # 1. Get metadata
    meta = fetch_arxiv_metadata(arxiv_id)
    if not meta:
        print(f"  ERROR: Paper not found: {arxiv_id}")
        return None

    print(f"  Title: {meta['title'][:70]}...")
    print(f"  Authors: {', '.join(meta['authors'][:3])}")
    print(f"  Year: {meta['year']}")

    # 2. Create folder
    author = normalize_name(meta['authors'][0]) if meta['authors'] else 'unknown'
    folder_name = f"{meta['year']}.arxiv.{author}"
    folder = os.path.join(REFS_DIR, folder_name)
    os.makedirs(folder, exist_ok=True)
    print(f"  Folder: {folder_name}/")

    # 3. Build front matter
    authors_yaml = '\n'.join(f'  - "{a}"' for a in meta['authors'])
    front_matter = f'''---
title: "{meta['title']}"
authors:
{authors_yaml}
year: {meta['year']}
venue: "arXiv"
url: "{meta['url']}"
arxiv: "{arxiv_id}"
---

'''

    # 4. Download PDF
    pdf_path = os.path.join(folder, 'paper.pdf')
    if not os.path.exists(pdf_path):
        print(f"  Downloading PDF...")
        download_pdf(arxiv_id, pdf_path)
        print(f"  Saved: paper.pdf")
    else:
        print(f"  PDF already exists")

    # 5. Try to get LaTeX source
    md_path = os.path.join(folder, 'paper.md')
    if not os.path.exists(md_path):
        print(f"  Downloading LaTeX source...")
        tex_content = download_source(arxiv_id, folder)

        if tex_content:
            md_text = tex_to_md(tex_content)
            with open(md_path, 'w') as f:
                f.write(front_matter + md_text)
            print(f"  Generated: paper.md (from LaTeX)")
        else:
            # Fallback: write front matter + abstract
            with open(md_path, 'w') as f:
                f.write(front_matter)
                f.write(f"# {meta['title']}\n\n")
                f.write(f"## Abstract\n\n{meta['abstract']}\n\n")
                f.write(f"*Full text available in paper.pdf*\n")
            print(f"  Generated: paper.md (metadata + abstract only)")

    else:
        print(f"  paper.md already exists")

    # 6. Update paper_db.json
    update_paper_db(folder_name, meta)

    print(f"  Done: {folder_name}/")
    return folder_name


def fetch_acl(acl_id):
    """Fetch a paper from ACL Anthology."""
    acl_id = re.sub(r'^https?://aclanthology\.org/', '', acl_id)
    acl_id = acl_id.rstrip('/').rstrip('.pdf')

    print(f"\n{'='*60}")
    print(f"Fetching ACL:{acl_id}")
    print(f"{'='*60}")

    # Get BibTeX for metadata
    bib_url = f"https://aclanthology.org/{acl_id}.bib"
    try:
        resp = requests.get(bib_url, timeout=15)
        bib_text = resp.text

        title_match = re.search(r'title\s*=\s*["{]([^"}]+)', bib_text)
        title = title_match.group(1) if title_match else acl_id

        author_match = re.search(r'author\s*=\s*["{]([^"}]+)', bib_text)
        authors = []
        if author_match:
            authors = [a.strip() for a in author_match.group(1).split(' and ')]

        year_match = re.search(r'year\s*=\s*["{]?(\d{4})', bib_text)
        year = int(year_match.group(1)) if year_match else 2020

        venue_match = re.search(r'booktitle\s*=\s*["{]([^"}]+)', bib_text)
        venue = venue_match.group(1) if venue_match else "ACL"
    except:
        title = acl_id
        authors = ["unknown"]
        year = 2020
        venue = "ACL"

    print(f"  Title: {title[:70]}...")
    print(f"  Authors: {', '.join(authors[:3])}")

    # Parse venue from ID
    venue_short = "acl"
    m = re.match(r'(\d{4})\.([a-z\-]+)', acl_id)
    if m:
        venue_short = m.group(2).split('-')[0]
    else:
        prefix_map = {'P': 'acl', 'N': 'naacl', 'E': 'eacl', 'D': 'emnlp', 'C': 'coling', 'W': 'workshop'}
        m2 = re.match(r'([A-Z])(\d{2})', acl_id)
        if m2:
            venue_short = prefix_map.get(m2.group(1), 'acl')

    author_name = normalize_name(authors[0]) if authors else 'unknown'
    folder_name = f"{year}.{venue_short}.{author_name}"
    folder = os.path.join(REFS_DIR, folder_name)
    os.makedirs(folder, exist_ok=True)
    print(f"  Folder: {folder_name}/")

    # Download PDF
    pdf_path = os.path.join(folder, 'paper.pdf')
    if not os.path.exists(pdf_path):
        pdf_url = f"https://aclanthology.org/{acl_id}.pdf"
        print(f"  Downloading PDF...")
        resp = requests.get(pdf_url, timeout=60, allow_redirects=True)
        resp.raise_for_status()
        with open(pdf_path, 'wb') as f:
            f.write(resp.content)
        print(f"  Saved: paper.pdf")

    # Create paper.md with metadata
    md_path = os.path.join(folder, 'paper.md')
    if not os.path.exists(md_path):
        authors_yaml = '\n'.join(f'  - "{a}"' for a in authors)
        front_matter = f'''---
title: "{title}"
authors:
{authors_yaml}
year: {year}
venue: "{venue}"
url: "https://aclanthology.org/{acl_id}"
---

'''
        with open(md_path, 'w') as f:
            f.write(front_matter)
            f.write(f"# {title}\n\n")
            f.write(f"*Full text available in paper.pdf*\n")
        print(f"  Generated: paper.md")

    meta = {
        'title': title,
        'authors': authors,
        'year': year,
        'abstract': '',
        'arxiv_id': None,
        'url': f'https://aclanthology.org/{acl_id}'
    }
    update_paper_db(folder_name, meta)

    print(f"  Done: {folder_name}/")
    return folder_name


def update_paper_db(folder_name, meta):
    """Update paper_db.json with new paper entry."""
    db_path = os.path.join(REFS_DIR, 'paper_db.json')

    if os.path.exists(db_path):
        with open(db_path, 'r') as f:
            db = json.load(f)
    else:
        db = {"papers": {}, "s2_cache": {}}

    db['papers'][folder_name] = {
        "id": folder_name,
        "title": meta['title'],
        "authors": meta['authors'],
        "year": meta['year'],
        "venue": "arXiv" if meta.get('arxiv_id') else "ACL",
        "url": meta['url'],
        "arxiv_id": meta.get('arxiv_id'),
        "s2_id": None,
        "doi": None,
        "citation_count": 0,
        "abstract": meta.get('abstract', ''),
        "tldr": "",
        "keywords": [],
        "references": [],
        "cited_by": [],
        "local_path": f"references/{folder_name}",
        "fetched": True
    }

    with open(db_path, 'w') as f:
        json.dump(db, f, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python3 fetch_papers.py <arxiv_id_or_acl_id> [...]")
        sys.exit(1)

    for paper_id in sys.argv[1:]:
        paper_id = paper_id.strip()
        if not paper_id:
            continue

        try:
            # Detect if arXiv or ACL
            if re.match(r'^\d{4}\.\d{4,5}', paper_id) or paper_id.startswith('arxiv:'):
                fetch_paper(paper_id)
            elif re.match(r'^[A-Z]\d{2}-\d+$', paper_id) or re.match(r'^\d{4}\.[a-z]', paper_id):
                fetch_acl(paper_id)
            else:
                # Default: try arXiv
                fetch_paper(paper_id)

            time.sleep(3)  # Rate limiting
        except Exception as e:
            print(f"  ERROR fetching {paper_id}: {e}")