siddhm11
/

researchit-reranker-phase6

ml-intern

Model card Files Files and versions

xet

Community

siddhm11 commited on Apr 26

Commit

c82215c

verified ·

1 Parent(s): dc5b2e5

Add 01_fetch_citation_edges.py

Browse files

Files changed (1) hide show

scripts/01_fetch_citation_edges.py +388 -0

scripts/01_fetch_citation_edges.py ADDED Viewed

	@@ -0,0 +1,388 @@

+"""
+Step 1: Fetch citation edges from Semantic Scholar API.
+Produces: citations.parquet → (citing_arxiv_id, cited_arxiv_id)
+          where BOTH IDs exist in the ResearchIT Qdrant corpus.
+Usage:
+  # Option A: Batch API (no API key needed, slower, ~1-2 hours for 1.6M papers)
+  python 01_fetch_citation_edges.py --corpus-file arxiv_ids.txt --output citations.parquet
+  # Option B: Batch API with API key (faster, ~30-60 min)
+  python 01_fetch_citation_edges.py --corpus-file arxiv_ids.txt --output citations.parquet --api-key YOUR_KEY
+  # Option C: If you already have the S2 bulk datasets downloaded:
+  python 01_fetch_citation_edges.py --bulk-papers paper-ids.jsonl.gz --bulk-citations citations.jsonl.gz \
+      --corpus-file arxiv_ids.txt --output citations.parquet
+Prerequisites:
+  - arxiv_ids.txt: one arXiv ID per line (e.g. "2303.14957"), exported from Qdrant/Turso
+  - pip install httpx pyarrow tqdm
+Output schema:
+  citing_arxiv_id  (string)  — the paper that contains the citation
+  cited_arxiv_id   (string)  — the paper being cited
+  is_influential   (bool)    — S2's influential citation flag (if available)
+Author: ResearchIT ML Pipeline — Phase 6, Step 1
+"""
+from __future__ import annotations
+import argparse
+import asyncio
+import gzip
+import json
+import os
+import sys
+import time
+from pathlib import Path
+import httpx
+import pyarrow as pa
+import pyarrow.parquet as pq
+from tqdm import tqdm
+# ── Constants ────────────────────────────────────────────────────────────────
+S2_BATCH_URL = "https://api.semanticscholar.org/graph/v1/paper/batch"
+S2_BATCH_FIELDS = "externalIds,references.externalIds"
+BATCH_SIZE = 500        # S2 hard limit
+MAX_RETRIES = 5         # per batch
+RETRY_BACKOFF_BASE = 2  # exponential backoff base (seconds)
+CHECKPOINT_EVERY = 50   # save checkpoint every N batches
+# ── Batch API Path ───────────────────────────────────────────────────────────
+async def fetch_one_batch(
+    client: httpx.AsyncClient,
+    arxiv_ids: list[str],
+    api_key: str | None,
+    batch_idx: int,
+) -> list[tuple[str, str, bool]]:
+    """
+    Fetch references for a batch of arXiv IDs via S2 batch endpoint.
+    Returns list of (citing_arxiv_id, cited_arxiv_id, is_influential) tuples.
+    Only returns edges where the cited paper has an arXiv ID.
+    (In-corpus filtering happens later.)
+    """
+    # Format IDs for S2: "arXiv:2303.14957"
+    s2_ids = [f"arXiv:{aid}" for aid in arxiv_ids]
+    headers = {"Content-Type": "application/json"}
+    if api_key:
+        headers["x-api-key"] = api_key
+    url = f"{S2_BATCH_URL}?fields={S2_BATCH_FIELDS}"
+    for attempt in range(MAX_RETRIES):
+        try:
+            resp = await client.post(
+                url,
+                json={"ids": s2_ids},
+                headers=headers,
+                timeout=30.0,
+            )
+            if resp.status_code == 200:
+                results = resp.json()
+                edges = []
+                for i, paper in enumerate(results):
+                    if paper is None:
+                        continue
+                    citing_id = arxiv_ids[i]
+                    refs = paper.get("references") or []
+                    for ref in refs:
+                        ext_ids = ref.get("externalIds") or {}
+                        cited_arxiv = ext_ids.get("ArXiv")
+                        if cited_arxiv:
+                            edges.append((citing_id, cited_arxiv, False))
+                return edges
+            elif resp.status_code == 429:
+                retry_after = int(resp.headers.get("Retry-After", RETRY_BACKOFF_BASE ** attempt))
+                print(f"  [batch {batch_idx}] Rate limited. Waiting {retry_after}s (attempt {attempt+1}/{MAX_RETRIES})")
+                await asyncio.sleep(retry_after)
+            elif resp.status_code == 400:
+                print(f"  [batch {batch_idx}] Bad request (400). Skipping batch.")
+                return []
+            else:
+                print(f"  [batch {batch_idx}] HTTP {resp.status_code}. Retrying (attempt {attempt+1}/{MAX_RETRIES})")
+                await asyncio.sleep(RETRY_BACKOFF_BASE ** attempt)
+        except (httpx.TimeoutException, httpx.ConnectError, httpx.ReadError) as e:
+            print(f"  [batch {batch_idx}] Network error: {type(e).__name__}. Retrying (attempt {attempt+1}/{MAX_RETRIES})")
+            await asyncio.sleep(RETRY_BACKOFF_BASE ** attempt)
+    print(f"  [batch {batch_idx}] FAILED after {MAX_RETRIES} attempts. Skipping.")
+    return []
+async def fetch_all_batches(
+    corpus_ids: list[str],
+    api_key: str | None,
+    checkpoint_dir: Path,
+) -> list[tuple[str, str, bool]]:
+    """
+    Fetch citation edges for all corpus IDs using the S2 batch API.
+    Supports checkpoint/resume.
+    """
+    # Check for existing checkpoint
+    checkpoint_file = checkpoint_dir / "checkpoint.json"
+    all_edges: list[tuple[str, str, bool]] = []
+    start_batch = 0
+    if checkpoint_file.exists():
+        with open(checkpoint_file) as f:
+            ckpt = json.load(f)
+        start_batch = ckpt["next_batch"]
+        # Load previously saved edges
+        edges_file = checkpoint_dir / "edges_partial.jsonl"
+        if edges_file.exists():
+            with open(edges_file) as f:
+                for line in f:
+                    row = json.loads(line)
+                    all_edges.append((row["citing"], row["cited"], row["influential"]))
+        print(f"Resuming from batch {start_batch} ({len(all_edges)} edges already collected)")
+    # Split into batches
+    batches = []
+    for i in range(0, len(corpus_ids), BATCH_SIZE):
+        batches.append(corpus_ids[i : i + BATCH_SIZE])
+    total_batches = len(batches)
+    print(f"Total: {len(corpus_ids)} papers → {total_batches} batches of {BATCH_SIZE}")
+    print(f"Starting from batch {start_batch}")
+    # Rate limiting: 1 req/s without key, slightly faster with key
+    delay = 0.5 if api_key else 1.1
+    edges_file = checkpoint_dir / "edges_partial.jsonl"
+    async with httpx.AsyncClient() as client:
+        pbar = tqdm(range(start_batch, total_batches), initial=start_batch, total=total_batches)
+        for batch_idx in pbar:
+            batch = batches[batch_idx]
+            edges = await fetch_one_batch(client, batch, api_key, batch_idx)
+            all_edges.extend(edges)
+            # Append edges to partial file
+            with open(edges_file, "a") as f:
+                for citing, cited, influential in edges:
+                    f.write(json.dumps({"citing": citing, "cited": cited, "influential": influential}) + "\n")
+            pbar.set_postfix({"edges": len(all_edges), "batch_edges": len(edges)})
+            # Checkpoint periodically
+            if (batch_idx + 1) % CHECKPOINT_EVERY == 0:
+                with open(checkpoint_file, "w") as f:
+                    json.dump({"next_batch": batch_idx + 1}, f)
+            await asyncio.sleep(delay)
+    # Final checkpoint
+    with open(checkpoint_file, "w") as f:
+        json.dump({"next_batch": total_batches, "status": "complete"}, f)
+    return all_edges
+# ── Bulk Download Path ───────────────────────────────────────────────────────
+def process_bulk_downloads(
+    papers_file: str,
+    citations_file: str,
+    corpus_set: set[str],
+) -> list[tuple[str, str, bool]]:
+    """
+    Process S2 bulk dataset downloads to extract in-corpus citation edges.
+    papers_file:    paper-ids.jsonl.gz (corpusid → externalIds mapping)
+    citations_file: citations.jsonl.gz (citingcorpusid → citedcorpusid edges)
+    """
+    print("Step 1/2: Building corpusid → arxiv_id mapping from paper-ids...")
+    corpusid_to_arxiv: dict[int, str] = {}
+    with gzip.open(papers_file, "rt") as f:
+        for line in tqdm(f, desc="Reading paper-ids"):
+            try:
+                rec = json.loads(line)
+                ext_ids = rec.get("externalids") or rec.get("externalIds") or {}
+                arxiv_id = ext_ids.get("ArXiv")
+                corpus_id = rec.get("corpusid") or rec.get("corpusId")
+                if arxiv_id and corpus_id and arxiv_id in corpus_set:
+                    corpusid_to_arxiv[int(corpus_id)] = arxiv_id
+            except (json.JSONDecodeError, ValueError):
+                continue
+    print(f"  Mapped {len(corpusid_to_arxiv)} corpus IDs to arXiv IDs in your corpus")
+    print("Step 2/2: Filtering citation edges to in-corpus pairs...")
+    edges: list[tuple[str, str, bool]] = []
+    with gzip.open(citations_file, "rt") as f:
+        for line in tqdm(f, desc="Reading citations"):
+            try:
+                rec = json.loads(line)
+                citing_cid = rec.get("citingcorpusid") or rec.get("citingCorpusId")
+                cited_cid = rec.get("citedcorpusid") or rec.get("citedCorpusId")
+                is_influential = rec.get("isinfluential", False) or rec.get("isInfluential", False)
+                citing_arxiv = corpusid_to_arxiv.get(int(citing_cid)) if citing_cid else None
+                cited_arxiv = corpusid_to_arxiv.get(int(cited_cid)) if cited_cid else None
+                if citing_arxiv and cited_arxiv:
+                    edges.append((citing_arxiv, cited_arxiv, bool(is_influential)))
+            except (json.JSONDecodeError, ValueError, TypeError):
+                continue
+    print(f"  Found {len(edges)} in-corpus citation edges")
+    return edges
+# ── Filter & Save ────────────────────────────────────────────────────────────
+def filter_and_save(
+    edges: list[tuple[str, str, bool]],
+    corpus_set: set[str],
+    output_path: str,
+):
+    """
+    Filter edges to in-corpus pairs, deduplicate, and save as parquet.
+    """
+    print(f"Raw edges before filtering: {len(edges)}")
+    # Filter: both citing and cited must be in corpus
+    filtered = [
+        (citing, cited, influential)
+        for citing, cited, influential in edges
+        if citing in corpus_set and cited in corpus_set and citing != cited
+    ]
+    print(f"In-corpus edges (both sides in corpus): {len(filtered)}")
+    # Deduplicate
+    seen = set()
+    deduped = []
+    for citing, cited, influential in filtered:
+        key = (citing, cited)
+        if key not in seen:
+            seen.add(key)
+            deduped.append((citing, cited, influential))
+    print(f"After deduplication: {len(deduped)}")
+    # Save as parquet
+    table = pa.table({
+        "citing_arxiv_id": pa.array([e[0] for e in deduped], type=pa.string()),
+        "cited_arxiv_id": pa.array([e[1] for e in deduped], type=pa.string()),
+        "is_influential": pa.array([e[2] for e in deduped], type=pa.bool_()),
+    })
+    pq.write_table(table, output_path, compression="snappy")
+    print(f"Saved {len(deduped)} citation edges to {output_path}")
+    # Print stats
+    citing_papers = set(e[0] for e in deduped)
+    cited_papers = set(e[1] for e in deduped)
+    print(f"\nStats:")
+    print(f"  Unique citing papers: {len(citing_papers)}")
+    print(f"  Unique cited papers:  {len(cited_papers)}")
+    print(f"  Unique papers total:  {len(citing_papers | cited_papers)}")
+    print(f"  Avg references per citing paper: {len(deduped) / max(len(citing_papers), 1):.1f}")
+    influential_count = sum(1 for e in deduped if e[2])
+    print(f"  Influential citations: {influential_count} ({100*influential_count/max(len(deduped),1):.1f}%)")
+# ── Main ─────────────────────────────────────────────────────────────────────
+def load_corpus_ids(path: str) -> list[str]:
+    """Load arXiv IDs from a text file (one per line)."""
+    ids = []
+    with open(path) as f:
+        for line in f:
+            line = line.strip()
+            if line and not line.startswith("#"):
+                # Handle various formats: "2303.14957", "arXiv:2303.14957", etc.
+                if line.startswith("arXiv:"):
+                    line = line[6:]
+                elif line.startswith("ARXIV:"):
+                    line = line[6:]
+                ids.append(line)
+    print(f"Loaded {len(ids)} arXiv IDs from {path}")
+    return ids
+def main():
+    parser = argparse.ArgumentParser(
+        description="Fetch citation edges from Semantic Scholar for ResearchIT corpus"
+    )
+    parser.add_argument(
+        "--corpus-file", required=True,
+        help="Text file with one arXiv ID per line (e.g. arxiv_ids.txt)"
+    )
+    parser.add_argument(
+        "--output", default="citations.parquet",
+        help="Output parquet file path (default: citations.parquet)"
+    )
+    parser.add_argument(
+        "--api-key", default=None,
+        help="Semantic Scholar API key (optional, speeds up rate limit)"
+    )
+    parser.add_argument(
+        "--bulk-papers", default=None,
+        help="Path to S2 bulk paper-ids.jsonl.gz (use bulk download path)"
+    )
+    parser.add_argument(
+        "--bulk-citations", default=None,
+        help="Path to S2 bulk citations.jsonl.gz (use bulk download path)"
+    )
+    parser.add_argument(
+        "--checkpoint-dir", default="./citation_checkpoint",
+        help="Directory for checkpoint files (batch API mode)"
+    )
+    parser.add_argument(
+        "--max-papers", type=int, default=None,
+        help="Limit to first N papers (for testing)"
+    )
+    args = parser.parse_args()
+    # Load corpus
+    corpus_ids = load_corpus_ids(args.corpus_file)
+    if args.max_papers:
+        corpus_ids = corpus_ids[:args.max_papers]
+        print(f"  Limited to {len(corpus_ids)} papers (--max-papers)")
+    corpus_set = set(corpus_ids)
+    # Choose path
+    if args.bulk_papers and args.bulk_citations:
+        print("\n=== BULK DOWNLOAD PATH ===")
+        edges = process_bulk_downloads(args.bulk_papers, args.bulk_citations, corpus_set)
+    else:
+        print("\n=== BATCH API PATH ===")
+        if not args.api_key:
+            # Check environment variable
+            args.api_key = os.environ.get("S2_API_KEY")
+        if args.api_key:
+            print(f"Using API key: {args.api_key[:8]}...")
+        else:
+            print("No API key — using unauthenticated rate (1 req/s)")
+            print("Get a free key at: https://www.semanticscholar.org/product/api#Partner-Form")
+        checkpoint_dir = Path(args.checkpoint_dir)
+        checkpoint_dir.mkdir(parents=True, exist_ok=True)
+        edges = asyncio.run(fetch_all_batches(corpus_ids, args.api_key, checkpoint_dir))
+    # Filter to in-corpus and save
+    filter_and_save(edges, corpus_set, args.output)
+    print(f"\n✅ Done! Citation edges saved to: {args.output}")
+if __name__ == "__main__":
+    main()