researchit-reranker-phase6 / scripts /01_fetch_citation_edges.py

Add 01_fetch_citation_edges.py

c82215c verified about 1 month ago

15.1 kB

	"""
	Step 1: Fetch citation edges from Semantic Scholar API.

	Produces: citations.parquet → (citing_arxiv_id, cited_arxiv_id)
	where BOTH IDs exist in the ResearchIT Qdrant corpus.

	Usage:
	# Option A: Batch API (no API key needed, slower, ~1-2 hours for 1.6M papers)
	python 01_fetch_citation_edges.py --corpus-file arxiv_ids.txt --output citations.parquet

	# Option B: Batch API with API key (faster, ~30-60 min)
	python 01_fetch_citation_edges.py --corpus-file arxiv_ids.txt --output citations.parquet --api-key YOUR_KEY

	# Option C: If you already have the S2 bulk datasets downloaded:
	python 01_fetch_citation_edges.py --bulk-papers paper-ids.jsonl.gz --bulk-citations citations.jsonl.gz \
	--corpus-file arxiv_ids.txt --output citations.parquet

	Prerequisites:
	- arxiv_ids.txt: one arXiv ID per line (e.g. "2303.14957"), exported from Qdrant/Turso
	- pip install httpx pyarrow tqdm

	Output schema:
	citing_arxiv_id (string) — the paper that contains the citation
	cited_arxiv_id (string) — the paper being cited
	is_influential (bool) — S2's influential citation flag (if available)

	Author: ResearchIT ML Pipeline — Phase 6, Step 1
	"""
	from __future__ import annotations

	import argparse
	import asyncio
	import gzip
	import json
	import os
	import sys
	import time
	from pathlib import Path

	import httpx
	import pyarrow as pa
	import pyarrow.parquet as pq
	from tqdm import tqdm


	# ── Constants ────────────────────────────────────────────────────────────────

	S2_BATCH_URL = "https://api.semanticscholar.org/graph/v1/paper/batch"
	S2_BATCH_FIELDS = "externalIds,references.externalIds"
	BATCH_SIZE = 500 # S2 hard limit
	MAX_RETRIES = 5 # per batch
	RETRY_BACKOFF_BASE = 2 # exponential backoff base (seconds)
	CHECKPOINT_EVERY = 50 # save checkpoint every N batches


	# ── Batch API Path ───────────────────────────────────────────────────────────

	async def fetch_one_batch(
	client: httpx.AsyncClient,
	arxiv_ids: list[str],
	api_key: str \| None,
	batch_idx: int,
	) -> list[tuple[str, str, bool]]:
	"""
	Fetch references for a batch of arXiv IDs via S2 batch endpoint.

	Returns list of (citing_arxiv_id, cited_arxiv_id, is_influential) tuples.
	Only returns edges where the cited paper has an arXiv ID.
	(In-corpus filtering happens later.)
	"""
	# Format IDs for S2: "arXiv:2303.14957"
	s2_ids = [f"arXiv:{aid}" for aid in arxiv_ids]

	headers = {"Content-Type": "application/json"}
	if api_key:
	headers["x-api-key"] = api_key

	url = f"{S2_BATCH_URL}?fields={S2_BATCH_FIELDS}"

	for attempt in range(MAX_RETRIES):
	try:
	resp = await client.post(
	url,
	json={"ids": s2_ids},
	headers=headers,
	timeout=30.0,
	)

	if resp.status_code == 200:
	results = resp.json()
	edges = []
	for i, paper in enumerate(results):
	if paper is None:
	continue
	citing_id = arxiv_ids[i]
	refs = paper.get("references") or []
	for ref in refs:
	ext_ids = ref.get("externalIds") or {}
	cited_arxiv = ext_ids.get("ArXiv")
	if cited_arxiv:
	edges.append((citing_id, cited_arxiv, False))
	return edges

	elif resp.status_code == 429:
	retry_after = int(resp.headers.get("Retry-After", RETRY_BACKOFF_BASE ** attempt))
	print(f" [batch {batch_idx}] Rate limited. Waiting {retry_after}s (attempt {attempt+1}/{MAX_RETRIES})")
	await asyncio.sleep(retry_after)

	elif resp.status_code == 400:
	print(f" [batch {batch_idx}] Bad request (400). Skipping batch.")
	return []

	else:
	print(f" [batch {batch_idx}] HTTP {resp.status_code}. Retrying (attempt {attempt+1}/{MAX_RETRIES})")
	await asyncio.sleep(RETRY_BACKOFF_BASE ** attempt)

	except (httpx.TimeoutException, httpx.ConnectError, httpx.ReadError) as e:
	print(f" [batch {batch_idx}] Network error: {type(e).__name__}. Retrying (attempt {attempt+1}/{MAX_RETRIES})")
	await asyncio.sleep(RETRY_BACKOFF_BASE ** attempt)

	print(f" [batch {batch_idx}] FAILED after {MAX_RETRIES} attempts. Skipping.")
	return []


	async def fetch_all_batches(
	corpus_ids: list[str],
	api_key: str \| None,
	checkpoint_dir: Path,
	) -> list[tuple[str, str, bool]]:
	"""
	Fetch citation edges for all corpus IDs using the S2 batch API.
	Supports checkpoint/resume.
	"""
	# Check for existing checkpoint
	checkpoint_file = checkpoint_dir / "checkpoint.json"
	all_edges: list[tuple[str, str, bool]] = []
	start_batch = 0

	if checkpoint_file.exists():
	with open(checkpoint_file) as f:
	ckpt = json.load(f)
	start_batch = ckpt["next_batch"]
	# Load previously saved edges
	edges_file = checkpoint_dir / "edges_partial.jsonl"
	if edges_file.exists():
	with open(edges_file) as f:
	for line in f:
	row = json.loads(line)
	all_edges.append((row["citing"], row["cited"], row["influential"]))
	print(f"Resuming from batch {start_batch} ({len(all_edges)} edges already collected)")

	# Split into batches
	batches = []
	for i in range(0, len(corpus_ids), BATCH_SIZE):
	batches.append(corpus_ids[i : i + BATCH_SIZE])

	total_batches = len(batches)
	print(f"Total: {len(corpus_ids)} papers → {total_batches} batches of {BATCH_SIZE}")
	print(f"Starting from batch {start_batch}")

	# Rate limiting: 1 req/s without key, slightly faster with key
	delay = 0.5 if api_key else 1.1

	edges_file = checkpoint_dir / "edges_partial.jsonl"

	async with httpx.AsyncClient() as client:
	pbar = tqdm(range(start_batch, total_batches), initial=start_batch, total=total_batches)
	for batch_idx in pbar:
	batch = batches[batch_idx]

	edges = await fetch_one_batch(client, batch, api_key, batch_idx)
	all_edges.extend(edges)

	# Append edges to partial file
	with open(edges_file, "a") as f:
	for citing, cited, influential in edges:
	f.write(json.dumps({"citing": citing, "cited": cited, "influential": influential}) + "\n")

	pbar.set_postfix({"edges": len(all_edges), "batch_edges": len(edges)})

	# Checkpoint periodically
	if (batch_idx + 1) % CHECKPOINT_EVERY == 0:
	with open(checkpoint_file, "w") as f:
	json.dump({"next_batch": batch_idx + 1}, f)

	await asyncio.sleep(delay)

	# Final checkpoint
	with open(checkpoint_file, "w") as f:
	json.dump({"next_batch": total_batches, "status": "complete"}, f)

	return all_edges


	# ── Bulk Download Path ───────────────────────────────────────────────────────

	def process_bulk_downloads(
	papers_file: str,
	citations_file: str,
	corpus_set: set[str],
	) -> list[tuple[str, str, bool]]:
	"""
	Process S2 bulk dataset downloads to extract in-corpus citation edges.

	papers_file: paper-ids.jsonl.gz (corpusid → externalIds mapping)
	citations_file: citations.jsonl.gz (citingcorpusid → citedcorpusid edges)
	"""
	print("Step 1/2: Building corpusid → arxiv_id mapping from paper-ids...")
	corpusid_to_arxiv: dict[int, str] = {}
	with gzip.open(papers_file, "rt") as f:
	for line in tqdm(f, desc="Reading paper-ids"):
	try:
	rec = json.loads(line)
	ext_ids = rec.get("externalids") or rec.get("externalIds") or {}
	arxiv_id = ext_ids.get("ArXiv")
	corpus_id = rec.get("corpusid") or rec.get("corpusId")
	if arxiv_id and corpus_id and arxiv_id in corpus_set:
	corpusid_to_arxiv[int(corpus_id)] = arxiv_id
	except (json.JSONDecodeError, ValueError):
	continue

	print(f" Mapped {len(corpusid_to_arxiv)} corpus IDs to arXiv IDs in your corpus")

	print("Step 2/2: Filtering citation edges to in-corpus pairs...")
	edges: list[tuple[str, str, bool]] = []
	with gzip.open(citations_file, "rt") as f:
	for line in tqdm(f, desc="Reading citations"):
	try:
	rec = json.loads(line)
	citing_cid = rec.get("citingcorpusid") or rec.get("citingCorpusId")
	cited_cid = rec.get("citedcorpusid") or rec.get("citedCorpusId")
	is_influential = rec.get("isinfluential", False) or rec.get("isInfluential", False)

	citing_arxiv = corpusid_to_arxiv.get(int(citing_cid)) if citing_cid else None
	cited_arxiv = corpusid_to_arxiv.get(int(cited_cid)) if cited_cid else None

	if citing_arxiv and cited_arxiv:
	edges.append((citing_arxiv, cited_arxiv, bool(is_influential)))
	except (json.JSONDecodeError, ValueError, TypeError):
	continue

	print(f" Found {len(edges)} in-corpus citation edges")
	return edges


	# ── Filter & Save ────────────────────────────────────────────────────────────

	def filter_and_save(
	edges: list[tuple[str, str, bool]],
	corpus_set: set[str],
	output_path: str,
	):
	"""
	Filter edges to in-corpus pairs, deduplicate, and save as parquet.
	"""
	print(f"Raw edges before filtering: {len(edges)}")

	# Filter: both citing and cited must be in corpus
	filtered = [
	(citing, cited, influential)
	for citing, cited, influential in edges
	if citing in corpus_set and cited in corpus_set and citing != cited
	]
	print(f"In-corpus edges (both sides in corpus): {len(filtered)}")

	# Deduplicate
	seen = set()
	deduped = []
	for citing, cited, influential in filtered:
	key = (citing, cited)
	if key not in seen:
	seen.add(key)
	deduped.append((citing, cited, influential))

	print(f"After deduplication: {len(deduped)}")

	# Save as parquet
	table = pa.table({
	"citing_arxiv_id": pa.array([e[0] for e in deduped], type=pa.string()),
	"cited_arxiv_id": pa.array([e[1] for e in deduped], type=pa.string()),
	"is_influential": pa.array([e[2] for e in deduped], type=pa.bool_()),
	})

	pq.write_table(table, output_path, compression="snappy")
	print(f"Saved {len(deduped)} citation edges to {output_path}")

	# Print stats
	citing_papers = set(e[0] for e in deduped)
	cited_papers = set(e[1] for e in deduped)
	print(f"\nStats:")
	print(f" Unique citing papers: {len(citing_papers)}")
	print(f" Unique cited papers: {len(cited_papers)}")
	print(f" Unique papers total: {len(citing_papers \| cited_papers)}")
	print(f" Avg references per citing paper: {len(deduped) / max(len(citing_papers), 1):.1f}")
	influential_count = sum(1 for e in deduped if e[2])
	print(f" Influential citations: {influential_count} ({100*influential_count/max(len(deduped),1):.1f}%)")


	# ── Main ─────────────────────────────────────────────────────────────────────

	def load_corpus_ids(path: str) -> list[str]:
	"""Load arXiv IDs from a text file (one per line)."""
	ids = []
	with open(path) as f:
	for line in f:
	line = line.strip()
	if line and not line.startswith("#"):
	# Handle various formats: "2303.14957", "arXiv:2303.14957", etc.
	if line.startswith("arXiv:"):
	line = line[6:]
	elif line.startswith("ARXIV:"):
	line = line[6:]
	ids.append(line)
	print(f"Loaded {len(ids)} arXiv IDs from {path}")
	return ids


	def main():
	parser = argparse.ArgumentParser(
	description="Fetch citation edges from Semantic Scholar for ResearchIT corpus"
	)
	parser.add_argument(
	"--corpus-file", required=True,
	help="Text file with one arXiv ID per line (e.g. arxiv_ids.txt)"
	)
	parser.add_argument(
	"--output", default="citations.parquet",
	help="Output parquet file path (default: citations.parquet)"
	)
	parser.add_argument(
	"--api-key", default=None,
	help="Semantic Scholar API key (optional, speeds up rate limit)"
	)
	parser.add_argument(
	"--bulk-papers", default=None,
	help="Path to S2 bulk paper-ids.jsonl.gz (use bulk download path)"
	)
	parser.add_argument(
	"--bulk-citations", default=None,
	help="Path to S2 bulk citations.jsonl.gz (use bulk download path)"
	)
	parser.add_argument(
	"--checkpoint-dir", default="./citation_checkpoint",
	help="Directory for checkpoint files (batch API mode)"
	)
	parser.add_argument(
	"--max-papers", type=int, default=None,
	help="Limit to first N papers (for testing)"
	)

	args = parser.parse_args()

	# Load corpus
	corpus_ids = load_corpus_ids(args.corpus_file)
	if args.max_papers:
	corpus_ids = corpus_ids[:args.max_papers]
	print(f" Limited to {len(corpus_ids)} papers (--max-papers)")

	corpus_set = set(corpus_ids)

	# Choose path
	if args.bulk_papers and args.bulk_citations:
	print("\n=== BULK DOWNLOAD PATH ===")
	edges = process_bulk_downloads(args.bulk_papers, args.bulk_citations, corpus_set)
	else:
	print("\n=== BATCH API PATH ===")
	if not args.api_key:
	# Check environment variable
	args.api_key = os.environ.get("S2_API_KEY")
	if args.api_key:
	print(f"Using API key: {args.api_key[:8]}...")
	else:
	print("No API key — using unauthenticated rate (1 req/s)")
	print("Get a free key at: https://www.semanticscholar.org/product/api#Partner-Form")

	checkpoint_dir = Path(args.checkpoint_dir)
	checkpoint_dir.mkdir(parents=True, exist_ok=True)

	edges = asyncio.run(fetch_all_batches(corpus_ids, args.api_key, checkpoint_dir))

	# Filter to in-corpus and save
	filter_and_save(edges, corpus_set, args.output)

	print(f"\n✅ Done! Citation edges saved to: {args.output}")


	if __name__ == "__main__":
	main()