# Researcher/src/pipelines/semantic_scholar.py
# (amarck — Initial commit: Research Intelligence System, a0f27fa)
"""Semantic Scholar enrichment — connected papers, TL;DR, and topic extraction.
Uses the free S2 Academic Graph API. No API key required but rate-limited
to a shared pool. With a key (x-api-key header), 1 req/sec guaranteed.
Enrichment strategy:
1. Batch lookup all papers → TL;DR + S2 paper ID (1 API call per 500 papers)
2. Top N papers by score → references + recommendations (2 calls each)
3. Topic extraction from title/abstract (local, no API)
"""
import json
import logging
import re
import time
import requests
log = logging.getLogger(__name__)
from src.db import (
clear_connections,
get_arxiv_id_map,
get_conn,
get_top_papers,
insert_connections,
update_paper_s2,
update_paper_topics,
)
# Base URL of the S2 Academic Graph API (paper lookup, references).
S2_GRAPH = "https://api.semanticscholar.org/graph/v1"
# Base URL of the S2 Recommendations API (related-paper suggestions).
S2_RECO = "https://api.semanticscholar.org/recommendations/v1"
# Extra headers sent with every S2 request.
S2_HEADERS: dict[str, str] = {} # Add {"x-api-key": "..."} if you have one
# How many top papers get full connection enrichment
TOP_N_CONNECTIONS = 30
# Rate limit pause between requests (seconds) — keeps us just over the
# 1 req/sec guarantee mentioned in the module docstring.
RATE_LIMIT = 1.1
# ---------------------------------------------------------------------------
# Main entry point
# ---------------------------------------------------------------------------
def enrich_run(run_id: int, domain: str):
    """Enrich all scored papers in a run with S2 data + topics.

    Three passes over the run's papers, best composite score first:
      1. batch TL;DR / S2-id lookup for every paper,
      2. reference + recommendation fetch for the top TOP_N_CONNECTIONS,
      3. local regex topic tagging (no API calls).
    Per-paper connection failures are logged and skipped, never raised.
    """
    with get_conn() as conn:
        rows = conn.execute(
            "SELECT id, arxiv_id, title, abstract, composite FROM papers "
            "WHERE run_id=? AND composite IS NOT NULL "
            "ORDER BY composite DESC",
            (run_id,),
        ).fetchall()
        papers = [dict(row) for row in rows]

    if not papers:
        log.info("No scored papers in run %d, skipping", run_id)
        return

    arxiv_map = get_arxiv_id_map(run_id)
    log.info("Enriching %d papers from run %d (%s)...", len(papers), run_id, domain)

    # Step 1: Batch TL;DR + S2 ID
    _batch_tldr(papers)

    # Step 2: Connected papers for top N
    top_papers = papers[:TOP_N_CONNECTIONS]
    for done, paper in enumerate(top_papers, start=1):
        try:
            _fetch_connections(paper, arxiv_map)
        except Exception as exc:  # best-effort: one bad paper must not stop the run
            log.warning("Error fetching connections for %s: %s", paper["arxiv_id"], exc)
        if done % 10 == 0:
            log.info("Connections: %d/%d", done, len(top_papers))

    # Step 3: Topic extraction (local, instant)
    for paper in papers:
        topics = extract_topics(paper["title"], paper.get("abstract", ""), domain)
        if topics:
            update_paper_topics(paper["id"], topics)

    log.info("Done enriching run %d", run_id)
# ---------------------------------------------------------------------------
# Step 1: Batch TL;DR
# ---------------------------------------------------------------------------
def _batch_tldr(papers: list[dict]):
    """Batch fetch TL;DR and S2 paper IDs.

    One POST to the S2 batch endpoint per 500 papers (the endpoint's
    documented maximum). For every paper found, persists the S2 id and
    TL;DR text via update_paper_s2() and records "s2_paper_id" on the
    paper dict in place. All failures are logged and skipped so
    enrichment stays best-effort.
    """
    chunk_size = 500  # S2 batch endpoint accepts at most 500 ids per call
    for start in range(0, len(papers), chunk_size):
        chunk = papers[start : start + chunk_size]
        ids = [f"arXiv:{p['arxiv_id']}" for p in chunk]
        try:
            resp = requests.post(
                f"{S2_GRAPH}/paper/batch",
                params={"fields": "externalIds,tldr"},
                json={"ids": ids},
                headers=S2_HEADERS,
                timeout=30,
            )
            resp.raise_for_status()
            results = resp.json()
        except (requests.RequestException, ValueError) as e:
            # RequestException covers transport + HTTP errors; ValueError
            # covers a non-JSON body from resp.json().
            log.warning("Batch TL;DR failed: %s", e)
            time.sleep(RATE_LIMIT)
            continue
        if not isinstance(results, list):
            # On bad requests the API returns an error object (a dict);
            # zipping a dict would silently iterate its keys.
            log.warning("Batch TL;DR returned unexpected payload: %r", results)
            time.sleep(RATE_LIMIT)
            continue
        for paper, s2_data in zip(chunk, results):
            if s2_data is None:
                continue  # entry is null when S2 does not know the paper
            s2_id = s2_data.get("paperId", "")
            tldr_obj = s2_data.get("tldr")
            tldr_text = tldr_obj.get("text", "") if tldr_obj else ""
            update_paper_s2(paper["id"], s2_id, tldr_text)
            paper["s2_paper_id"] = s2_id
        found = sum(1 for r in results if r is not None)
        log.info("Batch TL;DR: %d/%d papers found in S2", found, len(chunk))
        time.sleep(RATE_LIMIT)
# ---------------------------------------------------------------------------
# Step 2: Connected papers (references + recommendations)
# ---------------------------------------------------------------------------
def _fetch_connections(paper: dict, arxiv_map: dict[str, int]):
    """Fetch references and recommendations for a single paper.

    Drops any previously stored connections, pulls up to 30 references
    and 15 recommendations from S2 (each call rate-limited), and inserts
    the combined rows. Network failures are logged, not raised.
    """
    arxiv_id = paper["arxiv_id"]
    paper_id = paper["id"]

    # Clear old connections before re-fetching
    clear_connections(paper_id)

    connections: list[dict] = []

    # References (papers this one cites)
    time.sleep(RATE_LIMIT)
    try:
        resp = requests.get(
            f"{S2_GRAPH}/paper/arXiv:{arxiv_id}/references",
            params={"fields": "title,year,externalIds", "limit": 30},
            headers=S2_HEADERS,
            timeout=15,
        )
        if resp.ok:
            for item in resp.json().get("data", []):
                row = _connection_row(
                    paper_id, item.get("citedPaper"), "reference", arxiv_map
                )
                if row:
                    connections.append(row)
        else:
            log.debug("References HTTP %s for %s", resp.status_code, arxiv_id)
    except requests.RequestException as e:
        log.warning("References failed for %s: %s", arxiv_id, e)

    # Recommendations (related papers suggested by S2)
    time.sleep(RATE_LIMIT)
    try:
        resp = requests.get(
            f"{S2_RECO}/papers/forpaper/arXiv:{arxiv_id}",
            params={"fields": "title,year,externalIds", "limit": 15},
            headers=S2_HEADERS,
            timeout=15,
        )
        if resp.ok:
            for rec in resp.json().get("recommendedPapers", []):
                row = _connection_row(paper_id, rec, "recommendation", arxiv_map)
                if row:
                    connections.append(row)
        else:
            log.debug("Recommendations HTTP %s for %s", resp.status_code, arxiv_id)
    except requests.RequestException as e:
        log.warning("Recommendations failed for %s: %s", arxiv_id, e)

    if connections:
        insert_connections(connections)


def _connection_row(
    paper_id: int, meta: dict | None, kind: str, arxiv_map: dict[str, int]
) -> dict | None:
    """Build one connection row from an S2 paper stub, or None if unusable.

    Stubs without a title (withdrawn/unresolved papers) are dropped —
    the same filtering previously inlined for both references and
    recommendations.
    """
    if not meta or not meta.get("title"):
        return None
    ext = meta.get("externalIds") or {}
    c_arxiv = ext.get("ArXiv", "")
    return {
        "paper_id": paper_id,
        "connected_arxiv_id": c_arxiv,
        "connected_s2_id": meta.get("paperId", ""),
        "connected_title": meta.get("title", ""),
        "connected_year": meta.get("year"),
        "connection_type": kind,
        # Cross-link to a paper already in this run, when the arXiv id matches.
        "in_db_paper_id": arxiv_map.get(c_arxiv),
    }
# ---------------------------------------------------------------------------
# Step 3: Topic extraction (local, no API)
# ---------------------------------------------------------------------------
# Topic tag -> compiled pattern for AI/ML papers. Definition order matters:
# sorted() is stable, so score ties resolve to the earlier entry here.
AIML_TOPICS = {
    "Video Generation": re.compile(
        r"video.generat|text.to.video|video.diffusion|video.synth|video.edit", re.I),
    "Image Generation": re.compile(
        r"image.generat|text.to.image|(?:stable|latent).diffusion|image.synth|image.edit", re.I),
    "Language Models": re.compile(
        r"language.model|(?:large|foundation).model|\bllm\b|\bgpt\b|instruction.tun|fine.tun", re.I),
    "Code": re.compile(
        r"code.generat|code.complet|program.synth|vibe.cod|software.engineer", re.I),
    "Multimodal": re.compile(
        r"multimodal|vision.language|\bvlm\b|visual.question|image.text", re.I),
    "Efficiency": re.compile(
        r"quantiz|distillat|pruning|efficient|scaling.law|compress|accelerat", re.I),
    "Agents": re.compile(
        r"\bagent\b|tool.use|function.call|planning|agentic", re.I),
    "Speech / Audio": re.compile(
        r"text.to.speech|\btts\b|speech|audio.generat|voice|music.generat", re.I),
    "3D / Vision": re.compile(
        r"\b3d\b|nerf|gaussian.splat|point.cloud|depth.estim|object.detect|segmentat", re.I),
    "Retrieval / RAG": re.compile(
        r"retriev|\brag\b|knowledge.(?:base|graph)|in.context.learn|embedding", re.I),
    "Robotics": re.compile(
        r"robot|embodied|manipulat|locomotion|navigation", re.I),
    "Reasoning": re.compile(
        r"reasoning|chain.of.thought|mathemat|logic|theorem", re.I),
    "Training": re.compile(
        r"reinforcement.learn|\brlhf\b|\bdpo\b|preference|reward.model|alignment", re.I),
    "Architecture": re.compile(
        r"attention.mechanism|state.space|\bmamba\b|mixture.of.expert|\bmoe\b|transformer", re.I),
    "Benchmark": re.compile(
        r"benchmark|evaluat|leaderboard|dataset|scaling.law", re.I),
    "World Models": re.compile(
        r"world.model|environment.model|predictive.model|dynamics.model", re.I),
    "Optimization": re.compile(
        r"optimi[zs]|gradient|convergence|learning.rate|loss.function|multi.objective|adversarial.train", re.I),
    "RL": re.compile(
        r"reinforcement.learn|\brl\b|reward|policy.gradient|q.learning|bandit", re.I),
}

# Topic tag -> compiled pattern for security papers. Same ordering rule.
SECURITY_TOPICS = {
    "Web Security": re.compile(
        r"web.(?:secur|app|vuln)|xss|injection|csrf|waf|\bbrowser.secur", re.I),
    "Network": re.compile(
        r"network.secur|intrusion|\bids\b|firewall|traffic|\bdns\b|\bbgp\b|\bddos\b|fingerprint|scanning|packet", re.I),
    "Malware": re.compile(
        r"malware|ransomware|trojan|botnet|rootkit|worm|backdoor", re.I),
    "Vulnerabilities": re.compile(
        r"vulnerab|\bcve\b|exploit|fuzzing|fuzz|buffer.overflow|zero.day|attack.surface|security.bench", re.I),
    "Cryptography": re.compile(
        r"cryptograph|encryption|decrypt|protocol|\btls\b|\bssl\b|cipher", re.I),
    "Hardware": re.compile(
        r"side.channel|timing.attack|spectre|meltdown|hardware|firmware|microarch|fault.inject|emfi|embedded.secur", re.I),
    "Reverse Engineering": re.compile(
        r"reverse.engineer|binary|decompil|obfuscat|disassembl", re.I),
    "Mobile": re.compile(
        r"\bandroid\b|\bios.secur|mobile.secur", re.I),
    "Cloud": re.compile(
        r"cloud.secur|container.secur|docker|kubernetes|serverless|devsecops", re.I),
    "Authentication": re.compile(
        r"authentica|identity|credential|phishing|password|oauth|passkey|webauthn", re.I),
    "Privacy": re.compile(
        r"privacy|anonymi|differential.privacy|data.leak|tracking|membership.inference", re.I),
    "LLM Security": re.compile(
        r"(?:llm|language.model).*(secur|attack|jailbreak|safety|risk|unsafe|inject|adversar)|prompt.inject|red.team|rubric.attack|preference.drift", re.I),
    "Forensics": re.compile(
        r"forensic|incident.response|audit|log.analy|carver|tamper|evidence", re.I),
    "Blockchain": re.compile(
        r"blockchain|smart.contract|solana|ethereum|memecoin|mev|defi|token|cryptocurrency", re.I),
    "Supply Chain": re.compile(
        r"supply.chain|dependency|package.secur|software.comp|sbom", re.I),
}


def extract_topics(title: str, abstract: str, domain: str) -> list[str]:
    """Extract up to 3 topic tags from title and abstract.

    A title hit scores 3 (strong signal), a hit in the first 500 chars of
    the abstract scores 1. Tags are returned best-first; score ties keep
    the pattern-table order (sorted() is stable). domain == "aiml" selects
    AIML_TOPICS, everything else SECURITY_TOPICS. Tolerates None for
    title and abstract (DB rows may have NULLs).
    """
    patterns = AIML_TOPICS if domain == "aiml" else SECURITY_TOPICS
    title = title or ""  # title can be NULL in the DB; original crashed here
    abstract_head = (abstract or "")[:500]  # head of the abstract is enough signal
    scored: dict[str, int] = {}
    for topic, pattern in patterns.items():
        score = 0
        if pattern.search(title):
            score += 3  # Title match is strong signal
        if pattern.search(abstract_head):
            score += 1
        if score > 0:
            scored[topic] = score
    ranked = sorted(scored.items(), key=lambda x: -x[1])
    return [t for t, _ in ranked[:3]]