Spaces:

amarck
/

Researcher

Sleeping

App Files Files Community

Researcher / src /pipelines /aiml.py

amarck

Initial commit: Research Intelligence System

a0f27fa about 1 month ago

raw

history blame contribute delete

10.7 kB

	"""AI/ML paper pipeline.

	Fetches papers from HuggingFace Daily Papers + arXiv, enriches with
	HF ecosystem metadata, and writes to the database.
	"""

	import logging
	import re
	import time
	from datetime import datetime, timedelta, timezone

	import arxiv
	import requests

	from src.config import (
	ARXIV_LARGE_CATS,
	ARXIV_SMALL_CATS,
	EXCLUDE_RE,
	GITHUB_URL_RE,
	HF_API,
	HF_MAX_AGE_DAYS,
	INCLUDE_RE,
	MAX_ABSTRACT_CHARS_AIML,
	)
	from src.db import create_run, finish_run, insert_papers

	log = logging.getLogger(__name__)


	# ---------------------------------------------------------------------------
	# HuggingFace API
	# ---------------------------------------------------------------------------


	def fetch_hf_daily(date_str: str) -> list[dict]:
	    """Fetch HF Daily Papers for a given date.

	    Args:
	        date_str: Date to query; it is interpolated as-is into the ``date``
	            query parameter of the ``daily_papers`` endpoint.

	    Returns:
	        The decoded JSON list from the endpoint, or an empty list when the
	        request fails, the body is not valid JSON, or the payload is not a
	        list. Failures are logged instead of raised (best-effort fetch).
	    """
	    url = f"{HF_API}/daily_papers?date={date_str}"
	    try:
	        resp = requests.get(url, timeout=30)
	        resp.raise_for_status()
	        payload = resp.json()
	    except (requests.RequestException, ValueError) as exc:
	        log.warning("HF daily papers fetch failed for %s: %s", date_str, exc)
	        return []

	    if not isinstance(payload, list):
	        # A non-list body (e.g. an error object) would corrupt list.extend()
	        # in callers, so treat it like a failed fetch.
	        log.warning(
	            "HF daily papers for %s returned %s instead of a list",
	            date_str,
	            type(payload).__name__,
	        )
	        return []
	    return payload


	def fetch_hf_trending(limit: int = 50) -> list[dict]:
	    """Fetch HF trending papers.

	    Args:
	        limit: Value sent as the ``limit`` query parameter.

	    Returns:
	        The decoded JSON list from the endpoint, or an empty list when the
	        request fails, the body is not valid JSON, or the payload is not a
	        list. Failures are logged instead of raised (best-effort fetch).
	    """
	    url = f"{HF_API}/daily_papers?sort=trending&limit={limit}"
	    try:
	        resp = requests.get(url, timeout=30)
	        resp.raise_for_status()
	        payload = resp.json()
	    except (requests.RequestException, ValueError) as exc:
	        log.warning("HF trending papers fetch failed: %s", exc)
	        return []

	    if not isinstance(payload, list):
	        # A non-list body (e.g. an error object) would corrupt list.extend()
	        # in callers, so treat it like a failed fetch.
	        log.warning(
	            "HF trending papers returned %s instead of a list",
	            type(payload).__name__,
	        )
	        return []
	    return payload


	def arxiv_id_to_date(arxiv_id: str) -> datetime \| None:
	"""Extract approximate publication date from arXiv ID (YYMM.NNNNN)."""
	match = re.match(r"(\d{2})(\d{2})\.\d+", arxiv_id)
	if not match:
	return None
	year = 2000 + int(match.group(1))
	month = int(match.group(2))
	if not (1 <= month <= 12):
	return None
	return datetime(year, month, 1, tzinfo=timezone.utc)


	def normalize_hf_paper(hf_entry: dict) -> dict | None:
	    """Convert an HF daily_papers entry to our normalized format.

	    The paper fields are read from ``hf_entry["paper"]`` when that key holds
	    a non-empty value, otherwise from ``hf_entry`` itself. Missing or null
	    text fields are normalized to empty strings and null upvotes to 0.

	    Returns None if the paper is too old, i.e. the date derived from its
	    arXiv ID is more than HF_MAX_AGE_DAYS days in the past. Entries whose
	    ID cannot be dated are kept.
	    """
	    paper = hf_entry.get("paper") or hf_entry
	    arxiv_id = paper.get("id") or ""

	    # Check the age first so stale entries skip the remaining work.
	    pub_date = arxiv_id_to_date(arxiv_id)
	    if pub_date and (datetime.now(timezone.utc) - pub_date).days > HF_MAX_AGE_DAYS:
	        return None

	    authors = []
	    for a in paper.get("authors") or []:
	        if isinstance(a, dict):
	            user = a.get("user")
	            # Fall back to the linked user's full name when "name" is empty.
	            name = a.get("name") or (user.get("fullname", "") if isinstance(user, dict) else "")
	            if name:
	                authors.append(name)
	        elif isinstance(a, str):
	            authors.append(a)

	    github_repo = hf_entry.get("githubRepo") or paper.get("githubRepo") or ""
	    title = paper.get("title") or ""
	    abstract = paper.get("summary") or paper.get("abstract") or ""

	    return {
	        "arxiv_id": arxiv_id,
	        "title": title.replace("\n", " ").strip(),
	        "authors": authors[:10],
	        "abstract": abstract.replace("\n", " ").strip(),
	        "published": paper.get("publishedAt", paper.get("published", "")),
	        "categories": paper.get("categories", []),
	        "pdf_url": f"https://arxiv.org/pdf/{arxiv_id}" if arxiv_id else "",
	        "arxiv_url": f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else "",
	        "comment": "",
	        "source": "hf",
	        # A null upvote count would break numeric comparisons downstream.
	        "hf_upvotes": paper.get("upvotes", hf_entry.get("upvotes", 0)) or 0,
	        "github_repo": github_repo,
	        "github_stars": None,
	        "hf_models": [],
	        "hf_datasets": [],
	        "hf_spaces": [],
	    }


	# ---------------------------------------------------------------------------
	# arXiv fetching
	# ---------------------------------------------------------------------------


	def fetch_arxiv_category(
	    cat: str,
	    start: datetime,
	    end: datetime,
	    max_results: int,
	    filter_keywords: bool,
	) -> list[dict]:
	    """Collect normalized papers for one arXiv category within a time window.

	    Results are requested newest-first by submission date, so iteration
	    stops at the first paper published before ``start``. Papers published
	    after ``end`` are skipped. When ``filter_keywords`` is true, a paper is
	    kept only if its title/summary text matches INCLUDE_RE and does not
	    match EXCLUDE_RE.
	    """
	    client = arxiv.Client(page_size=200, delay_seconds=3.0, num_retries=3)
	    search = arxiv.Search(
	        query=f"cat:{cat}",
	        max_results=max_results,
	        sort_by=arxiv.SortCriterion.SubmittedDate,
	        sort_order=arxiv.SortOrder.Descending,
	    )

	    collected = []
	    for entry in client.results(search):
	        published_utc = entry.published.replace(tzinfo=timezone.utc)
	        if published_utc < start:
	            # Everything after this point is older still.
	            break
	        if published_utc > end:
	            continue

	        if filter_keywords:
	            haystack = f"{entry.title} {entry.summary}"
	            if not INCLUDE_RE.search(haystack) or EXCLUDE_RE.search(haystack):
	                continue

	        collected.append(_arxiv_result_to_dict(entry))

	    return collected


	def _arxiv_result_to_dict(result: arxiv.Result) -> dict:
	    """Build our normalized paper dict from an ``arxiv.Result``.

	    The stored ``arxiv_id`` is the part of ``entry_id`` after ``/abs/`` with
	    any trailing ``vN`` removed. ``github_repo`` is the first GITHUB_URL_RE
	    match found in the summary or comment (trailing dots stripped), or an
	    empty string when there is none.
	    """
	    comment_text = result.comment or ""

	    versioned_id = result.entry_id.rpartition("/abs/")[2]
	    unversioned_id = re.sub(r"v\d+$", "", versioned_id)

	    repo_matches = GITHUB_URL_RE.findall(f"{result.summary} {comment_text}")
	    if repo_matches:
	        repo_url = repo_matches[0].rstrip(".")
	    else:
	        repo_url = ""

	    author_names = [author.name for author in result.authors[:10]]

	    return {
	        "arxiv_id": unversioned_id,
	        "title": result.title.replace("\n", " ").strip(),
	        "authors": author_names,
	        "abstract": result.summary.replace("\n", " ").strip(),
	        "published": result.published.isoformat(),
	        "categories": list(result.categories),
	        "pdf_url": result.pdf_url,
	        "arxiv_url": result.entry_id,
	        "comment": comment_text.replace("\n", " ").strip(),
	        "source": "arxiv",
	        "hf_upvotes": 0,
	        "github_repo": repo_url,
	        "github_stars": None,
	        "hf_models": [],
	        "hf_datasets": [],
	        "hf_spaces": [],
	    }


	# ---------------------------------------------------------------------------
	# Enrichment
	# ---------------------------------------------------------------------------


	def enrich_paper(paper: dict) -> dict:
	    """Query HF API for linked models, datasets, and spaces.

	    For each resource type, the lookup filters by ``arxiv:<id>`` (version
	    suffix removed), sorts by likes, and stores ``{"id", "likes"}`` dicts
	    under ``hf_models`` (up to 5), ``hf_datasets`` (up to 3) and
	    ``hf_spaces`` (up to 3).

	    Args:
	        paper: Normalized paper dict; it is updated in place.

	    Returns:
	        The same ``paper`` object. It is returned untouched, with no network
	        calls, when it has no arXiv ID. A lookup that fails, returns a
	        non-OK status, or yields a non-list body leaves that key unchanged.
	    """
	    arxiv_id = paper.get("arxiv_id")
	    if not arxiv_id:
	        return paper

	    base_id = re.sub(r"v\d+$", "", arxiv_id)

	    for resource, key, limit in (
	        ("models", "hf_models", 5),
	        ("datasets", "hf_datasets", 3),
	        ("spaces", "hf_spaces", 3),
	    ):
	        url = f"{HF_API}/{resource}?filter=arxiv:{base_id}&limit={limit}&sort=likes"
	        try:
	            resp = requests.get(url, timeout=15)
	            items = resp.json() if resp.ok else None
	        except (requests.RequestException, ValueError) as exc:
	            # Enrichment is best-effort: record the failure and move on.
	            log.debug("HF %s lookup failed for %s: %s", resource, base_id, exc)
	            continue

	        if not isinstance(items, list):
	            continue
	        paper[key] = [
	            {"id": item.get("id", item.get("_id", "")), "likes": item.get("likes", 0)}
	            for item in items
	            if isinstance(item, dict)
	        ]

	    # Brief pause after each paper's lookups to space out requests.
	    time.sleep(0.2)
	    return paper


	# ---------------------------------------------------------------------------
	# Merge
	# ---------------------------------------------------------------------------


	def merge_papers(hf_papers: list[dict], arxiv_papers: list[dict]) -> list[dict]:
	    """Deduplicate by arXiv ID. When both sources have a paper, merge.

	    IDs are compared with any trailing ``vN`` version suffix removed, and
	    papers without an ID are dropped. arXiv entries take precedence: an HF
	    entry for the same ID only contributes its upvotes (the larger value
	    wins), its GitHub repo and its categories when the existing entry lacks
	    them. The entry is labelled ``"both"`` only when the ID really came from
	    arXiv; repeated HF entries for the same ID are folded together and keep
	    their original source label.

	    Args:
	        hf_papers: Normalized papers from HuggingFace.
	        arxiv_papers: Normalized papers from arXiv.

	    Returns:
	        One dict per unique ID. The dicts are the input objects themselves,
	        and surviving entries are updated in place.
	    """
	    by_id: dict[str, dict] = {}

	    for p in arxiv_papers:
	        aid = re.sub(r"v\d+$", "", p.get("arxiv_id") or "")
	        if aid:
	            by_id[aid] = p

	    # Snapshot the arXiv IDs so HF-only duplicates are not labelled "both".
	    arxiv_ids = set(by_id)

	    for p in hf_papers:
	        aid = re.sub(r"v\d+$", "", p.get("arxiv_id") or "")
	        if not aid:
	            continue

	        existing = by_id.get(aid)
	        if existing is None:
	            by_id[aid] = p
	            continue

	        if aid in arxiv_ids:
	            existing["source"] = "both"
	        # "or 0" guards against null upvote counts, which max() cannot compare.
	        existing["hf_upvotes"] = max(existing.get("hf_upvotes") or 0, p.get("hf_upvotes") or 0)
	        if p.get("github_repo") and not existing.get("github_repo"):
	            existing["github_repo"] = p["github_repo"]
	        if not existing.get("categories") and p.get("categories"):
	            existing["categories"] = p["categories"]

	    return list(by_id.values())


	# ---------------------------------------------------------------------------
	# Pipeline entry point
	# ---------------------------------------------------------------------------


	def run_aiml_pipeline(
	    start: datetime | None = None,
	    end: datetime | None = None,
	    max_papers: int = 300,
	    skip_enrich: bool = False,
	) -> int:
	    """Run the full AI/ML pipeline. Returns the run ID.

	    Steps: fetch HF daily papers for every calendar day in the window plus
	    the trending list, fetch arXiv papers per category, merge the two
	    sources, optionally enrich each paper with HF links, then insert the
	    papers and close the run.

	    Args:
	        start: Beginning of the window; defaults to 7 days before ``end``.
	            A naive value is treated as UTC.
	        end: End of the window; defaults to the current UTC time. A naive
	            value is treated as UTC and moved to 23:59:59 of its day.
	        max_papers: ``max_results`` passed to each arXiv category query.
	        skip_enrich: When true, skip the per-paper HF enrichment step.

	    Returns:
	        The run ID returned by ``create_run``.

	    Raises:
	        Exception: Any error raised after the run is created is logged, the
	            run is finished with status "failed", and the error re-raised.
	    """
	    if end is None:
	        end = datetime.now(timezone.utc)
	    if start is None:
	        start = end - timedelta(days=7)

	    # Ensure timezone-aware
	    if start.tzinfo is None:
	        start = start.replace(tzinfo=timezone.utc)
	    if end.tzinfo is None:
	        end = end.replace(tzinfo=timezone.utc, hour=23, minute=59, second=59)

	    run_id = create_run("aiml", start.date().isoformat(), end.date().isoformat())
	    log.info("Run %d: %s to %s", run_id, start.date(), end.date())

	    try:
	        # Step 1: Fetch HF papers
	        log.info("Fetching HuggingFace Daily Papers ...")
	        hf_papers_raw = []
	        # Step by calendar date, not datetime, so the final day is included
	        # even when start's time of day is later than end's.
	        day = start.date()
	        last_day = end.date()
	        while day <= last_day:
	            hf_papers_raw.extend(fetch_hf_daily(day.isoformat()))
	            day += timedelta(days=1)

	        hf_papers_raw.extend(fetch_hf_trending(limit=50))

	        hf_papers = [p for p in (normalize_hf_paper(e) for e in hf_papers_raw) if p is not None]
	        log.info("HF papers: %d", len(hf_papers))

	        # Step 2: Fetch arXiv papers
	        log.info("Fetching arXiv papers ...")
	        arxiv_papers = []
	        for cat in ARXIV_LARGE_CATS:
	            papers = fetch_arxiv_category(cat, start, end, max_papers, filter_keywords=True)
	            arxiv_papers.extend(papers)
	            log.info(" %s: %d papers (keyword-filtered)", cat, len(papers))

	        for cat in ARXIV_SMALL_CATS:
	            papers = fetch_arxiv_category(cat, start, end, max_papers, filter_keywords=False)
	            arxiv_papers.extend(papers)
	            log.info(" %s: %d papers", cat, len(papers))

	        # Step 3: Merge
	        all_papers = merge_papers(hf_papers, arxiv_papers)
	        log.info("Merged: %d unique papers", len(all_papers))

	        # Step 4: Enrich
	        if not skip_enrich:
	            log.info("Enriching with HF ecosystem links ...")
	            for i, paper in enumerate(all_papers):
	                all_papers[i] = enrich_paper(paper)
	                if (i + 1) % 25 == 0:
	                    log.info(" Enriched %d/%d ...", i + 1, len(all_papers))
	            log.info("Enrichment complete")

	        # Step 5: Insert into DB
	        insert_papers(all_papers, run_id, "aiml")
	        finish_run(run_id, len(all_papers))
	        log.info("Done — %d papers inserted", len(all_papers))
	        return run_id

	    except Exception:
	        # Log first so the traceback is recorded even if marking the run fails.
	        log.exception("Pipeline failed")
	        finish_run(run_id, 0, status="failed")
	        raise