# src/openalex_utils.py
"""
OpenAlex API integration for RECON v2.
Provides paper search and DOI-based lookup with citation centrality.
An API key is recommended (free at openalex.org/settings/api); store it as
OPENALEX_API_KEY in .env. Requests without a key may be rate limited.
Rate limits (free tier, April 2026):
- Singleton DOI lookups: unlimited
- List/filter searches: 10,000/day
- Full-text search: 1,000/day
"""
import os
import math
import logging
import requests
from typing import Optional
logger = logging.getLogger(__name__)
OPENALEX_BASE = "https://api.openalex.org"
TIMEOUT = 8 # seconds — fail fast, OpenAlex is usually quick
MAX_RESULTS_PER_QUERY = 5
def _get_api_key() -> Optional[str]:
return os.environ.get("OPENALEX_API_KEY")
def _headers() -> dict:
key = _get_api_key()
if key:
return {"Authorization": f"Bearer {key}"}
logger.debug("OPENALEX_API_KEY not set — requests may be rate limited")
return {}
def _safe_get(url: str, params: dict) -> Optional[dict]:
"""Single GET with timeout and error handling. Returns parsed JSON or None."""
try:
resp = requests.get(url, params=params, headers=_headers(), timeout=TIMEOUT)
if resp.status_code == 200:
return resp.json()
elif resp.status_code == 429:
logger.warning("OpenAlex rate limit hit — skipping")
return None
else:
logger.warning(f"OpenAlex {resp.status_code} for {url}")
return None
except requests.exceptions.Timeout:
logger.warning(f"OpenAlex timeout for {url}")
return None
except Exception as e:
logger.warning(f"OpenAlex error: {e}")
return None
def _parse_work(work: dict) -> Optional[dict]:
"""
Parse a single OpenAlex Work object into a flat dict compatible with
RECON's Paper dataclass fields.
Returns dict with keys: title, year, doi, abstract, citation_count,
authors, paper_id, url, source
Returns None if work is missing essential fields.
"""
title = work.get("title") or ""
if not title:
return None
year = work.get("publication_year")
doi = work.get("doi") or ""
# OpenAlex DOIs come as full URLs — strip to bare DOI
if doi.startswith("https://doi.org/"):
doi = doi[len("https://doi.org/"):]
citation_count = work.get("cited_by_count") or 0
# Abstract: OpenAlex stores as inverted index — reconstruct
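    # Illustrative round-trip of the reconstruction below (hypothetical input):
    #   {"deep": [0], "learning": [1, 3], "for": [2]} -> "deep learning for learning"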
abstract = ""
inv_abstract = work.get("abstract_inverted_index") or {}
if inv_abstract:
word_positions = []
for word, positions in inv_abstract.items():
for pos in positions:
word_positions.append((pos, word))
word_positions.sort(key=lambda x: x[0])
abstract = " ".join(w for _, w in word_positions)
# Authors
authorships = work.get("authorships") or []
authors = []
for a in authorships[:5]: # cap at 5
display = (a.get("author") or {}).get("display_name") or ""
if display:
authors.append(display)
authors_str = ", ".join(authors)
# Stable ID: prefer DOI, fall back to OpenAlex ID
openalex_id = work.get("id") or ""
paper_id = f"openalex:{doi}" if doi else f"openalex:{openalex_id}"
# URL: prefer DOI link, then OpenAlex page
url = f"https://doi.org/{doi}" if doi else (work.get("primary_location") or {}).get("landing_page_url") or ""
return {
"title": title,
"year": year,
"doi": doi,
"abstract": abstract,
"citation_count": citation_count,
"authors": authors_str,
"paper_id": paper_id,
"url": url,
"source": "openalex",
}
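# Example of the flat dict _parse_work returns (values are illustrative,
# not real OpenAlex data):
#     {"title": "Deep Learning", "year": 2015, "doi": "10.1038/nature14539",
#      "abstract": "...", "citation_count": 12345,
#      "authors": "Yann LeCun, Yoshua Bengio, Geoffrey Hinton",
#      "paper_id": "openalex:10.1038/nature14539",
#      "url": "https://doi.org/10.1038/nature14539", "source": "openalex"}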
def search_openalex(query: str, max_results: int = MAX_RESULTS_PER_QUERY) -> list[dict]:
"""
Search OpenAlex for papers matching query string.
Returns list of parsed paper dicts (compatible with Paper dataclass fields).
Returns empty list on any failure — never raises.
"""
params = {
"search": query,
"filter": "type:article",
"sort": "cited_by_count:desc",
"per-page": max_results,
"select": "id,title,publication_year,doi,cited_by_count,abstract_inverted_index,authorships,primary_location",
}
data = _safe_get(f"{OPENALEX_BASE}/works", params)
if not data:
return []
results = []
for work in (data.get("results") or []):
parsed = _parse_work(work)
if parsed:
results.append(parsed)
logger.info(f"OpenAlex search '{query[:40]}': {len(results)} results")
return results
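# Usage sketch for search_openalex (live network call; query is illustrative):
#     papers = search_openalex("retrieval augmented generation", max_results=3)
#     for p in papers:
#         print(p["year"], p["citation_count"], p["title"])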
def get_openalex_by_doi(doi: str) -> Optional[dict]:
"""
Fetch a single paper by DOI. Used for enrichment — getting cited_by_count
for papers already retrieved from Semantic Scholar.
Returns parsed paper dict or None.
"""
if not doi:
return None
clean_doi = doi.strip()
if clean_doi.startswith("https://doi.org/"):
clean_doi = clean_doi[len("https://doi.org/"):]
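    # Note: OpenAlex also supports singleton lookup (GET /works/doi:{doi}),
    # which the module docstring's rate-limit notes treat as unlimited; the
    # filter query used here counts against the list/filter quota instead.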
params = {
"filter": f"doi:{clean_doi}",
"select": "id,title,publication_year,doi,cited_by_count,abstract_inverted_index,authorships",
}
data = _safe_get(f"{OPENALEX_BASE}/works", params)
if not data:
return None
results = data.get("results") or []
if not results:
return None
return _parse_work(results[0])
def get_citation_centrality(doi: str, citation_count: Optional[int] = None) -> float:
"""
Compute normalized citation centrality for a paper.
If doi is provided, fetches cited_by_count from OpenAlex for accuracy.
If doi is missing or fetch fails, uses provided citation_count as fallback.
Formula: min(1.0, log1p(cited_by_count) / log1p(10000))
This matches the existing authority_score formula in retriever_utils.py
so the scales are comparable.
Returns float in [0, 1]. Returns 0.0 on complete failure.
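    Worked example: cited_by_count=100 gives
    log1p(100) / log1p(10000) = 4.615 / 9.210 ≈ 0.50.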
"""
count = None
if doi:
paper = get_openalex_by_doi(doi)
if paper:
count = paper.get("citation_count")
if count is None:
count = citation_count or 0
return min(1.0, math.log1p(count) / math.log1p(10000))
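if __name__ == "__main__":
    # Minimal smoke-test sketch: hits the live API, so it needs network
    # access (and optionally OPENALEX_API_KEY); the DOI is illustrative.
    logging.basicConfig(level=logging.INFO)
    hit = get_openalex_by_doi("10.1038/nature14539")
    if hit:
        print(hit["title"], "-", hit["citation_count"], "citations")
    print("centrality:", round(get_citation_centrality("10.1038/nature14539"), 3))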