File size: 6,176 Bytes
b98fd6c dcd26cc b98fd6c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 | """
OpenAlex API integration for RECON v2.
Provides paper search and DOI-based lookup with citation centrality.
API key required (free at openalex.org/settings/api).
Store as OPENALEX_API_KEY in .env
Rate limits (free tier, April 2026):
- Singleton DOI lookups: unlimited
- List/filter searches: 10,000/day
- Full-text search: 1,000/day
"""
import os
import time
import hashlib
import json
import logging
import requests
from typing import Optional
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Root URL that the endpoint paths used in this module are appended to.
OPENALEX_BASE = "https://api.openalex.org"
TIMEOUT = 8  # seconds — fail fast, OpenAlex is usually quick
# Default number of works requested per search query.
MAX_RESULTS_PER_QUERY = 5
def _get_api_key() -> Optional[str]:
    """Return the OPENALEX_API_KEY environment variable, or None when it is unset."""
    api_key = os.environ.get("OPENALEX_API_KEY")
    return api_key
def _headers() -> dict:
    """
    Build the HTTP headers for an OpenAlex request.

    Returns a dict carrying the API key as a Bearer token when a key is
    configured; otherwise logs at debug level and returns an empty dict.
    """
    # NOTE(review): confirm OpenAlex accepts the key as a Bearer header —
    # its docs appear to describe an `api_key` query parameter instead.
    token = _get_api_key()
    if not token:
        logger.debug("OPENALEX_API_KEY not set β requests may be rate limited")
        return {}
    return {"Authorization": f"Bearer {token}"}
def _safe_get(url: str, params: dict) -> Optional[dict]:
    """
    Issue one GET request and return the decoded JSON body.

    Never raises: a non-200 status, a timeout, or any other exception is
    logged as a warning and reported to the caller as None.
    """
    try:
        response = requests.get(
            url, params=params, headers=_headers(), timeout=TIMEOUT
        )
        status = response.status_code
        if status == 200:
            return response.json()
        if status == 429:
            logger.warning("OpenAlex rate limit hit β skipping")
        else:
            logger.warning(f"OpenAlex {status} for {url}")
    except requests.exceptions.Timeout:
        logger.warning(f"OpenAlex timeout for {url}")
    except Exception as exc:
        # Deliberate catch-all: callers rely on this helper never raising.
        logger.warning(f"OpenAlex error: {exc}")
    # Every path that did not return parsed JSON ends up here.
    return None
def _parse_work(work: dict) -> Optional[dict]:
    """
    Flatten one OpenAlex Work object into a dict compatible with RECON's
    Paper dataclass fields.

    Args:
        work: A decoded Work object from an OpenAlex API response.

    Returns:
        A dict with keys title, year, doi, abstract, citation_count,
        authors, paper_id, url and source, or None when the work has no
        title.
    """
    title = work.get("title") or ""
    if not title:
        return None

    # OpenAlex DOIs come as full URLs — strip to the bare DOI.
    doi = (work.get("doi") or "").removeprefix("https://doi.org/")

    # The abstract arrives as an inverted index {word: [positions]};
    # rebuild the text by laying the words out in position order.
    inverted = work.get("abstract_inverted_index") or {}
    placed = sorted(
        ((pos, word) for word, positions in inverted.items() for pos in positions),
        key=lambda item: item[0],
    )
    abstract = " ".join(word for _, word in placed)

    # Only the first five authorships are considered; entries without a
    # display name are dropped.
    names = [
        (entry.get("author") or {}).get("display_name") or ""
        for entry in (work.get("authorships") or [])[:5]
    ]
    authors_str = ", ".join(name for name in names if name)

    # Stable ID: prefer DOI, fall back to OpenAlex ID.
    openalex_id = work.get("id") or ""
    paper_id = f"openalex:{doi}" if doi else f"openalex:{openalex_id}"

    # URL preference: DOI link, then the primary location's landing page,
    # then the work's OpenAlex id (documented by OpenAlex as a URL of the
    # form https://openalex.org/W...). The last step keeps `url` non-empty
    # for works that have neither a DOI nor a landing page.
    if doi:
        url = f"https://doi.org/{doi}"
    else:
        landing = (work.get("primary_location") or {}).get("landing_page_url")
        url = landing or openalex_id

    return {
        "title": title,
        "year": work.get("publication_year"),
        "doi": doi,
        "abstract": abstract,
        "citation_count": work.get("cited_by_count") or 0,
        "authors": authors_str,
        "paper_id": paper_id,
        "url": url,
        "source": "openalex",
    }
def search_openalex(query: str, max_results: int = MAX_RESULTS_PER_QUERY) -> list[dict]:
    """
    Search OpenAlex for articles matching a query string, most cited first.

    Args:
        query: Free-text search string.
        max_results: Number of works to request. Values above 200 are
            clamped, since OpenAlex documents 200 as the per-page maximum.

    Returns:
        A list of parsed paper dicts (compatible with Paper dataclass
        fields). Returns an empty list for a blank query, a non-positive
        max_results, or any request failure — never raises.
    """
    # A blank query or a non-positive limit cannot yield useful results;
    # answer locally instead of spending a request from the daily quota.
    if not query or not query.strip() or max_results <= 0:
        return []

    params = {
        "search": query,
        "filter": "type:article",
        "sort": "cited_by_count:desc",
        # An out-of-range page size would be rejected by the API and surface
        # here as a silent empty result, so cap it at the documented ceiling.
        "per-page": min(max_results, 200),
        "select": "id,title,publication_year,doi,cited_by_count,abstract_inverted_index,authorships,primary_location",
    }
    data = _safe_get(f"{OPENALEX_BASE}/works", params)
    if not data:
        return []

    # Works that fail to parse (no title) come back as None and are dropped.
    parsed_works = (_parse_work(work) for work in (data.get("results") or []))
    results = [paper for paper in parsed_works if paper]
    logger.info(f"OpenAlex search '{query[:40]}': {len(results)} results")
    return results
def get_openalex_by_doi(doi: str) -> Optional[dict]:
    """
    Look up a single paper by DOI. Used for enrichment — getting
    cited_by_count for papers already retrieved from Semantic Scholar.

    Accepts a bare DOI or a full https://doi.org/ URL.
    Returns the parsed paper dict, or None when the DOI is empty, the
    request fails, or nothing matches.
    """
    if not doi:
        return None

    bare_doi = doi.strip().removeprefix("https://doi.org/")
    # NOTE(review): this goes through the /works list endpoint with a filter,
    # not a singleton lookup — confirm which rate-limit bucket it counts
    # against.
    data = _safe_get(
        f"{OPENALEX_BASE}/works",
        {
            "filter": f"doi:{bare_doi}",
            "select": "id,title,publication_year,doi,cited_by_count,abstract_inverted_index,authorships",
        },
    )
    matches = (data or {}).get("results") or []
    return _parse_work(matches[0]) if matches else None
def get_citation_centrality(doi: Optional[str], citation_count: Optional[int] = None) -> float:
    """
    Compute normalized citation centrality for a paper.

    If doi is provided, fetches cited_by_count from OpenAlex for accuracy.
    If doi is missing or the fetch fails, uses the provided citation_count
    as a fallback.

    Formula: min(1.0, log1p(cited_by_count) / log1p(10000))
    This matches the existing authority_score formula in retriever_utils.py
    so the scales are comparable.

    Args:
        doi: DOI of the paper, or an empty value to skip the lookup.
        citation_count: Fallback count used when no OpenAlex record is found.

    Returns:
        A float in [0, 1]. Returns 0.0 when no usable count is available.
    """
    import math

    count = None
    if doi:
        paper = get_openalex_by_doi(doi)
        if paper:
            count = paper.get("citation_count")
    if count is None:
        count = citation_count or 0

    # log1p raises ValueError for inputs <= -1 and is negative on (-1, 0);
    # clamp so a bad fallback count cannot break the documented [0, 1] range.
    count = max(count, 0)
    return min(1.0, math.log1p(count) / math.log1p(10000))
|