MukulRay committed on
Commit
b98fd6c
·
1 Parent(s): 10f9a75

Phase 2.1: add OpenAlex API integration (search, DOI lookup, citation centrality)

Browse files
Files changed (1) hide show
  1. src/openalex_utils.py +198 -0
src/openalex_utils.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ OpenAlex API integration for RECON v2.
3
+ Provides paper search and DOI-based lookup with citation centrality.
4
+
5
+ API key required (free at openalex.org/settings/api).
6
+ Store as OPENALEX_API_KEY in .env
7
+
8
+ Rate limits (free tier, April 2026):
9
+ - Singleton DOI lookups: unlimited
10
+ - List/filter searches: 10,000/day
11
+ - Full-text search: 1,000/day
12
+ """
13
+
14
+ import os
15
+ import time
16
+ import hashlib
17
+ import json
18
+ import logging
19
+ import requests
20
+ from typing import Optional
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ OPENALEX_BASE = "https://api.openalex.org"
25
+ TIMEOUT = 8 # seconds — fail fast, OpenAlex is usually quick
26
+ MAX_RESULTS_PER_QUERY = 5
27
+
28
+
29
+ def _get_api_key() -> Optional[str]:
30
+ return os.environ.get("OPENALEX_API_KEY")
31
+
32
+
33
def _headers() -> dict:
    """Build the HTTP headers for an OpenAlex request.

    Returns:
        A dict with a Bearer ``Authorization`` header when an API key is
        configured; otherwise an empty dict, after logging a warning.
    """
    key = _get_api_key()
    if not key:
        logger.warning("OPENALEX_API_KEY not set — requests may be rate limited")
        return {}
    # NOTE(review): OpenAlex documents the key as an `api_key` query
    # parameter — confirm the Bearer header form is accepted as well.
    return {"Authorization": f"Bearer {key}"}
39
+
40
+
41
def _safe_get(url: str, params: dict) -> Optional[dict]:
    """Issue a single GET and return the decoded JSON object, or None.

    Best-effort by design: every failure (timeout, transport error, non-200
    status, undecodable or non-object body) is logged as a warning and
    reported as None; nothing is raised to the caller.

    Args:
        url: Absolute endpoint URL.
        params: Query-string parameters passed through to ``requests.get``.

    Returns:
        The parsed JSON body when it is a JSON object and the status is 200,
        otherwise None.
    """
    try:
        resp = requests.get(url, params=params, headers=_headers(), timeout=TIMEOUT)
        if resp.status_code == 429:
            logger.warning("OpenAlex rate limit hit — skipping")
            return None
        if resp.status_code != 200:
            logger.warning("OpenAlex %s for %s", resp.status_code, url)
            return None
        payload = resp.json()
    except requests.exceptions.Timeout:
        logger.warning("OpenAlex timeout for %s", url)
        return None
    except Exception as e:
        # Deliberate catch-all boundary: callers promise never to raise.
        logger.warning("OpenAlex error: %s", e)
        return None

    # Callers use dict methods on the result, so reject any other JSON shape.
    if not isinstance(payload, dict):
        logger.warning("OpenAlex returned non-object JSON for %s", url)
        return None
    return payload
59
+
60
+
61
+ def _parse_work(work: dict) -> Optional[dict]:
62
+ """
63
+ Parse a single OpenAlex Work object into a flat dict compatible with
64
+ RECON's Paper dataclass fields.
65
+
66
+ Returns dict with keys: title, year, doi, abstract, citation_count,
67
+ authors, paper_id, url, source
68
+ Returns None if work is missing essential fields.
69
+ """
70
+ title = work.get("title") or ""
71
+ if not title:
72
+ return None
73
+
74
+ year = work.get("publication_year")
75
+ doi = work.get("doi") or ""
76
+ # OpenAlex DOIs come as full URLs β€” strip to bare DOI
77
+ if doi.startswith("https://doi.org/"):
78
+ doi = doi[len("https://doi.org/"):]
79
+
80
+ citation_count = work.get("cited_by_count") or 0
81
+
82
+ # Abstract: OpenAlex stores as inverted index β€” reconstruct
83
+ abstract = ""
84
+ inv_abstract = work.get("abstract_inverted_index") or {}
85
+ if inv_abstract:
86
+ word_positions = []
87
+ for word, positions in inv_abstract.items():
88
+ for pos in positions:
89
+ word_positions.append((pos, word))
90
+ word_positions.sort(key=lambda x: x[0])
91
+ abstract = " ".join(w for _, w in word_positions)
92
+
93
+ # Authors
94
+ authorships = work.get("authorships") or []
95
+ authors = []
96
+ for a in authorships[:5]: # cap at 5
97
+ display = (a.get("author") or {}).get("display_name") or ""
98
+ if display:
99
+ authors.append(display)
100
+ authors_str = ", ".join(authors)
101
+
102
+ # Stable ID: prefer DOI, fall back to OpenAlex ID
103
+ openalex_id = work.get("id") or ""
104
+ paper_id = f"openalex:{doi}" if doi else f"openalex:{openalex_id}"
105
+
106
+ # URL: prefer DOI link, then OpenAlex page
107
+ url = f"https://doi.org/{doi}" if doi else (work.get("primary_location") or {}).get("landing_page_url") or ""
108
+
109
+ return {
110
+ "title": title,
111
+ "year": year,
112
+ "doi": doi,
113
+ "abstract": abstract,
114
+ "citation_count": citation_count,
115
+ "authors": authors_str,
116
+ "paper_id": paper_id,
117
+ "url": url,
118
+ "source": "openalex",
119
+ }
120
+
121
+
122
def search_openalex(query: str, max_results: int = MAX_RESULTS_PER_QUERY) -> list[dict]:
    """Search OpenAlex for articles matching a query string.

    Results are restricted to ``type:article`` and ordered by citation count
    (descending). Never raises on request failure: any failure yields an
    empty list.

    Args:
        query: Free-text search string. A blank query returns no results
            without contacting the API.
        max_results: Number of results to request. Non-positive values
            return an empty list; values above the API page-size ceiling
            are capped.

    Returns:
        A list of parsed paper dicts (see ``_parse_work``), possibly empty.
    """
    if not query or not query.strip() or max_results < 1:
        return []

    # OpenAlex documents `per-page` as 1-200; a larger value is rejected.
    per_page = min(max_results, 200)
    params = {
        "search": query,
        "filter": "type:article",
        "sort": "cited_by_count:desc",
        "per-page": per_page,
        "select": "id,title,publication_year,doi,cited_by_count,abstract_inverted_index,authorships,primary_location",
    }
    data = _safe_get(f"{OPENALEX_BASE}/works", params)
    if not data:
        return []

    # Works without a usable title parse to None and are dropped.
    parsed_works = (_parse_work(work) for work in (data.get("results") or []))
    results = [paper for paper in parsed_works if paper]

    logger.info("OpenAlex search '%s': %d results", query[:40], len(results))
    return results
147
+
148
+
149
def get_openalex_by_doi(doi: str) -> Optional[dict]:
    """Fetch a single paper from OpenAlex by DOI.

    Used for enrichment — getting ``cited_by_count`` for papers already
    retrieved from another source (Semantic Scholar, per the original note).

    Args:
        doi: A bare DOI or a DOI URL / ``doi:``-prefixed form. Common
            prefixes are stripped case-insensitively before the lookup.

    Returns:
        The parsed paper dict (see ``_parse_work``), or None when the DOI is
        empty, the request fails, or no work matches.
    """
    if not doi:
        return None

    clean_doi = doi.strip()
    lowered = clean_doi.lower()
    known_prefixes = (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
        "doi:",
    )
    for prefix in known_prefixes:
        if lowered.startswith(prefix):
            clean_doi = clean_doi[len(prefix):].strip()
            break
    # Nothing left to look up: avoid sending an empty `doi:` filter.
    if not clean_doi:
        return None

    params = {
        "filter": f"doi:{clean_doi}",
        "select": "id,title,publication_year,doi,cited_by_count,abstract_inverted_index,authorships",
        # Only the first match is used, so do not fetch a full page.
        "per-page": 1,
    }
    data = _safe_get(f"{OPENALEX_BASE}/works", params)
    if not data:
        return None
    results = data.get("results") or []
    if not results:
        return None
    return _parse_work(results[0])
172
+
173
+
174
def get_citation_centrality(doi: str, citation_count: Optional[int] = None) -> float:
    """Compute a normalized citation centrality score for a paper.

    When ``doi`` is given, the citation count is fetched from OpenAlex via
    ``get_openalex_by_doi``. If ``doi`` is missing or the fetch yields
    nothing, the supplied ``citation_count`` is used instead (treated as 0
    when None).

    Formula: ``min(1.0, log1p(count) / log1p(10000))``. The original author
    notes this is meant to match the authority_score formula in
    retriever_utils.py so the scales are comparable.

    Args:
        doi: DOI to look up; may be empty/None to skip the lookup.
        citation_count: Fallback count when no OpenAlex value is available.

    Returns:
        A float in [0, 1]; 0.0 when no count could be determined.
    """
    import math

    count = None
    if doi:
        paper = get_openalex_by_doi(doi)
        if paper:
            count = paper.get("citation_count")

    if count is None:
        count = citation_count or 0

    # Clamp at 0: log1p is undefined for values <= -1 and would be negative
    # in (-1, 0), which would break the documented [0, 1] range.
    count = max(0, count)

    return min(1.0, math.log1p(count) / math.log1p(10000))