siddhm11 committed on
Commit
239539e
·
1 Parent(s): 003b415

Phase 6.5 Day 5: Semantic Scholar author import (B4)

Browse files

config.py:
- Add S2_API_KEY = os.getenv('S2_API_KEY', '') — key already in .env

s2_svc.py: [NEW]
- parse_author_input(): accepts S2 URL, raw S2 ID, or ORCID
- resolve_orcid(): S2 author search API → S2 author ID
- fetch_author_arxiv_papers(): fetches papers, filters to ArXiv external IDs,
returns up to 20 IDs sorted by citation count descending
- Uses httpx (matches turso_svc/arxiv_svc patterns)

onboarding.py:
- POST /api/onboarding/import-author: parses input, resolves ORCID if needed,
fetches arXiv papers, auto-saves via user_state + db.log_interaction
- Returns inline HTMX partial (alert div) with success/error feedback

seed_search.html:
- Add quick-import form above search bar with HTMX POST
- 'OR search manually' divider between import and search

Tests: 200 passed (3 pre-existing flaky: 2x arXiv 429 + 1x RNG-dependent)

app/config.py CHANGED
@@ -24,6 +24,9 @@ METADATA_CACHE_TTL_DAYS = 30 # re-fetch metadata after this many days
24
  TURSO_URL = os.getenv("TURSO_URL", "")
25
  TURSO_DB_TOKEN = os.getenv("TURSO_DB_TOKEN", "")
26
 
 
 
 
27
  # ── Recommendation settings ───────────────────────────────────────────────────
28
  REC_LIMIT = 10 # how many recommendations to show
29
  REC_POSITIVE_LIMIT = 20 # max positive examples sent to Qdrant
 
24
  TURSO_URL = os.getenv("TURSO_URL", "")
25
  TURSO_DB_TOKEN = os.getenv("TURSO_DB_TOKEN", "")
26
 
27
+ # ── Semantic Scholar API — Phase 5.1 (author import) ─────────────────────────
28
+ S2_API_KEY = os.getenv("S2_API_KEY", "")
29
+
30
  # ── Recommendation settings ───────────────────────────────────────────────────
31
  REC_LIMIT = 10 # how many recommendations to show
32
  REC_POSITIVE_LIMIT = 20 # max positive examples sent to Qdrant
app/routers/onboarding.py CHANGED
@@ -159,3 +159,92 @@ async def skip_onboarding(
159
  resp = RedirectResponse("/", status_code=303)
160
  resp.set_cookie(COOKIE_NAME, user_id, max_age=365 * 24 * 3600, httponly=True)
161
  return resp
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  resp = RedirectResponse("/", status_code=303)
160
  resp.set_cookie(COOKIE_NAME, user_id, max_age=365 * 24 * 3600, httponly=True)
161
  return resp
162
+
163
+
164
@router.post("/api/onboarding/import-author", response_class=HTMLResponse)
async def import_author(
    request: Request,
    author_url: str = Form(default=""),
    user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
):
    """Phase 5.1: Import papers from a Semantic Scholar author profile.

    Accepts an S2 author URL, a raw S2 author ID, or an ORCID in the
    ``author_url`` form field and saves the author's arXiv papers as
    positive interactions for the current user.

    Every outcome is reported as an inline HTML alert ``<div>``; failures
    of the Semantic Scholar calls are caught and turned into an error
    alert rather than propagated. The user cookie is only (re)issued on
    the success response. ``request`` is not read by this handler.
    """
    # No cookie yet: mint an id so the import can be attributed to someone.
    user_id = user_id or str(uuid.uuid4())

    if not author_url.strip():
        return HTMLResponse(
            '<div class="alert alert-warning text-sm py-2">'
            '⚠️ Please paste a Semantic Scholar author URL, ID, or ORCID.</div>'
        )

    from app import s2_svc, user_state as us

    # 1. Parse input
    parsed_id, input_type = s2_svc.parse_author_input(author_url)
    if parsed_id is None:
        return HTMLResponse(
            '<div class="alert alert-error text-sm py-2">'
            '❌ Could not recognise input. Paste a Semantic Scholar author URL, '
            'a numeric author ID, or an ORCID (e.g. 0000-0003-3394-6622).</div>'
        )

    # 2. Resolve ORCID → S2 author ID if needed
    try:
        if input_type == "orcid":
            s2_id = await s2_svc.resolve_orcid(parsed_id)
            if not s2_id:
                # parsed_id is the text matched by the ORCID pattern, so it is
                # interpolated into the markup as-is.
                return HTMLResponse(
                    '<div class="alert alert-warning text-sm py-2">'
                    f'⚠️ No Semantic Scholar author found for ORCID {parsed_id}.</div>'
                )
        else:
            s2_id = parsed_id
    except Exception as e:
        # Request-boundary catch-all: any lookup failure becomes an alert.
        print(f"[onboarding] ORCID resolve failed: {e}")
        return HTMLResponse(
            '<div class="alert alert-error text-sm py-2">'
            '❌ Failed to look up ORCID. Please try pasting the S2 URL directly.</div>'
        )

    # 3. Fetch arXiv papers
    try:
        arxiv_ids = await s2_svc.fetch_author_arxiv_papers(s2_id, limit=20)
    except Exception as e:
        print(f"[onboarding] S2 author paper fetch failed: {e}")
        return HTMLResponse(
            '<div class="alert alert-error text-sm py-2">'
            '❌ Failed to fetch papers from Semantic Scholar. '
            'The author ID may be invalid, or the API may be down.</div>'
        )

    if not arxiv_ids:
        return HTMLResponse(
            '<div class="alert alert-warning text-sm py-2">'
            '⚠️ No arXiv papers found for this author. '
            'They may publish in venues not indexed on arXiv.</div>'
        )

    # 4. Auto-save each paper as a positive interaction
    for aid in arxiv_ids:
        us.record_positive(user_id, aid)
        await db.log_interaction(
            user_id=user_id,
            paper_id=aid,
            event_type="save",
            source="s2_import",
        )

    state = await us.ensure_loaded(user_id)
    seed_count = len(state.positives)

    resp = HTMLResponse(
        '<div class="alert alert-success text-sm py-2">'
        f'✅ Imported {len(arxiv_ids)} papers! '
        f'You now have {seed_count} saved papers. '
        'Click <strong>"Done — start exploring →"</strong> to see your recommendations.</div>'
    )
    resp.set_cookie(COOKIE_NAME, user_id, max_age=365 * 24 * 3600, httponly=True)
    return resp
app/s2_svc.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Semantic Scholar service — Phase 5.1 (author import for onboarding).
3
+
4
+ Accepts an S2 author URL, a raw S2 author ID, or an ORCID, then
5
+ fetches that author's papers and returns arXiv IDs for auto-saving.
6
+
7
+ API docs: https://api.semanticscholar.org/api-docs/graph
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ import httpx
13
+ from app.config import S2_API_KEY
14
+
15
_BASE = "https://api.semanticscholar.org/graph/v1"
_TIMEOUT = 15.0  # seconds

# ── Patterns ──────────────────────────────────────────────────────────────────
# URL:   https://www.semanticscholar.org/author/Yoshua-Bengio/1751762
# Raw:   1751762
# ORCID: 0000-0003-3394-6622
#
# re.ASCII restricts \d to 0-9 so that non-ASCII digits are never accepted as
# an identifier and later interpolated into an API URL.
_S2_URL_RE = re.compile(
    r"semanticscholar\.org/author/[^/]+/(\d+)", re.IGNORECASE | re.ASCII
)
# The digit lookarounds stop the pattern from matching inside a longer run of
# digits (e.g. "12345-6789-0123-4567" must not yield "2345-6789-0123-4567").
_ORCID_RE = re.compile(
    r"(?<!\d)\d{4}-\d{4}-\d{4}-\d{3}[\dX](?!\d)", re.ASCII
)
_RAW_ID_RE = re.compile(r"^\d{3,}$", re.ASCII)  # 3+ digits = plausible S2 author ID
27
+
28
+
29
def _headers() -> dict[str, str]:
    """Return the HTTP headers for an S2 request.

    ``Accept: application/json`` is always present; ``x-api-key`` is added
    only when ``S2_API_KEY`` is non-empty.
    """
    base: dict[str, str] = {"Accept": "application/json"}
    return {**base, "x-api-key": S2_API_KEY} if S2_API_KEY else base
35
+
36
+
37
+ # ── Public API ────────────────────────────────────────────────────────────────
38
+
39
def parse_author_input(text: str) -> tuple[str | None, str]:
    """Classify free-form user input as an S2 author reference.

    Returns ``(identifier, input_type)``:

    * ``"s2_url"``  - identifier is the numeric author ID taken from the URL
    * ``"orcid"``   - identifier is the ORCID exactly as it appeared in the text
    * ``"s2_id"``   - identifier is the (stripped) input itself
    * ``"unknown"`` - identifier is ``None``; nothing recognisable was found
    """
    candidate = text.strip()
    if not candidate:
        return None, "unknown"

    # Tried in order; the first pattern that hits decides the type.
    # Each entry: (input_type, matcher, regex group holding the identifier).
    # For the raw-ID pattern group 0 is the whole stripped input, because the
    # pattern is anchored at both ends.
    recognisers = (
        ("s2_url", _S2_URL_RE.search, 1),
        ("orcid", _ORCID_RE.search, 0),
        ("s2_id", _RAW_ID_RE.match, 0),
    )
    for input_type, find, group in recognisers:
        hit = find(candidate)
        if hit is not None:
            return hit.group(group), input_type

    return None, "unknown"
64
+
65
+
66
async def resolve_orcid(orcid: str) -> str | None:
    """Resolve an ORCID to an S2 author ID via the author search endpoint.

    Sends ``orcid`` as the ``query`` of ``/author/search`` (limit 1) and
    returns the first hit's ``authorId`` as a string. Returns ``None`` when
    the search has no hits or the first hit carries no usable ``authorId``.

    Raises whatever ``httpx`` raises for transport errors and for non-2xx
    responses (via ``raise_for_status``).
    """
    # NOTE(review): this relies on the free-text author search matching an
    # ORCID string - confirm against the S2 API docs.
    url = f"{_BASE}/author/search"
    params = {"query": orcid, "limit": 1}
    async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
        resp = await client.get(url, params=params, headers=_headers())
        resp.raise_for_status()
        data = resp.json()

    authors = data.get("data") or []
    if not authors:
        return None

    # A missing or null authorId must not become the string "None" (or a
    # KeyError): report it as "not found" so the caller shows the right alert.
    author_id = authors[0].get("authorId")
    if author_id is None or author_id == "":
        return None
    return str(author_id)
81
+
82
+
83
async def fetch_author_arxiv_papers(
    author_id: str, limit: int = 50,
) -> list[str]:
    """Fetch an author's papers from S2 and return arXiv IDs.

    Issues a single request for the author's papers and keeps those that
    have an ``ArXiv`` external ID. Returns at most ``limit`` distinct arXiv
    IDs, ordered by citation count (descending); ties keep the order in
    which the API returned them. The ranking only covers the papers in that
    one response - there is no pagination.

    A non-positive ``limit`` returns ``[]`` without contacting the API.

    Raises whatever ``httpx`` raises for transport errors and for non-2xx
    responses (via ``raise_for_status``).
    """
    if limit <= 0:
        # Nothing was asked for. This also keeps a negative limit from
        # slicing the result from the wrong end.
        return []

    url = f"{_BASE}/author/{author_id}/papers"
    params = {
        "fields": "externalIds,citationCount",
        "limit": min(limit * 2, 500),  # over-fetch since not all have arXiv IDs
    }

    async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
        resp = await client.get(url, params=params, headers=_headers())
        resp.raise_for_status()
        data = resp.json()

    # arXiv ID -> highest citation count seen. De-duplicating here keeps a
    # repeated ID from using up several of the `limit` slots.
    best_cites: dict[str, int] = {}
    for paper in data.get("data") or []:
        ext = paper.get("externalIds") or {}
        arxiv_id = ext.get("ArXiv")
        if not arxiv_id:
            continue
        cites = paper.get("citationCount") or 0
        if arxiv_id not in best_cites or cites > best_cites[arxiv_id]:
            best_cites[arxiv_id] = cites

    # Sort by citation count descending so we import the most impactful first.
    # sorted() is stable, so equal counts stay in first-seen order.
    ranked = sorted(best_cites, key=best_cites.__getitem__, reverse=True)
    return ranked[:limit]
app/templates/partials/seed_search.html CHANGED
@@ -15,6 +15,30 @@
15
  </p>
16
  </div>
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  {# Search bar #}
19
  <div class="mb-4">
20
  <form hx-get="/api/onboarding/seed-search"
 
15
  </p>
16
  </div>
17
 
18
+ {# Phase 5.1: Quick author import #}
19
+ <div class="mb-4 p-3 bg-base-200/50 rounded-lg">
20
+ <p class="text-xs font-medium text-base-content/70 mb-2">
21
+ ⚑ Quick import: Paste your Semantic Scholar profile URL to auto-import papers
22
+ </p>
23
+ <form hx-post="/api/onboarding/import-author"
24
+ hx-target="#import-result"
25
+ hx-swap="innerHTML"
26
+ hx-indicator="#import-spinner"
27
+ class="flex gap-2">
28
+ <input type="text"
29
+ name="author_url"
30
+ placeholder="e.g. https://www.semanticscholar.org/author/…/1234567"
31
+ class="input input-bordered input-sm flex-1 text-xs" />
32
+ <button class="btn btn-secondary btn-sm" type="submit">
33
+ Import
34
+ <span id="import-spinner" class="htmx-indicator loading loading-spinner loading-xs ml-1"></span>
35
+ </button>
36
+ </form>
37
+ <div id="import-result" class="mt-2"></div>
38
+ </div>
39
+
40
+ <div class="divider text-xs text-base-content/40">OR search manually</div>
41
+
42
  {# Search bar #}
43
  <div class="mb-4">
44
  <form hx-get="/api/onboarding/seed-search"