Spaces:

siddhm11
/

ResearchIT

Running

siddhm11 committed on May 5

Commit

239539e

1 Parent(s): 003b415

Phase 6.5 Day 5: Semantic Scholar author import (B4)

config.py:
- Add S2_API_KEY = os.getenv('S2_API_KEY', '') — key already in .env

s2_svc.py: [NEW]
- parse_author_input(): accepts S2 URL, raw S2 ID, or ORCID
- resolve_orcid(): S2 author search API → S2 author ID
- fetch_author_arxiv_papers(): fetches papers, filters to ArXiv external IDs,
returns up to 20 IDs sorted by citation count descending
- Uses httpx (matches turso_svc/arxiv_svc patterns)

onboarding.py:
- POST /api/onboarding/import-author: parses input, resolves ORCID if needed,
fetches arXiv papers, auto-saves via user_state + db.log_interaction
- Returns inline HTMX partial (alert div) with success/error feedback

seed_search.html:
- Add quick-import form above search bar with HTMX POST
- 'OR search manually' divider between import and search

Tests: 200 passed (3 pre-existing flaky: 2x arXiv 429 + 1x RNG-dependent)

Files changed (4) hide show

app/config.py +3 -0
app/routers/onboarding.py +89 -0
app/s2_svc.py +111 -0
app/templates/partials/seed_search.html +24 -0

app/config.py CHANGED Viewed

@@ -24,6 +24,9 @@ METADATA_CACHE_TTL_DAYS = 30    # re-fetch metadata after this many days
 TURSO_URL = os.getenv("TURSO_URL", "")
 TURSO_DB_TOKEN = os.getenv("TURSO_DB_TOKEN", "")
 # ── Recommendation settings ───────────────────────────────────────────────────
 REC_LIMIT = 10                  # how many recommendations to show
 REC_POSITIVE_LIMIT = 20         # max positive examples sent to Qdrant

 TURSO_URL = os.getenv("TURSO_URL", "")
 TURSO_DB_TOKEN = os.getenv("TURSO_DB_TOKEN", "")
+# ── Semantic Scholar API — Phase 5.1 (author import) ─────────────────────────
+S2_API_KEY = os.getenv("S2_API_KEY", "")
 # ── Recommendation settings ───────────────────────────────────────────────────
 REC_LIMIT = 10                  # how many recommendations to show
 REC_POSITIVE_LIMIT = 20         # max positive examples sent to Qdrant

app/routers/onboarding.py CHANGED Viewed

@@ -159,3 +159,92 @@ async def skip_onboarding(
     resp = RedirectResponse("/", status_code=303)
     resp.set_cookie(COOKIE_NAME, user_id, max_age=365 * 24 * 3600, httponly=True)
     return resp

     resp = RedirectResponse("/", status_code=303)
     resp.set_cookie(COOKIE_NAME, user_id, max_age=365 * 24 * 3600, httponly=True)
     return resp
+@router.post("/api/onboarding/import-author", response_class=HTMLResponse)
+async def import_author(
+    request: Request,
+    author_url: str = Form(default=""),
+    user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
+):
+    """Phase 5.1: Import papers from a Semantic Scholar author profile.
+    Accepts S2 URL, raw S2 author ID, or ORCID.
+    Auto-saves the author's arXiv papers as seed interests.
+    """
+    user_id = user_id or str(uuid.uuid4())
+    if not author_url.strip():
+        return HTMLResponse(
+            '<div class="alert alert-warning text-sm py-2">'
+            '⚠️ Please paste a Semantic Scholar author URL, ID, or ORCID.</div>'
+        )
+    from app import s2_svc, user_state as us
+    # 1. Parse input
+    parsed_id, input_type = s2_svc.parse_author_input(author_url)
+    if parsed_id is None:
+        return HTMLResponse(
+            '<div class="alert alert-error text-sm py-2">'
+            '❌ Could not recognise input. Paste a Semantic Scholar author URL, '
+            'a numeric author ID, or an ORCID (e.g. 0000-0003-3394-6622).</div>'
+        )
+    # 2. Resolve ORCID → S2 author ID if needed
+    try:
+        if input_type == "orcid":
+            s2_id = await s2_svc.resolve_orcid(parsed_id)
+            if not s2_id:
+                return HTMLResponse(
+                    '<div class="alert alert-warning text-sm py-2">'
+                    f'⚠️ No Semantic Scholar author found for ORCID {parsed_id}.</div>'
+                )
+        else:
+            s2_id = parsed_id
+    except Exception as e:
+        print(f"[onboarding] ORCID resolve failed: {e}")
+        return HTMLResponse(
+            '<div class="alert alert-error text-sm py-2">'
+            '❌ Failed to look up ORCID. Please try pasting the S2 URL directly.</div>'
+        )
+    # 3. Fetch arXiv papers
+    try:
+        arxiv_ids = await s2_svc.fetch_author_arxiv_papers(s2_id, limit=20)
+    except Exception as e:
+        print(f"[onboarding] S2 author paper fetch failed: {e}")
+        return HTMLResponse(
+            '<div class="alert alert-error text-sm py-2">'
+            '❌ Failed to fetch papers from Semantic Scholar. '
+            'The author ID may be invalid, or the API may be down.</div>'
+        )
+    if not arxiv_ids:
+        return HTMLResponse(
+            '<div class="alert alert-warning text-sm py-2">'
+            '⚠️ No arXiv papers found for this author. '
+            'They may publish in venues not indexed on arXiv.</div>'
+        )
+    # 4. Auto-save each paper as a positive interaction
+    for aid in arxiv_ids:
+        us.record_positive(user_id, aid)
+        await db.log_interaction(
+            user_id=user_id,
+            paper_id=aid,
+            event_type="save",
+            source="s2_import",
+        )
+    state = await us.ensure_loaded(user_id)
+    seed_count = len(state.positives)
+    resp = HTMLResponse(
+        f'<div class="alert alert-success text-sm py-2">'
+        f'✅ Imported {len(arxiv_ids)} papers! '
+        f'You now have {seed_count} saved papers. '
+        f'Click <strong>"Done — start exploring →"</strong> to see your recommendations.</div>'
+    )
+    resp.set_cookie(COOKIE_NAME, user_id, max_age=365 * 24 * 3600, httponly=True)
+    return resp

app/s2_svc.py ADDED Viewed

	@@ -0,0 +1,111 @@

+"""
+Semantic Scholar service — Phase 5.1 (author import for onboarding).
+Accepts an S2 author URL, a raw S2 author ID, or an ORCID, then
+fetches that author's papers and returns arXiv IDs for auto-saving.
+API docs: https://api.semanticscholar.org/api-docs/graph
+"""
+from __future__ import annotations
+import re
+import httpx
+from app.config import S2_API_KEY
+_BASE = "https://api.semanticscholar.org/graph/v1"
+_TIMEOUT = 15.0  # seconds
+# ── Patterns ──────────────────────────────────────────────────────────────────
+#   URL:   https://www.semanticscholar.org/author/Yoshua-Bengio/1751762
+#   Raw:   1751762
+#   ORCID: 0000-0003-3394-6622
+_S2_URL_RE = re.compile(
+    r"semanticscholar\.org/author/[^/]+/(\d+)", re.IGNORECASE
+)
+_ORCID_RE = re.compile(r"\d{4}-\d{4}-\d{4}-\d{3}[\dX]")
+_RAW_ID_RE = re.compile(r"^\d{3,}$")  # 3+ digits = plausible S2 author ID
+def _headers() -> dict[str, str]:
+    """Build request headers, including API key if available."""
+    h: dict[str, str] = {"Accept": "application/json"}
+    if S2_API_KEY:
+        h["x-api-key"] = S2_API_KEY
+    return h
+# ── Public API ────────────────────────────────────────────────────────────────
+def parse_author_input(text: str) -> tuple[str | None, str]:
+    """Parse user-provided text into an S2 author ID or ORCID.
+    Returns (s2_author_id | None, input_type) where input_type is one of:
+      "s2_url", "s2_id", "orcid", "unknown"
+    """
+    text = text.strip()
+    if not text:
+        return None, "unknown"
+    # 1. Try S2 URL
+    m = _S2_URL_RE.search(text)
+    if m:
+        return m.group(1), "s2_url"
+    # 2. Try ORCID
+    m = _ORCID_RE.search(text)
+    if m:
+        return m.group(0), "orcid"
+    # 3. Try raw numeric ID
+    if _RAW_ID_RE.match(text):
+        return text, "s2_id"
+    return None, "unknown"
+async def resolve_orcid(orcid: str) -> str | None:
+    """Resolve an ORCID to an S2 author ID via the author search endpoint.
+    Returns the S2 authorId string or None if not found.
+    """
+    url = f"{_BASE}/author/search"
+    params = {"query": orcid, "limit": 1}
+    async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
+        resp = await client.get(url, params=params, headers=_headers())
+        resp.raise_for_status()
+        data = resp.json()
+        authors = data.get("data", [])
+        if authors:
+            return str(authors[0]["authorId"])
+    return None
+async def fetch_author_arxiv_papers(
+    author_id: str, limit: int = 50,
+) -> list[str]:
+    """Fetch an author's papers from S2 and return arXiv IDs.
+    Filters to papers that have an ArXiv external ID.
+    Returns at most `limit` arXiv IDs, ordered by citation count (desc).
+    """
+    url = f"{_BASE}/author/{author_id}/papers"
+    params = {
+        "fields": "externalIds,citationCount",
+        "limit": min(limit * 2, 500),  # over-fetch since not all have arXiv IDs
+    }
+    arxiv_ids: list[tuple[int, str]] = []  # (citation_count, arxiv_id)
+    async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
+        resp = await client.get(url, params=params, headers=_headers())
+        resp.raise_for_status()
+        data = resp.json()
+        for paper in data.get("data", []):
+            ext = paper.get("externalIds") or {}
+            arxiv_id = ext.get("ArXiv")
+            if arxiv_id:
+                cites = paper.get("citationCount") or 0
+                arxiv_ids.append((cites, arxiv_id))
+    # Sort by citation count descending so we import the most impactful first
+    arxiv_ids.sort(key=lambda x: x[0], reverse=True)
+    return [aid for _, aid in arxiv_ids[:limit]]

app/templates/partials/seed_search.html CHANGED Viewed

@@ -15,6 +15,30 @@
     </p>
   </div>
   {# Search bar #}
   <div class="mb-4">
     <form hx-get="/api/onboarding/seed-search"

     </p>
   </div>
+  {# Phase 5.1: Quick author import #}
+  <div class="mb-4 p-3 bg-base-200/50 rounded-lg">
+    <p class="text-xs font-medium text-base-content/70 mb-2">
+      ⚡ Quick import: Paste your Semantic Scholar profile URL to auto-import papers
+    </p>
+    <form hx-post="/api/onboarding/import-author"
+          hx-target="#import-result"
+          hx-swap="innerHTML"
+          hx-indicator="#import-spinner"
+          class="flex gap-2">
+      <input type="text"
+             name="author_url"
+             placeholder="e.g. https://www.semanticscholar.org/author/…/1234567"
+             class="input input-bordered input-sm flex-1 text-xs" />
+      <button class="btn btn-secondary btn-sm" type="submit">
+        Import
+        <span id="import-spinner" class="htmx-indicator loading loading-spinner loading-xs ml-1"></span>
+      </button>
+    </form>
+    <div id="import-result" class="mt-2"></div>
+  </div>
+  <div class="divider text-xs text-base-content/40">OR search manually</div>
   {# Search bar #}
   <div class="mb-4">
     <form hx-get="/api/onboarding/seed-search"