Spaces:

evalstate
/

hf-papers

Sleeping

App Files Files Community

evalstate HF Staff committed on Jan 22

Commit

b487777

verified ·

1 Parent(s): 5a0e824

Deploy hf_paper_search MCP server

Browse files

Files changed (4) hide show

Dockerfile +26 -0
README.md +26 -6
hf_paper_search.md +43 -0
hf_papers_tool.py +185 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,26 @@

+# Base image: slim Python 3.13.
+FROM python:3.13-slim
+# Install bash, git (with git-lfs), wget, curl and procps, then remove the
+# apt package lists that `apt-get update` downloaded.
+RUN apt-get update && \
+    apt-get install -y \
+      bash \
+      git git-lfs \
+      wget curl procps \
+      && rm -rf /var/lib/apt/lists/*
+# Copy the uv binary out of the ghcr.io/astral-sh/uv image.
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
+WORKDIR /app
+# Install fast-agent into the system interpreter (no virtualenv, no uv cache).
+RUN uv pip install --system --no-cache fast-agent-mcp
+# Copy the repository into /app and hand ownership to UID 1000, the user the
+# container runs as from here on.
+COPY --link ./ /app
+RUN chown -R 1000:1000 /app
+USER 1000
+# Same port as the --port argument passed to fast-agent below.
+EXPOSE 7860
+# Serve the hf_paper_search.md agent card over HTTP on all interfaces.
+# NOTE(review): `--instance-scope request` presumably creates one agent
+# instance per incoming request - confirm against the fast-agent CLI docs.
+CMD ["fast-agent", "serve", \
+     "--card", "hf_paper_search.md", \
+     "--transport", "http", \
+     "--instance-scope", "request", \
+     "--host", "0.0.0.0", \
+     "--port", "7860"]

README.md CHANGED Viewed

@@ -1,10 +1,30 @@
 ---
-title: Hf Papers
-emoji: 💻
-colorFrom: yellow
-colorTo: purple
 sdk: docker
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: HF Papers Search
+emoji: 📚
+colorFrom: purple
+colorTo: blue
 sdk: docker
+app_port: 7860
+short_description: Fast-agent MCP server for Hugging Face Daily Papers search
 ---
+# HF Papers Search (MCP)
+This Space runs [fast-agent](https://fast-agent.ai/) as an MCP server to provide
+specialized search over the Hugging Face Daily Papers feed.
+## Features
+- Query `/api/daily_papers` with date/week/month filters
+- Local keyword filtering across titles, summaries, authors, AI keywords
+- Token passthrough via Hugging Face OAuth or Bearer tokens
+## Environment Variables
+Set these in Space settings:
+- `FAST_AGENT_SERVE_OAUTH=hf`
+- `FAST_AGENT_OAUTH_SCOPES=inference-api`
+- `FAST_AGENT_OAUTH_RESOURCE_URL=https://evalstate-hf-papers.hf.space`
+- `HF_TOKEN=hf_dummy` (dummy token required at startup)
+- `OPENAI_API_KEY=DUMMY` (placeholder value; clients can override it per request)
+## Usage
+Once running, the agent is available via HTTP at the Space URL.

hf_paper_search.md ADDED Viewed

	@@ -0,0 +1,43 @@

+---
+type: agent
+name: hf-papers-search
+function_tools:
+  - hf_papers_tool.py:hf_papers_search
+model: gpt-oss
+description: "Search Hugging Face Daily Papers with local keyword filtering, date/week/month selectors, and trending/published sort. Returns structured paper entries from /api/daily_papers."
+---
+Hugging Face Daily Papers Search
+================================
+Use this tool when you need a specialized paper search against the Hugging Face
+Daily Papers feed. It queries `/api/daily_papers` and applies optional local
+keyword filtering across titles, summaries, authors, AI summaries, keywords,
+project pages, GitHub repos, and paper ids (arXiv ids).
+Tool
+----
+`hf_papers_search(query: str | None, *, date, week, month, submitter, sort, limit, page, max_pages, api_limit)`
+Parameters
+----------
+- `query`: Keyword search (case-insensitive). Multiple tokens are ANDed.
+- `date`: ISO date `YYYY-MM-DD`.
+- `week`: ISO week `YYYY-Www`.
+- `month`: ISO month `YYYY-MM`.
+- `submitter`: HF username of the submitter.
+- `sort`: `publishedAt` or `trending`.
+- `limit`: Max results to return after filtering (default 20).
+- `page`: API page index (default 0).
+- `max_pages`: How many pages to fetch for local filtering (default 1).
+- `api_limit`: Page size for the API (default 50, max 100).
+Examples
+--------
+- Latest papers (first page):
+  `hf_papers_search()`
+- Search for "diffusion" in ISO week 2026-W03, up to 40 results, across 3 pages:
+  `hf_papers_search("diffusion", week="2026-W03", limit=40, max_pages=3)`
+- Trending papers for January 2026 that match the query term "alignment":
+  `hf_papers_search("alignment", month="2026-01", sort="trending")`

hf_papers_tool.py ADDED Viewed

	@@ -0,0 +1,185 @@

+from __future__ import annotations
+import json
+import os
+import re
+from pathlib import Path
+from typing import Any
+from urllib.error import HTTPError, URLError
+from urllib.parse import urlencode
+from urllib.request import Request, urlopen
+# Result cap used by hf_papers_search when the caller passes no `limit`.
+DEFAULT_LIMIT = 20
+# Timeout, in seconds, passed to urlopen for each API request.
+DEFAULT_TIMEOUT_SEC = 30
+# Upper bound hf_papers_search applies to the per-page size (`api_limit`).
+MAX_API_LIMIT = 100
+def _load_token() -> str | None:
+    # Check for request-scoped token first (when running as MCP server)
+    try:
+        from fast_agent.mcp.auth.context import request_bearer_token
+        ctx_token = request_bearer_token.get()
+        if ctx_token:
+            return ctx_token
+    except ImportError:
+        pass
+    token = os.getenv("HF_TOKEN")
+    if token:
+        return token
+    token_path = Path.home() / ".cache" / "huggingface" / "token"
+    if token_path.exists():
+        token_value = token_path.read_text(encoding="utf-8").strip()
+        return token_value or None
+    return None
+def _normalize_date_param(value: str | None) -> str | None:
+    if not value:
+        return None
+    return value.strip()
+def _build_url(params: dict[str, Any]) -> str:
+    base = os.getenv("HF_ENDPOINT", "https://huggingface.co").rstrip("/")
+    query = urlencode({k: v for k, v in params.items() if v is not None}, doseq=True)
+    return f"{base}/api/daily_papers?{query}" if query else f"{base}/api/daily_papers"
+def _request_json(url: str) -> list[dict[str, Any]]:
+    headers = {"Accept": "application/json"}
+    token = _load_token()
+    if token:
+        headers["Authorization"] = f"Bearer {token}"
+    request = Request(url, headers=headers, method="GET")
+    try:
+        with urlopen(request, timeout=DEFAULT_TIMEOUT_SEC) as response:
+            raw = response.read()
+    except HTTPError as exc:
+        error_body = exc.read().decode("utf-8", errors="replace")
+        raise RuntimeError(f"HF API error {exc.code} for {url}: {error_body}") from exc
+    except URLError as exc:
+        raise RuntimeError(f"HF API request failed for {url}: {exc}") from exc
+    payload = json.loads(raw)
+    if not isinstance(payload, list):
+        raise RuntimeError("Unexpected response shape from /api/daily_papers")
+    return payload
+def _extract_search_blob(item: dict[str, Any]) -> str:
+    paper = item.get("paper") or {}
+    authors = paper.get("authors") or []
+    author_names = [a.get("name", "") for a in authors if isinstance(a, dict)]
+    ai_keywords = paper.get("ai_keywords") or []
+    if isinstance(ai_keywords, list):
+        ai_keywords_text = " ".join(str(k) for k in ai_keywords)
+    else:
+        ai_keywords_text = str(ai_keywords)
+    parts = [
+        item.get("title"),
+        item.get("summary"),
+        paper.get("title"),
+        paper.get("summary"),
+        paper.get("ai_summary"),
+        ai_keywords_text,
+        " ".join(author_names),
+        paper.get("id"),
+        paper.get("projectPage"),
+        paper.get("githubRepo"),
+    ]
+    text = " ".join(str(part) for part in parts if part)
+    return text.lower()
+def _matches_query(item: dict[str, Any], query: str) -> bool:
+    tokens = [t for t in re.split(r"\s+", query.strip().lower()) if t]
+    if not tokens:
+        return True
+    haystack = _extract_search_blob(item)
+    return all(token in haystack for token in tokens)
+def hf_papers_search(
+    query: str | None = None,
+    *,
+    date: str | None = None,
+    week: str | None = None,
+    month: str | None = None,
+    submitter: str | None = None,
+    sort: str | None = None,
+    limit: int | None = None,
+    page: int | None = None,
+    max_pages: int | None = None,
+    api_limit: int | None = None,
+) -> dict[str, Any]:
+    """
+    Search Hugging Face Daily Papers with optional local filtering.
+    Args:
+        query: Case-insensitive keyword search across title, summary, authors,
+            AI summary/keywords, project page, repo link, and paper id.
+        date: ISO date (YYYY-MM-DD).
+        week: ISO week (YYYY-Www).
+        month: ISO month (YYYY-MM).
+        submitter: HF username of the submitter.
+        sort: "publishedAt" or "trending".
+        limit: Max results to return after filtering (default 20).
+        page: Page index for the API (default 0).
+        max_pages: Number of pages to fetch for local filtering (default 1).
+        api_limit: Page size for the API (default 50, max 100).
+    Returns:
+        dict with query metadata and list of daily paper entries.
+    """
+    resolved_limit = DEFAULT_LIMIT if limit is None else max(int(limit), 1)
+    start_page = max(int(page or 0), 0)
+    pages_to_fetch = max(int(max_pages or 1), 1)
+    per_page = 50 if api_limit is None else max(int(api_limit), 1)
+    per_page = min(per_page, MAX_API_LIMIT)
+    params_base: dict[str, Any] = {
+        "date": _normalize_date_param(date),
+        "week": _normalize_date_param(week),
+        "month": _normalize_date_param(month),
+        "submitter": submitter.strip() if submitter else None,
+        "sort": sort.strip() if sort else None,
+        "limit": per_page,
+    }
+    results: list[dict[str, Any]] = []
+    pages_fetched = 0
+    for page_index in range(start_page, start_page + pages_to_fetch):
+        params = {**params_base, "p": page_index}
+        url = _build_url(params)
+        payload = _request_json(url)
+        pages_fetched += 1
+        if query:
+            filtered = [item for item in payload if _matches_query(item, query)]
+        else:
+            filtered = payload
+        results.extend(filtered)
+        if len(results) >= resolved_limit:
+            break
+    return {
+        "query": query,
+        "params": {
+            **{k: v for k, v in params_base.items() if v is not None},
+            "page": start_page,
+            "max_pages": pages_fetched,
+            "api_limit": per_page,
+        },
+        "returned": min(len(results), resolved_limit),
+        "data": results[:resolved_limit],
+    }