Spaces:

mgbam
/

MCP_Res

Runtime error

App Files Files Community

mgbam committed on Jun 25, 2025

Commit

aae312e

verified ·

1 Parent(s): b506ef3

Update mcp/arxiv.py

Browse files

Files changed (1) hide show

mcp/arxiv.py +72 -16

mcp/arxiv.py CHANGED Viewed

@@ -1,23 +1,79 @@
-# mcp/arxiv.py
-import feedparser
 from urllib.parse import quote_plus
-ARXIV_BASE = "http://export.arxiv.org/api/query?search_query="
async def fetch_arxiv(query: str, max_results: int = 5):
    """Return arXiv papers matching *query* as a list of plain dicts.

    The query is URL-encoded, appended to ``ARXIV_BASE`` together with the
    ``max_results`` parameter, and the resulting URL is handed to
    ``feedparser.parse``.  Each feed entry becomes a dict with the keys
    ``title``, ``authors``, ``summary``, ``link``, ``published`` and
    ``source``.
    """

    def _as_paper(item):
        # Authors are joined into one comma-separated string; entries
        # without an ``authors`` attribute get an empty string.
        if hasattr(item, 'authors'):
            author_names = ", ".join([person.name for person in getattr(item, "authors", [])])
        else:
            author_names = ""

        # Prefer mapping-style access when the entry offers ``get``.
        if hasattr(item, 'get'):
            published_on = item.get("published", "")
        else:
            published_on = getattr(item, "published", "")

        return {
            "title": getattr(item, "title", ""),
            "authors": author_names,
            "summary": getattr(item, "summary", ""),
            "link": getattr(item, "link", ""),
            "published": published_on,
            "source": "arXiv",
        }

    parsed = feedparser.parse(
        f"{ARXIV_BASE}{quote_plus(query)}&max_results={max_results}"
    )
    return [_as_paper(item) for item in parsed.entries]

+#!/usr/bin/env python3
+"""MedGenesis – arXiv async fetcher (Atom API).
+Improvements over the legacy helper
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+* Uses **httpx.AsyncClient** with 10‑second timeout & *exponential back‑off retry*.
+* Caches results in memory, keyed by (query, max_results).
+* Parses feed with **feedparser** inside a thread to avoid blocking.
+* Normalises output to match `schemas.Paper`.
+API docs: https://arxiv.org/help/api/user-manual
+"""
+from __future__ import annotations
+import asyncio, feedparser
+from functools import lru_cache
+from typing import List, Dict
 from urllib.parse import quote_plus
+import httpx
# Query endpoint of the arXiv Atom API; the URL-encoded search string is
# appended directly after ``search_query=``.
_BASE: str = "http://export.arxiv.org/api/query?search_query="

# Timeout (seconds) passed to ``httpx.AsyncClient``.
_TIMEOUT: int = 10

# Upper bound applied to the caller-supplied ``max_results``.
_MAX_RES: int = 25

# Default headers sent with every request.
_HEADERS: Dict[str, str] = {
    "User-Agent": "MedGenesis/1.0 (https://huggingface.co/spaces)",
}
+# ---------------------------------------------------------------------
+# Internal fetch w/ retry
+# ---------------------------------------------------------------------
async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str:
    """Return the Atom XML text for *query* from the arXiv API.

    Args:
        query: Free-text search string; it is URL-encoded before use.
        max_results: Requested number of entries, clamped to ``1.._MAX_RES``.
        retries: Maximum number of HTTP attempts.

    Returns:
        The response body of the first attempt that answers with HTTP 200.

    Raises:
        RuntimeError: No attempt returned HTTP 200 (or ``retries`` < 1).
        httpx.RequestError: The final attempt failed at the transport level
            (timeout, connection error, ...).
    """
    max_results = max(1, min(max_results, _MAX_RES))
    url = f"{_BASE}{quote_plus(query)}&max_results={max_results}"
    delay = 2
    last: httpx.Response | None = None

    # One client for all attempts so connections can be reused across retries.
    async with httpx.AsyncClient(timeout=_TIMEOUT, headers=_HEADERS) as cli:
        for attempt in range(1, retries + 1):
            try:
                last = await cli.get(url)
            except httpx.RequestError:
                # Transport failures are exactly what the back-off is for;
                # only the last one is allowed to propagate to the caller.
                if attempt >= retries:
                    raise
            else:
                if last.status_code == 200:
                    return last.text

            # Back off only when another attempt will actually follow.
            if attempt < retries:
                await asyncio.sleep(delay)
                delay *= 2

    status = last.status_code if last is not None else "No response"
    raise RuntimeError(f"arXiv API failed: {status}")
+# ---------------------------------------------------------------------
+# Cached fetch + parse
+# ---------------------------------------------------------------------
# In-memory result cache: (query, max_results) -> (stored_at, papers).
# NOTE: functools.lru_cache must not decorate a coroutine function. It would
# cache the coroutine *object*, and awaiting that object a second time raises
# "RuntimeError: cannot reuse already awaited coroutine". It also has no
# time-based expiry, so an explicit TTL cache is used instead.
_CACHE_TTL_SECONDS = 6 * 60 * 60
_CACHE_MAX_ENTRIES = 256
_RESULT_CACHE: Dict[tuple, tuple] = {}


async def fetch_arxiv(query: str, *, max_results: int = 5) -> List[Dict]:
    """Return a list of arXiv paper dicts compatible with `schemas.Paper`.

    Results are cached in memory for six hours per ``(query, max_results)``
    pair; at most 256 pairs are kept, the oldest being evicted first.

    Args:
        query: Free-text search string.
        max_results: Number of papers to request (clamped by ``_fetch_raw``).

    Returns:
        A fresh list of dicts with the keys ``title``, ``authors``,
        ``summary``, ``link``, ``published`` and ``source``.  The dicts are
        copies, so callers may mutate them without affecting the cache.

    Raises:
        RuntimeError: Propagated from ``_fetch_raw`` when the API request fails.
    """
    import time  # local import: only this function needs it

    key = (query, max_results)
    cached = _RESULT_CACHE.get(key)
    if cached is not None and time.monotonic() - cached[0] < _CACHE_TTL_SECONDS:
        return [dict(paper) for paper in cached[1]]

    xml_text = await _fetch_raw(query, max_results)
    # feedparser is blocking; run in thread
    feed = await asyncio.to_thread(feedparser.parse, xml_text)

    results: List[Dict] = []
    for ent in feed.entries:
        # Skip author records without a usable name instead of raising.
        names = [
            name
            for author in (getattr(ent, "authors", None) or [])
            if (name := getattr(author, "name", ""))
        ]
        results.append({
            "title": getattr(ent, "title", "[No title]"),
            "authors": ", ".join(names) or "Unknown",
            "summary": getattr(ent, "summary", ""),
            "link": getattr(ent, "link", ""),
            "published": getattr(ent, "published", ""),
            "source": "arXiv",
        })

    # Re-insert the key so dict insertion order tracks entry age, then evict
    # the oldest entries until there is room for the new one.
    _RESULT_CACHE.pop(key, None)
    while len(_RESULT_CACHE) >= _CACHE_MAX_ENTRIES:
        _RESULT_CACHE.pop(next(iter(_RESULT_CACHE)))
    _RESULT_CACHE[key] = (time.monotonic(), results)

    return [dict(paper) for paper in results]


# Kept for callers that used the attribute exposed by the former lru_cache wrapper.
fetch_arxiv.cache_clear = _RESULT_CACHE.clear
+# ---------------------------------------------------------------------
+# CLI demo
+# ---------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio
    import json

    async def _demo():
        """Fetch three sample papers and print the start of their JSON dump."""
        sample = await fetch_arxiv("glioblastoma CRISPR", max_results=3)
        rendered = json.dumps(sample, indent=2)
        # Only the first 500 characters are shown to keep the output short.
        print(rendered[:500])

    asyncio.run(_demo())