"""MedGenesis – PubMed async fetcher (NCBI E-utilities).

Improvements
~~~~~~~~~~~~
* Uses **ESearch → EFetch** pipeline with sane timeouts.
* Accepts optional `retmax` but caps at 25 to respect fair-use.
* Caches EFetch XML for 12 h (ids string as key).
* Robust date / author / abstract extraction handles edge-cases.
* Returns list of dicts ready for `schemas.Paper`.
"""
| from __future__ import annotations |
|
|
| import asyncio, os, time, xmltodict, httpx |
| from functools import lru_cache |
| from typing import List, Dict |
|
|
| _ESEARCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" |
| _EFETCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" |
| _API_KEY = os.getenv("PUB_KEY") |
|
|
| _TIMEOUT = 15 |
| _MAX_RET = 25 |
|
|
| |
| |
| |
|
|
async def _esearch(query: str, retmax: int) -> List[str]:
    """Run an ESearch query and return the list of matching PubMed IDs.

    `retmax` is capped at the module fair-use limit; the NCBI API key is
    attached when the `PUB_KEY` environment variable is set.
    """
    payload = {
        "db": "pubmed",
        "term": query,
        "retmax": min(retmax, _MAX_RET),  # never exceed the fair-use cap
        "retmode": "json",
    }
    if _API_KEY:
        payload["api_key"] = _API_KEY

    async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
        resp = await client.get(_ESEARCH, params=payload)
        resp.raise_for_status()
        body = resp.json()

    # Missing "idlist" (e.g. zero hits) degrades to an empty list.
    return body["esearchresult"].get("idlist", [])
|
|
|
|
# BUG FIX: `@lru_cache` on an `async def` caches the *coroutine object*, not
# its result — the second call with the same `ids` re-awaits an exhausted
# coroutine and raises "RuntimeError: cannot reuse already awaited coroutine".
# It also provided no expiry, despite the module's advertised 12 h cache.
# We therefore cache the parsed article list ourselves with a real TTL.
_EFETCH_TTL_S = 12 * 60 * 60          # 12 hours, in seconds
_efetch_cache: Dict[str, tuple] = {}  # ids -> (fetched_at, parsed articles)


async def _efetch(ids: str) -> List[Dict]:
    """Fetch PubMed XML for comma-separated *ids* and return article dicts.

    Results are cached in-process for 12 h per unique ``ids`` string, so
    repeated queries for the same ID set hit NCBI only once.
    """
    now = time.time()
    hit = _efetch_cache.get(ids)
    if hit is not None and now - hit[0] < _EFETCH_TTL_S:
        return hit[1]

    params = {
        "db": "pubmed",
        "id": ids,
        "retmode": "xml",
    }
    if _API_KEY:
        params["api_key"] = _API_KEY

    async with httpx.AsyncClient(timeout=_TIMEOUT) as cli:
        r = await cli.get(_EFETCH, params=params)
        r.raise_for_status()
        xml = r.text

    # xmltodict returns a dict for a single article, a list for several —
    # normalise to a list either way.
    parsed = xmltodict.parse(xml).get("PubmedArticleSet", {}).get("PubmedArticle", [])
    articles = parsed if isinstance(parsed, list) else [parsed]

    _efetch_cache[ids] = (now, articles)
    return articles
|
|
|
|
| |
| |
| |
|
|
def _author_str(meta: Dict) -> str:
    """Join "LastName ForeName" for each author that has a LastName; "Unknown" if none."""
    raw = meta.get("AuthorList", {}).get("Author", [])
    if isinstance(raw, dict):  # single author → dict instead of list
        raw = [raw]
    joined = ", ".join(
        f"{a.get('LastName','')} {a.get('ForeName','')}".strip()
        for a in raw if a.get("LastName")
    )
    return joined or "Unknown"


def _abstract_str(meta: Dict) -> str:
    """Flatten AbstractText, which xmltodict may give as str, dict or list of either."""
    abstr = meta.get("Abstract", {}).get("AbstractText", "")
    if isinstance(abstr, list):
        return " ".join(
            seg.get("#text", str(seg)) if isinstance(seg, dict) else str(seg)
            for seg in abstr
        )
    if isinstance(abstr, dict):
        return abstr.get("#text", "")
    return abstr or ""


def _pub_year(meta: Dict) -> str:
    """Best-effort publication year: ArticleDate first, then Journal PubDate."""
    art_date = meta.get("ArticleDate")
    if isinstance(art_date, dict):
        year = art_date.get("Year", "")
    elif isinstance(art_date, list) and art_date:
        year = art_date[0].get("Year", "")
    else:
        year = ""
    if not year:
        pubdate = meta.get("Journal", {}).get("JournalIssue", {}).get("PubDate", {})
        year = pubdate.get("Year") or pubdate.get("MedlineDate", "")
    return year


async def fetch_pubmed(query: str, *, max_results: int = 5) -> List[Dict]:
    """Return the latest PubMed papers for *query* as simple dicts.

    Each dict has the keys ``title``, ``authors``, ``summary``, ``link``,
    ``published`` and ``source`` — ready for ``schemas.Paper``.
    `max_results` is capped downstream at the fair-use limit.
    """
    ids = await _esearch(query, max_results)
    if not ids:
        return []

    results: List[Dict] = []
    for art in await _efetch(",".join(ids)):
        meta = art["MedlineCitation"]["Article"]

        # PMID may be a plain scalar or a {"#text": ..., "@Version": ...} dict.
        pmid = art["MedlineCitation"]["PMID"]
        pmid = pmid.get("#text") if isinstance(pmid, dict) else str(pmid)

        results.append({
            "title"    : meta.get("ArticleTitle", "[No title]"),
            "authors"  : _author_str(meta),
            "summary"  : _abstract_str(meta),
            "link"     : f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
            "published": _pub_year(meta),
            "source"   : "PubMed",
        })

    return results
|
|