|
|
"""bioRxiv/medRxiv preprint search tool.""" |
|
|
|
|
|
import re |
|
|
from datetime import datetime, timedelta |
|
|
from typing import Any |
|
|
|
|
|
import httpx |
|
|
from tenacity import retry, stop_after_attempt, wait_exponential |
|
|
|
|
|
from src.utils.exceptions import SearchError |
|
|
from src.utils.models import Citation, Evidence |
|
|
|
|
|
|
|
|
class BioRxivTool:
    """Search tool for bioRxiv and medRxiv preprints.

    The details endpoint used here offers no keyword search, so ``search``
    downloads the listing for a recent date window and ranks the entries
    client-side by how many query terms occur in the title or abstract.
    """

    BASE_URL = "https://api.biorxiv.org/details"
    DEFAULT_SERVER = "medrxiv"
    DEFAULT_DAYS = 90

    # Words dropped when turning a free-text query into search terms.
    _STOP_WORDS = frozenset(
        {"the", "a", "an", "in", "on", "for", "and", "or", "of", "to"}
    )

    def __init__(self, server: str = DEFAULT_SERVER, days: int = DEFAULT_DAYS) -> None:
        """
        Initialize bioRxiv tool.

        Args:
            server: "biorxiv" or "medrxiv"
            days: How many days back to search
        """
        self.server = server
        self.days = days

    @property
    def name(self) -> str:
        """Return the identifier of this tool ("biorxiv")."""
        return "biorxiv"

    # No ``retry=`` predicate is given, so any exception raised by ``search``
    # (including SearchError) triggers another attempt, up to three in total.
    # ``reraise=True`` surfaces the last original exception to the caller.
    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=10),
        reraise=True,
    )
    async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
        """
        Search bioRxiv/medRxiv for preprints matching query.

        Note: bioRxiv API doesn't support keyword search directly.
        We fetch recent papers and filter client-side.

        NOTE(review): only cursor ``0`` is requested, so just the first page
        of the date window is searched; confirm the API page size and whether
        paging through further cursors is needed.

        Args:
            query: Search query (keywords)
            max_results: Maximum results to return

        Returns:
            List of Evidence objects from preprints, best keyword match first.
            Empty when the query has no usable terms or ``max_results`` is
            not positive.

        Raises:
            SearchError: If the request fails, the server answers with an
                error status, or the response body is not valid JSON.
        """
        query_terms = self._extract_terms(query)
        # Nothing can match in these cases, so skip the network round-trip.
        if not query_terms or max_results <= 0:
            return []

        # Read the clock once so both ends of the interval share one "now".
        now = datetime.now()
        end_date = now.strftime("%Y-%m-%d")
        start_date = (now - timedelta(days=self.days)).strftime("%Y-%m-%d")
        interval = f"{start_date}/{end_date}"

        url = f"{self.BASE_URL}/{self.server}/{interval}/0/json"

        async with httpx.AsyncClient(timeout=30.0) as client:
            try:
                response = await client.get(url)
                response.raise_for_status()
            except httpx.HTTPStatusError as e:
                raise SearchError(f"bioRxiv search failed: {e}") from e
            except httpx.RequestError as e:
                raise SearchError(f"bioRxiv connection failed: {e}") from e

            try:
                data = response.json()
            except ValueError as e:
                # JSON decoding errors are ValueError subclasses; report them
                # through the same exception type as the other failures.
                raise SearchError(f"bioRxiv returned invalid JSON: {e}") from e

        # Tolerate a missing or null "collection" and non-object payloads.
        papers = (data.get("collection") or []) if isinstance(data, dict) else []

        matching = self._filter_by_keywords(papers, query_terms, max_results)
        return [self._paper_to_evidence(paper) for paper in matching]

    def _extract_terms(self, query: str) -> list[str]:
        """Extract search terms from query.

        The query is lower-cased and split into word tokens; stop words and
        tokens of two characters or fewer are dropped. Repeated words are
        kept once (first occurrence) so they cannot inflate a paper's score.
        """
        words = re.findall(r"\b\w+\b", (query or "").lower())
        kept = (w for w in words if len(w) > 2 and w not in self._STOP_WORDS)
        # dict.fromkeys removes duplicates while preserving order.
        return list(dict.fromkeys(kept))

    def _filter_by_keywords(
        self, papers: list[dict[str, Any]], terms: list[str], max_results: int
    ) -> list[dict[str, Any]]:
        """Filter papers that contain query terms in title or abstract.

        Each paper is scored by the number of terms that occur as substrings
        of its lower-cased title plus abstract. Papers with no hit are
        dropped; the rest are returned by descending score (ties keep their
        input order), limited to ``max_results`` entries.
        """
        if not terms or max_results <= 0:
            return []

        scored_papers = []
        for paper in papers:
            # ``or ""`` also covers keys that are present but null.
            title = paper.get("title") or ""
            abstract = paper.get("abstract") or ""
            text = f"{title} {abstract}".lower()

            matches = sum(1 for term in terms if term in text)
            if matches > 0:
                scored_papers.append((matches, paper))

        # list.sort is stable, so equally scored papers keep their order.
        scored_papers.sort(key=lambda scored: scored[0], reverse=True)
        return [paper for _, paper in scored_papers[:max_results]]

    def _paper_to_evidence(self, paper: dict[str, Any]) -> Evidence:
        """Convert a preprint paper to Evidence.

        Missing, null or empty fields fall back to placeholder text. The
        abstract is cut to 1800 characters (with "..." appended when cut),
        the content to 2000 and the title to 500; at most five authors are
        kept. Without a DOI the URL falls back to the configured server's
        home page.
        """
        doi = paper.get("doi") or ""
        title = paper.get("title") or "Untitled"
        authors_str = paper.get("authors") or "Unknown"
        date = paper.get("date") or "Unknown"
        abstract = paper.get("abstract") or "No abstract available."
        category = paper.get("category") or ""

        # Authors arrive as one ";"-separated string; drop blank fragments
        # (e.g. from a trailing ";") before keeping the first five.
        authors = [a.strip() for a in authors_str.split(";") if a.strip()][:5]

        truncated_abstract = abstract[:1800]
        suffix = "..." if len(abstract) > 1800 else ""

        content = (
            f"[PREPRINT - Not peer-reviewed] {truncated_abstract}{suffix} Category: {category}."
        )

        # Point the no-DOI fallback at the server this tool was configured for.
        url = f"https://doi.org/{doi}" if doi else f"https://www.{self.server}.org/"

        return Evidence(
            content=content[:2000],
            citation=Citation(
                source="biorxiv",
                title=title[:500],
                url=url,
                date=date,
                authors=authors,
            ),
            relevance=0.75,
        )
|
|
|