# Spaces: VibecoderMcSwaggins / DeepBoner (Paused)
# File size: 6,407 Bytes

"""OpenAlex search tool - citation-aware scholarly search."""

import re
from typing import Any

import httpx
from tenacity import retry, stop_after_attempt, wait_exponential

from src.utils.exceptions import SearchError
from src.utils.models import Citation, Evidence


class OpenAlexTool:
    """
    Search OpenAlex for scholarly works with citation metrics.

    OpenAlex indexes 209M+ works and provides:
    - Citation counts (prioritize influential papers)
    - Concept tagging (hierarchical classification)
    - Open access links (direct PDF URLs)
    - Related works (ML-powered similarity)

    API Docs: https://docs.openalex.org
    Rate Limits: Polite pool with mailto = 100k/day
    """

    BASE_URL = "https://api.openalex.org/works"
    POLITE_EMAIL = "deepboner-research@proton.me"

    # Captures the trailing run of digits in a URL, with or without a final slash.
    _PMID_PATTERN = re.compile(r"/(\d+)/?$")

    @property
    def name(self) -> str:
        """Return the identifier of this search tool ("openalex")."""
        return "openalex"

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=10),
        reraise=True,
    )
    async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
        """
        Search OpenAlex, sorted by citation count.

        The call is attempted up to 3 times with exponential backoff; the
        last exception is re-raised if every attempt fails.

        Args:
            query: Search terms
            max_results: Maximum results to return. Values <= 0 yield an
                empty list without contacting the API. At most 100 results
                are requested from the API.

        Returns:
            List of Evidence objects with citation metadata

        Raises:
            SearchError: On an HTTP error status, a connection failure, or
                a response body that is not valid JSON.
        """
        if max_results <= 0:
            # Nothing to fetch; also avoids sending a non-positive page size.
            return []

        params: dict[str, str | int] = {
            "search": query,
            "filter": "type:article,has_abstract:true",  # Only articles with abstracts
            "sort": "cited_by_count:desc",  # Most cited first
            "per_page": min(max_results, 100),
            "mailto": self.POLITE_EMAIL,
        }

        async with httpx.AsyncClient(timeout=30.0) as client:
            try:
                response = await client.get(self.BASE_URL, params=params)
                response.raise_for_status()
                data = response.json()
            except httpx.HTTPStatusError as e:
                raise SearchError(f"OpenAlex API error: {e}") from e
            except httpx.RequestError as e:
                raise SearchError(f"OpenAlex connection failed: {e}") from e
            except ValueError as e:
                # JSON decoding errors are ValueError subclasses; surface them
                # through the same exception type as other search failures.
                raise SearchError(f"OpenAlex returned invalid JSON: {e}") from e

        works = data.get("results") or []
        return [self._to_evidence(work) for work in works[:max_results]]

    def _to_evidence(self, work: dict[str, Any]) -> Evidence:
        """
        Convert an OpenAlex work record to Evidence with rich metadata.

        A key may be present with a JSON null value, in which case
        ``dict.get(key, default)`` returns None rather than the default.
        Every field is therefore read with an ``or`` fallback so that both
        missing and null values degrade to a safe default.

        Args:
            work: One element of the API response's "results" list.

        Returns:
            Evidence whose content is the (possibly truncated) abstract
            prefixed with a citation badge, and whose relevance is the
            citation count scaled to [0, 1] (1.0 at 100+ citations).
        """
        # Basic fields (null-safe)
        title = work.get("display_name") or "Untitled"
        doi = work.get("doi") or ""
        year = work.get("publication_year") or "Unknown"
        cited_by_count = work.get("cited_by_count") or 0

        # Reconstruct abstract from inverted index
        abstract = self._reconstruct_abstract(work.get("abstract_inverted_index"))
        if not abstract:
            # Should be caught by filter=has_abstract:true, but defensive coding
            abstract = f"[No abstract available. Cited by {cited_by_count} works.]"

        # Authors (first 5 authorships) and concepts (top 5 by score)
        authors = self._extract_authors(work.get("authorships") or [])
        concepts = self._extract_concepts(work.get("concepts") or [])

        # Open access info and PDF URL (from best_oa_location)
        is_oa = bool((work.get("open_access") or {}).get("is_oa"))
        pdf_url = (work.get("best_oa_location") or {}).get("pdf_url")

        # Build URL: prefer the DOI, then the OpenAlex ID, then the site root
        if doi:
            url = doi if doi.startswith("http") else f"https://doi.org/{doi}"
        else:
            url = work.get("id") or "https://openalex.org"

        # PMID from the ids object, stored in metadata for deduplication
        pmid = self._extract_pmid((work.get("ids") or {}).get("pmid"))

        # Prepend citation badge to content
        citation_badge = f"[Cited by {cited_by_count}] " if cited_by_count > 0 else ""
        content = f"{citation_badge}{abstract[:1900]}"

        # Relevance: normalized citation count (capped at 1.0 for 100 citations).
        # 100 citations is a very strong signal in most fields.
        relevance = min(1.0, cited_by_count / 100.0)

        return Evidence(
            content=content[:2000],
            citation=Citation(
                source="openalex",
                title=title[:500],
                url=url,
                date=str(year),
                authors=authors,
            ),
            relevance=relevance,
            metadata={
                "cited_by_count": cited_by_count,
                "concepts": concepts,
                "is_open_access": is_oa,
                "pdf_url": pdf_url,
                "pmid": pmid,
            },
        )

    def _extract_pmid(self, pmid_url: Any) -> str | None:
        """
        Extract the numeric PMID from a PubMed URL.

        Args:
            pmid_url: Value of ids["pmid"], e.g.
                "https://pubmed.ncbi.nlm.nih.gov/29456894"; may be None or
                a non-string.

        Returns:
            The trailing digits as a string, or None when the value is not
            a PubMed URL ending in digits.
        """
        if not isinstance(pmid_url, str) or "pubmed.ncbi.nlm.nih.gov" not in pmid_url:
            return None
        match = self._PMID_PATTERN.search(pmid_url)
        return match.group(1) if match else None

    def _reconstruct_abstract(self, inverted_index: dict[str, list[int]] | None) -> str:
        """
        Rebuild abstract text from the {"word": [positions]} format.

        Words are emitted in ascending position order, separated by single
        spaces. Positions absent from the index are skipped, so gaps do not
        produce doubled spaces.

        Args:
            inverted_index: Mapping of word to the positions it occupies,
                or None/empty when no abstract is available.

        Returns:
            The reconstructed abstract, or "" when there is nothing to build.
        """
        if not inverted_index:
            return ""

        position_word: dict[int, str] = {}
        for word, positions in inverted_index.items():
            for pos in positions or ():
                position_word[pos] = word

        # Keys are unique, so sorting the items orders purely by position.
        return " ".join(word for _, word in sorted(position_word.items()))

    def _extract_authors(self, authorships: list[dict[str, Any]]) -> list[str]:
        """
        Extract author names from the authorships array.

        Only the first 5 authorships are considered; entries whose author
        object or display name is missing or null are skipped.
        """
        return [
            name
            for authorship in authorships[:5]
            if (name := (authorship.get("author") or {}).get("display_name"))
        ]

    def _extract_concepts(self, concepts: list[dict[str, Any]]) -> list[str]:
        """
        Extract concept names, sorted by score (highest first).

        The 5 highest-scoring concepts are taken, then those without a
        display name are dropped. A missing or null score ranks as 0.
        """
        ranked = sorted(concepts, key=lambda c: c.get("score") or 0, reverse=True)
        return [c["display_name"] for c in ranked[:5] if c.get("display_name")]