"""
PaleoData Explorer — PBDB & Macrostrat API Client
==================================================

Provides thin wrappers around the Paleobiology Database (PBDB) occurrence
endpoint, the Wikipedia page-summary endpoint and (optionally) the
Macrostrat interval endpoint.

Error-handling policy
---------------------
* ``fetch_occurrences`` logs and **re-raises** network / HTTP / JSON errors,
  because the caller cannot do anything useful without occurrence data.
* ``fetch_wikipedia_profile`` and ``fetch_macrostrat_intervals`` are optional
  helpers: they never raise for network or parsing failures and return an
  ``{"error": ...}`` dict or an empty list respectively.

Domain notes
------------
* Geological time is in "Ma" (Mega-annum, millions of years ago).
* The "show" parameter must include `paleoloc` (paleocoordinates), `phylo`
  (phylogeny / taxonomy) and `time,ident` so the returned JSON carries
  paleocoordinates, taxonomic hierarchies and temporal bounds.
* The PBDB API returns `records` inside a top-level key; we safely unwrap
  that in `fetch_occurrences`.
"""

import logging
import urllib.parse
from typing import Any, Dict, List, Optional

import requests

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

PBDB_OCCURRENCE_URL: str = "https://paleobiodb.org/data1.2/occs/list.json"
MACROSTRAT_INTERVALS_URL: str = "https://macrostrat.org/api/v2/defs/intervals"
WIKIPEDIA_SUMMARY_URL: str = "https://en.wikipedia.org/api/rest_v1/page/summary"

# Wikimedia asks API clients to identify themselves with a descriptive UA.
WIKIPEDIA_USER_AGENT: str = (
    "PaleoDataExplorer/1.0 (educational tool; https://github.com/anomalyco/opencode)"
)

DEFAULT_LIMIT: int = 1000
DEFAULT_SHOW: str = "paleoloc,phylo,time,ident"
REQUEST_TIMEOUT: int = 30  # seconds


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------

def _safe_get(url: str, params: Dict[str, Any]) -> requests.Response:
    """Perform a GET request with a timeout and HTTP status checking.

    The response body is *not* parsed here; callers decode JSON themselves.

    Parameters
    ----------
    url : str
        Absolute URL to request.
    params : dict
        Query-string parameters, passed straight to ``requests.get``.

    Returns
    -------
    requests.Response
        The response, guaranteed to have a non-error status code.

    Raises
    ------
    requests.exceptions.Timeout
        When the request hangs past *REQUEST_TIMEOUT*.
    requests.exceptions.HTTPError
        On 4xx / 5xx responses.
    requests.exceptions.RequestException
        On any other transport-level failure (DNS, connection reset, ...).
    """
    logger.debug("GET %s | params=%s", url, params)
    resp = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    return resp


def _as_dict(value: Any) -> Dict[str, Any]:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _unwrap_macrostrat_data(payload: Any) -> Optional[List[Dict[str, Any]]]:
    """Extract the interval list from a decoded Macrostrat response.

    Accepts either the documented ``{"success": {"data": [...]}}`` envelope
    or a bare list. Returns ``None`` when neither shape matches.
    """
    if isinstance(payload, dict):
        payload = _as_dict(payload.get("success")).get("data")
    return payload if isinstance(payload, list) else None


# ---------------------------------------------------------------------------
# PBDB Occurrence fetch
# ---------------------------------------------------------------------------

def fetch_occurrences(
    base_name: str,
    *,
    max_ma: Optional[float] = None,
    min_ma: Optional[float] = None,
    limit: int = DEFAULT_LIMIT,
    show: str = DEFAULT_SHOW,
) -> List[Dict[str, Any]]:
    """Fetch fossil occurrence records from the PBDB API.

    Parameters
    ----------
    base_name : str
        Taxonomic clade or genus name, e.g. ``"Ceratopsidae"`` or
        ``"Tyrannosaurus"``. PBDB resolves this hierarchically so all
        subordinate taxa are included automatically.
    max_ma, min_ma : float or None
        Optional temporal window in Ma. Each bound is only sent to the API
        when it is not ``None``.
    limit : int
        Maximum number of records to return (default 1 000).
    show : str
        Comma-separated list of PBDB "show" fields. Must include at least
        ``paleoloc,phylo,time,ident`` for the downstream pipeline.

    Returns
    -------
    list[dict]
        List of raw occurrence records. Returns an empty list when the API
        returns no records or the decoded JSON does not have the expected
        ``{"records": [...]}`` shape.

    Raises
    ------
    requests.exceptions.Timeout
        If the PBDB API does not respond within *REQUEST_TIMEOUT*.
    requests.exceptions.HTTPError
        If the API returns a 4xx / 5xx status.
    requests.exceptions.RequestException
        On any other transport-level failure.
    ValueError
        If the response body cannot be parsed as JSON.
    """
    # NOTE(review): PBDB's ``.json`` output may default to the compact
    # vocabulary (e.g. ``pla``/``pln``/``eag``/``lag``) rather than the long
    # field names (``paleolat``/``paleolng``/``max_ma``/``min_ma``). If the
    # downstream pipeline expects the long names, ``vocab=pbdb`` probably
    # needs to be added here — confirm against the PBDB docs and callers.
    params: Dict[str, Any] = {
        "base_name": base_name,
        "show": show,
        "limit": limit,
    }
    if max_ma is not None:
        params["max_ma"] = max_ma
    if min_ma is not None:
        params["min_ma"] = min_ma

    logger.info("Querying PBDB with base_name=%r", base_name)

    try:
        resp = _safe_get(PBDB_OCCURRENCE_URL, params)
    except requests.exceptions.Timeout:
        logger.error("PBDB request timed out after %d s", REQUEST_TIMEOUT)
        raise
    except requests.exceptions.HTTPError as exc:
        status = exc.response.status_code if exc.response is not None else "unknown"
        logger.error("PBDB request failed (HTTP %s)", status)
        raise
    except requests.exceptions.RequestException as exc:
        logger.error("PBDB request failed: %s", exc)
        raise

    try:
        data = resp.json()
    except ValueError:
        logger.error("PBDB response body is not valid JSON")
        raise

    # Valid JSON is not necessarily a JSON *object*; guard before ``.get`` so
    # a malformed payload yields [] (as documented) instead of AttributeError.
    if not isinstance(data, dict):
        logger.warning(
            "PBDB response is not a JSON object (got %s)", type(data).__name__
        )
        return []

    records = data.get("records")
    if not isinstance(records, list) or not records:
        logger.warning("PBDB returned zero records for base_name=%r", base_name)
        return []

    logger.info("PBDB returned %d records", len(records))
    return records


# ---------------------------------------------------------------------------
# Wikipedia profile fetch (optional helper)
# ---------------------------------------------------------------------------

def fetch_wikipedia_profile(taxon_name: str) -> Dict[str, Any]:
    """Fetch a short summary and thumbnail for a taxon from Wikipedia.

    Uses the Wikimedia REST API ``/page/summary/{title}`` endpoint. Network,
    HTTP and parsing failures are logged and reported through an
    ``"error"`` key so callers never crash.

    Parameters
    ----------
    taxon_name : str
        The Wikipedia article title, e.g. ``"Triceratops"`` or
        ``"Tyrannosaurus"``. Surrounding whitespace is ignored.

    Returns
    -------
    dict
        On success: ``{"extract": str, "image_url": str|None, "page_url": str}``.
        On failure: ``{"error": str}``.
    """
    title = taxon_name.strip() if isinstance(taxon_name, str) else ""
    if not title:
        # A blank title would hit ``/page/summary/`` itself; fail fast
        # without spending a network round-trip.
        logger.warning("No taxon name supplied for Wikipedia lookup")
        return {"error": "No taxon name supplied."}

    # ``safe=""`` also escapes "/" so a title can never add path segments.
    safe_title = urllib.parse.quote(title, safe="")
    url = f"{WIKIPEDIA_SUMMARY_URL}/{safe_title}"
    logger.info("Fetching Wikipedia summary for %r", taxon_name)

    try:
        resp = requests.get(
            url,
            timeout=REQUEST_TIMEOUT,
            headers={"User-Agent": WIKIPEDIA_USER_AGENT},
        )
        if resp.status_code == 404:
            logger.warning("Wikipedia page not found for %r", taxon_name)
            return {"error": f"No Wikipedia article found for '{taxon_name}'."}
        resp.raise_for_status()
    except requests.exceptions.Timeout:
        logger.error("Wikipedia request timed out for %r", taxon_name)
        return {"error": "Wikipedia request timed out."}
    except requests.exceptions.RequestException as exc:
        logger.error("Wikipedia request failed for %r: %s", taxon_name, exc)
        return {"error": f"Wikipedia request failed: {exc}"}

    try:
        data = resp.json()
    except ValueError:
        logger.error("Wikipedia response is not valid JSON for %r", taxon_name)
        return {"error": "Invalid response from Wikipedia."}

    if not isinstance(data, dict):
        logger.error("Wikipedia response is not a JSON object for %r", taxon_name)
        return {"error": "Invalid response from Wikipedia."}

    # Any nested level may be missing or ``null``; normalise each to a dict
    # before descending so a sparse payload cannot raise AttributeError.
    thumbnail = _as_dict(data.get("thumbnail"))
    desktop_urls = _as_dict(_as_dict(data.get("content_urls")).get("desktop"))

    return {
        "extract": data.get("extract") or "",
        "image_url": thumbnail.get("source"),
        "page_url": desktop_urls.get("page") or "",
    }


# ---------------------------------------------------------------------------
# Macrostrat interval fetch (optional helper)
# ---------------------------------------------------------------------------

def fetch_macrostrat_intervals() -> List[Dict[str, Any]]:
    """Fetch the Macrostrat interval definitions (geological periods).

    Useful for mapping absolute Ma values to named periods. Returns an empty
    list on failure so callers can fall back gracefully.

    Returns
    -------
    list[dict]
        The interval records from the response's ``success.data`` list
        (per the Macrostrat docs each carries keys such as ``name``,
        ``t_age``, ``b_age``, ``color``). Empty when the request fails, the
        body is not JSON, or the payload has an unexpected shape.
    """
    logger.info("Querying Macrostrat interval definitions")
    try:
        resp = _safe_get(MACROSTRAT_INTERVALS_URL, {"all": True, "format": "json"})
        payload = resp.json()
    except (requests.exceptions.RequestException, ValueError):
        logger.exception("Failed to fetch Macrostrat intervals")
        return []

    # Macrostrat v2 wraps results as {"success": {"data": [...]}}; returning
    # the raw payload would hand callers a dict instead of the documented list.
    intervals = _unwrap_macrostrat_data(payload)
    if intervals is None:
        logger.error("Macrostrat response has an unexpected structure")
        return []
    return intervals