Spaces:
Sleeping
Sleeping
| """ | |
| PaleoData Explorer — PBDB & Macrostrat API Client | |
| ================================================== | |
| Provides robust, cached-friendly wrappers around the Paleobiology Database | |
| (PBDB) occurrence endpoint and (optionally) the Macrostrat interval | |
| endpoint. Every function handles timeouts, HTTP errors, and empty | |
| responses gracefully. | |
| Domain notes | |
| ------------ | |
| * Geological time is in "Ma" (Mega-annum, millions of years ago). | |
| * The "show" parameter must include `paleoloc` (paleocoordinates), | |
| `phylo` (phylogeny / taxonomy) and `time,ident` so the returned JSON | |
| carries `paleolat`, `paleolng`, taxonomic hierarchies and temporal | |
| bounds (`max_ma`, `min_ma`). | |
| * The PBDB API returns `records` inside a top-level key; we safely | |
| unwrap that in `fetch_occurrences`. | |
| """ | |
| import logging | |
| from typing import Any, Dict, List, Optional | |
| import requests | |
| logger = logging.getLogger(__name__) | |
| # --------------------------------------------------------------------------- | |
| # Constants | |
| # --------------------------------------------------------------------------- | |
| PBDB_OCCURRENCE_URL: str = "https://paleobiodb.org/data1.2/occs/list.json" | |
| MACROSTRAT_INTERVALS_URL: str = "https://macrostrat.org/api/v2/defs/intervals" | |
| DEFAULT_LIMIT: int = 1000 | |
| DEFAULT_SHOW: str = "paleoloc,phylo,time,ident" | |
| REQUEST_TIMEOUT: int = 30 # seconds | |
| # --------------------------------------------------------------------------- | |
| # Public helpers | |
| # --------------------------------------------------------------------------- | |
| def _safe_get(url: str, params: Dict[str, Any]) -> requests.Response: | |
| """Perform a GET request with standardised error handling. | |
| Raises | |
| ------ | |
| requests.exceptions.Timeout | |
| When the request hangs past *REQUEST_TIMEOUT*. | |
| requests.exceptions.HTTPError | |
| On 4xx / 5xx responses. | |
| ValueError | |
| When the response body is not valid JSON. | |
| """ | |
| logger.debug("GET %s | params=%s", url, params) | |
| resp = requests.get(url, params=params, timeout=REQUEST_TIMEOUT) | |
| resp.raise_for_status() | |
| return resp | |
| # --------------------------------------------------------------------------- | |
| # PBDB Occurrence fetch | |
| # --------------------------------------------------------------------------- | |
| def fetch_occurrences( | |
| base_name: str, | |
| *, | |
| max_ma: Optional[float] = None, | |
| min_ma: Optional[float] = None, | |
| limit: int = DEFAULT_LIMIT, | |
| show: str = DEFAULT_SHOW, | |
| ) -> List[Dict[str, Any]]: | |
| """Fetch fossil occurrence records from the PBDB API. | |
| Parameters | |
| ---------- | |
| base_name : str | |
| Taxonomic clade or genus name, e.g. ``"Ceratopsidae"`` or | |
| ``"Tyrannosaurus"``. PBDB resolves this hierarchically so all | |
| subordinate taxa are included automatically. | |
| max_ma, min_ma : float or None | |
| Optional temporal window in Ma. When supplied the API filters | |
| occurrences to those whose age range overlaps this window. | |
| limit : int | |
| Maximum number of records to return (default 1 000). | |
| show : str | |
| Comma-separated list of PBDB "show" fields. Must include at | |
| least ``paleoloc,phylo,time,ident`` for the downstream pipeline. | |
| Returns | |
| ------- | |
| list[dict] | |
| List of raw occurrence records. Returns an empty list when the | |
| API returns no records or the response is malformed. | |
| Raises | |
| ------ | |
| requests.exceptions.Timeout | |
| If the PBDB API does not respond within *REQUEST_TIMEOUT*. | |
| requests.exceptions.HTTPError | |
| If the API returns a non-200 status. | |
| ValueError | |
| If the response body cannot be parsed as JSON. | |
| """ | |
| params: Dict[str, Any] = { | |
| "base_name": base_name, | |
| "show": show, | |
| "limit": limit, | |
| } | |
| if max_ma is not None: | |
| params["max_ma"] = max_ma | |
| if min_ma is not None: | |
| params["min_ma"] = min_ma | |
| logger.info("Querying PBDB with base_name=%r", base_name) | |
| try: | |
| resp = _safe_get(PBDB_OCCURRENCE_URL, params) | |
| except requests.exceptions.Timeout: | |
| logger.error("PBDB request timed out after %d s", REQUEST_TIMEOUT) | |
| raise | |
| except requests.exceptions.HTTPError as exc: | |
| logger.error("PBDB request failed (HTTP %s)", exc.response.status_code if exc.response is not None else "unknown") | |
| raise | |
| except requests.exceptions.RequestException as exc: | |
| logger.error("PBDB request failed: %s", exc) | |
| raise | |
| try: | |
| data = resp.json() | |
| except ValueError: | |
| logger.error("PBDB response body is not valid JSON") | |
| raise | |
| records: List[Dict[str, Any]] = data.get("records", []) | |
| if not records: | |
| logger.warning("PBDB returned zero records for base_name=%r", base_name) | |
| logger.info("PBDB returned %d records", len(records)) | |
| return records | |
| # --------------------------------------------------------------------------- | |
| # Wikipedia profile fetch (optional helper) | |
| # --------------------------------------------------------------------------- | |
| WIKIPEDIA_SUMMARY_URL: str = "https://en.wikipedia.org/api/rest_v1/page/summary" | |
| def fetch_wikipedia_profile(taxon_name: str) -> Dict[str, Any]: | |
| """Fetch a short summary and thumbnail for a taxon from Wikipedia. | |
| Uses the Wikimedia REST API ``/page/summary/{title}`` endpoint. | |
| Failures are caught silently and an empty dict (or dict with an | |
| ``"error"`` key) is returned so callers never crash. | |
| Parameters | |
| ---------- | |
| taxon_name : str | |
| The Wikipedia article title, e.g. ``"Triceratops"`` or | |
| ``"Tyrannosaurus"``. | |
| Returns | |
| ------- | |
| dict | |
| On success: ``{"extract": str, "image_url": str|None, "page_url": str}``. | |
| On failure: ``{"error": str}`` or ``{}``. | |
| """ | |
| import urllib.parse | |
| safe_title = urllib.parse.quote(taxon_name.strip(), safe="") | |
| url = f"{WIKIPEDIA_SUMMARY_URL}/{safe_title}" | |
| logger.info("Fetching Wikipedia summary for %r", taxon_name) | |
| try: | |
| resp = requests.get( | |
| url, | |
| timeout=REQUEST_TIMEOUT, | |
| headers={"User-Agent": "PaleoDataExplorer/1.0 (educational tool; https://github.com/anomalyco/opencode)"}, | |
| ) | |
| if resp.status_code == 404: | |
| logger.warning("Wikipedia page not found for %r", taxon_name) | |
| return {"error": f"No Wikipedia article found for '{taxon_name}'."} | |
| resp.raise_for_status() | |
| except requests.exceptions.Timeout: | |
| logger.error("Wikipedia request timed out for %r", taxon_name) | |
| return {"error": "Wikipedia request timed out."} | |
| except requests.exceptions.RequestException as exc: | |
| logger.error("Wikipedia request failed for %r: %s", taxon_name, exc) | |
| return {"error": f"Wikipedia request failed: {exc}"} | |
| try: | |
| data = resp.json() | |
| except ValueError: | |
| logger.error("Wikipedia response is not valid JSON for %r", taxon_name) | |
| return {"error": "Invalid response from Wikipedia."} | |
| extract = data.get("extract", "") | |
| thumbnail = data.get("thumbnail", {}) | |
| image_url = thumbnail.get("source") if isinstance(thumbnail, dict) else None | |
| page_url = data.get("content_urls", {}).get("desktop", {}).get("page", "") | |
| return { | |
| "extract": extract, | |
| "image_url": image_url, | |
| "page_url": page_url, | |
| } | |
| # --------------------------------------------------------------------------- | |
| # Macrostrat interval fetch (optional helper) | |
| # --------------------------------------------------------------------------- | |
| def fetch_macrostrat_intervals() -> List[Dict[str, Any]]: | |
| """Fetch the Macrostrat interval definitions (geological periods). | |
| Useful for mapping absolute Ma values to named periods. Returns an | |
| empty list on failure so callers can fall back gracefully. | |
| Returns | |
| ------- | |
| list[dict] | |
| Each dict contains keys such as ``name``, ``t_age``, ``b_age``, | |
| ``color``, etc. | |
| """ | |
| logger.info("Querying Macrostrat interval definitions") | |
| try: | |
| resp = _safe_get(MACROSTRAT_INTERVALS_URL, {"all": True, "format": "json"}) | |
| return resp.json() # type: ignore[no-any-return] | |
| except Exception: | |
| logger.exception("Failed to fetch Macrostrat intervals") | |
| return [] | |