PaleoData-Explorer / api_client.py
benjamintia's picture
Upload folder using huggingface_hub
81f7f5d verified
Raw
History Blame Contribute Delete
8.21 kB
"""
PaleoData Explorer — PBDB & Macrostrat API Client
==================================================
Provides robust, cached-friendly wrappers around the Paleobiology Database
(PBDB) occurrence endpoint and (optionally) the Macrostrat interval
endpoint. Every function handles timeouts, HTTP errors, and empty
responses gracefully.
Domain notes
------------
* Geological time is in "Ma" (Mega-annum, millions of years ago).
* The "show" parameter must include `paleoloc` (paleocoordinates),
`phylo` (phylogeny / taxonomy) and `time,ident` so the returned JSON
carries `paleolat`, `paleolng`, taxonomic hierarchies and temporal
bounds (`max_ma`, `min_ma`).
* The PBDB API returns `records` inside a top-level key; we safely
unwrap that in `fetch_occurrences`.
"""
import logging
from typing import Any, Dict, List, Optional
import requests
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
PBDB_OCCURRENCE_URL: str = "https://paleobiodb.org/data1.2/occs/list.json"
MACROSTRAT_INTERVALS_URL: str = "https://macrostrat.org/api/v2/defs/intervals"
DEFAULT_LIMIT: int = 1000
DEFAULT_SHOW: str = "paleoloc,phylo,time,ident"
REQUEST_TIMEOUT: int = 30 # seconds
# ---------------------------------------------------------------------------
# Public helpers
# ---------------------------------------------------------------------------
def _safe_get(url: str, params: Dict[str, Any]) -> requests.Response:
"""Perform a GET request with standardised error handling.
Raises
------
requests.exceptions.Timeout
When the request hangs past *REQUEST_TIMEOUT*.
requests.exceptions.HTTPError
On 4xx / 5xx responses.
ValueError
When the response body is not valid JSON.
"""
logger.debug("GET %s | params=%s", url, params)
resp = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
resp.raise_for_status()
return resp
# ---------------------------------------------------------------------------
# PBDB Occurrence fetch
# ---------------------------------------------------------------------------
def fetch_occurrences(
base_name: str,
*,
max_ma: Optional[float] = None,
min_ma: Optional[float] = None,
limit: int = DEFAULT_LIMIT,
show: str = DEFAULT_SHOW,
) -> List[Dict[str, Any]]:
"""Fetch fossil occurrence records from the PBDB API.
Parameters
----------
base_name : str
Taxonomic clade or genus name, e.g. ``"Ceratopsidae"`` or
``"Tyrannosaurus"``. PBDB resolves this hierarchically so all
subordinate taxa are included automatically.
max_ma, min_ma : float or None
Optional temporal window in Ma. When supplied the API filters
occurrences to those whose age range overlaps this window.
limit : int
Maximum number of records to return (default 1 000).
show : str
Comma-separated list of PBDB "show" fields. Must include at
least ``paleoloc,phylo,time,ident`` for the downstream pipeline.
Returns
-------
list[dict]
List of raw occurrence records. Returns an empty list when the
API returns no records or the response is malformed.
Raises
------
requests.exceptions.Timeout
If the PBDB API does not respond within *REQUEST_TIMEOUT*.
requests.exceptions.HTTPError
If the API returns a non-200 status.
ValueError
If the response body cannot be parsed as JSON.
"""
params: Dict[str, Any] = {
"base_name": base_name,
"show": show,
"limit": limit,
}
if max_ma is not None:
params["max_ma"] = max_ma
if min_ma is not None:
params["min_ma"] = min_ma
logger.info("Querying PBDB with base_name=%r", base_name)
try:
resp = _safe_get(PBDB_OCCURRENCE_URL, params)
except requests.exceptions.Timeout:
logger.error("PBDB request timed out after %d s", REQUEST_TIMEOUT)
raise
except requests.exceptions.HTTPError as exc:
logger.error("PBDB request failed (HTTP %s)", exc.response.status_code if exc.response is not None else "unknown")
raise
except requests.exceptions.RequestException as exc:
logger.error("PBDB request failed: %s", exc)
raise
try:
data = resp.json()
except ValueError:
logger.error("PBDB response body is not valid JSON")
raise
records: List[Dict[str, Any]] = data.get("records", [])
if not records:
logger.warning("PBDB returned zero records for base_name=%r", base_name)
logger.info("PBDB returned %d records", len(records))
return records
# ---------------------------------------------------------------------------
# Wikipedia profile fetch (optional helper)
# ---------------------------------------------------------------------------
WIKIPEDIA_SUMMARY_URL: str = "https://en.wikipedia.org/api/rest_v1/page/summary"
def fetch_wikipedia_profile(taxon_name: str) -> Dict[str, Any]:
"""Fetch a short summary and thumbnail for a taxon from Wikipedia.
Uses the Wikimedia REST API ``/page/summary/{title}`` endpoint.
Failures are caught silently and an empty dict (or dict with an
``"error"`` key) is returned so callers never crash.
Parameters
----------
taxon_name : str
The Wikipedia article title, e.g. ``"Triceratops"`` or
``"Tyrannosaurus"``.
Returns
-------
dict
On success: ``{"extract": str, "image_url": str|None, "page_url": str}``.
On failure: ``{"error": str}`` or ``{}``.
"""
import urllib.parse
safe_title = urllib.parse.quote(taxon_name.strip(), safe="")
url = f"{WIKIPEDIA_SUMMARY_URL}/{safe_title}"
logger.info("Fetching Wikipedia summary for %r", taxon_name)
try:
resp = requests.get(
url,
timeout=REQUEST_TIMEOUT,
headers={"User-Agent": "PaleoDataExplorer/1.0 (educational tool; https://github.com/anomalyco/opencode)"},
)
if resp.status_code == 404:
logger.warning("Wikipedia page not found for %r", taxon_name)
return {"error": f"No Wikipedia article found for '{taxon_name}'."}
resp.raise_for_status()
except requests.exceptions.Timeout:
logger.error("Wikipedia request timed out for %r", taxon_name)
return {"error": "Wikipedia request timed out."}
except requests.exceptions.RequestException as exc:
logger.error("Wikipedia request failed for %r: %s", taxon_name, exc)
return {"error": f"Wikipedia request failed: {exc}"}
try:
data = resp.json()
except ValueError:
logger.error("Wikipedia response is not valid JSON for %r", taxon_name)
return {"error": "Invalid response from Wikipedia."}
extract = data.get("extract", "")
thumbnail = data.get("thumbnail", {})
image_url = thumbnail.get("source") if isinstance(thumbnail, dict) else None
page_url = data.get("content_urls", {}).get("desktop", {}).get("page", "")
return {
"extract": extract,
"image_url": image_url,
"page_url": page_url,
}
# ---------------------------------------------------------------------------
# Macrostrat interval fetch (optional helper)
# ---------------------------------------------------------------------------
def fetch_macrostrat_intervals() -> List[Dict[str, Any]]:
"""Fetch the Macrostrat interval definitions (geological periods).
Useful for mapping absolute Ma values to named periods. Returns an
empty list on failure so callers can fall back gracefully.
Returns
-------
list[dict]
Each dict contains keys such as ``name``, ``t_age``, ``b_age``,
``color``, etc.
"""
logger.info("Querying Macrostrat interval definitions")
try:
resp = _safe_get(MACROSTRAT_INTERVALS_URL, {"all": True, "format": "json"})
return resp.json() # type: ignore[no-any-return]
except Exception:
logger.exception("Failed to fetch Macrostrat intervals")
return []