Spaces:

benjamintia
/

PaleoData-Explorer

Sleeping

App Files Files Community

PaleoData-Explorer / api_client.py

benjamintia

Upload folder using huggingface_hub

81f7f5d verified 25 days ago

Raw

History Blame Contribute Delete

8.21 kB

	"""
	PaleoData Explorer — PBDB & Macrostrat API Client
	==================================================
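	Provides robust, cache-friendly wrappers around the Paleobiology Database
	(PBDB) occurrence endpoint, the Wikipedia page-summary endpoint and
	(optionally) the Macrostrat interval endpoint. Every function handles
	timeouts, HTTP errors, and empty responses explicitly: PBDB failures are
	logged and re-raised, while the Wikipedia and Macrostrat helpers return an
	error/empty result instead of raising.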

	Domain notes
	------------
	* Geological time is in "Ma" (Mega-annum, millions of years ago).
	* The "show" parameter must include `paleoloc` (paleocoordinates),
	`phylo` (phylogeny / taxonomy) and `time,ident` so the returned JSON
	carries `paleolat`, `paleolng`, taxonomic hierarchies and temporal
	bounds (`max_ma`, `min_ma`).
	* The PBDB API returns `records` inside a top-level key; we safely
	unwrap that in `fetch_occurrences`.
	"""

	import logging
	from typing import Any, Dict, List, Optional

	import requests

	logger = logging.getLogger(__name__)

	# ---------------------------------------------------------------------------
	# Constants
	# ---------------------------------------------------------------------------

	# Remote endpoints queried by this module.
	MACROSTRAT_INTERVALS_URL: str = "https://macrostrat.org/api/v2/defs/intervals"
	PBDB_OCCURRENCE_URL: str = "https://paleobiodb.org/data1.2/occs/list.json"

	# Time budget, in seconds, passed as ``timeout`` to every outgoing request.
	REQUEST_TIMEOUT: int = 30

	# Defaults for PBDB occurrence queries: the "show" field blocks requested
	# and the cap on the number of records returned.
	DEFAULT_SHOW: str = "paleoloc,phylo,time,ident"
	DEFAULT_LIMIT: int = 1_000


	# ---------------------------------------------------------------------------
	# Internal helpers
	# ---------------------------------------------------------------------------

	def _safe_get(url: str, params: Dict[str, Any]) -> requests.Response:
	    """Perform a GET request and fail loudly on transport or HTTP errors.

	    The request is bounded by ``REQUEST_TIMEOUT`` seconds. The response body
	    is *not* parsed here; callers decode it with ``resp.json()`` and handle
	    the resulting ``ValueError`` themselves.

	    Parameters
	    ----------
	    url : str
	        Endpoint to query.
	    params : dict
	        Query-string parameters, passed through to ``requests.get``.

	    Returns
	    -------
	    requests.Response
	        The response, after ``raise_for_status()`` has accepted its status.

	    Raises
	    ------
	    requests.exceptions.Timeout
	        When the request hangs past REQUEST_TIMEOUT.
	    requests.exceptions.HTTPError
	        On 4xx / 5xx responses.
	    requests.exceptions.RequestException
	        On any other failure raised by ``requests.get``.
	    """
	    # A plain "|" separator: the previous "\|" was an invalid escape sequence
	    # that also leaked a literal backslash into the log output.
	    logger.debug("GET %s | params=%s", url, params)
	    resp = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
	    resp.raise_for_status()
	    return resp


	# ---------------------------------------------------------------------------
	# PBDB Occurrence fetch
	# ---------------------------------------------------------------------------

	def fetch_occurrences(
	    base_name: str,
	    *,
	    max_ma: Optional[float] = None,
	    min_ma: Optional[float] = None,
	    limit: int = DEFAULT_LIMIT,
	    show: str = DEFAULT_SHOW,
	) -> List[Dict[str, Any]]:
	    """Fetch fossil occurrence records from the PBDB API.

	    Parameters
	    ----------
	    base_name : str
	        Taxonomic clade or genus name, e.g. ``"Ceratopsidae"`` or
	        ``"Tyrannosaurus"``. PBDB resolves this hierarchically so all
	        subordinate taxa are included automatically.
	    max_ma, min_ma : float or None
	        Optional temporal window in Ma. Each bound is only sent to the API
	        when it is not ``None``.
	    limit : int
	        Maximum number of records to return (default 1 000).
	    show : str
	        Comma-separated list of PBDB "show" fields. Must include at
	        least ``paleoloc,phylo,time,ident`` for the downstream pipeline.

	    Returns
	    -------
	    list[dict]
	        List of raw occurrence records. Returns an empty list when the
	        API returns no records or the decoded JSON does not have the
	        expected ``{"records": [...]}`` shape.

	    Raises
	    ------
	    requests.exceptions.Timeout
	        If the PBDB API does not respond within REQUEST_TIMEOUT.
	    requests.exceptions.HTTPError
	        If the API returns a 4xx / 5xx status.
	    requests.exceptions.RequestException
	        On any other transport-level failure.
	    ValueError
	        If the response body cannot be parsed as JSON.
	    """
	    params: Dict[str, Any] = {
	        "base_name": base_name,
	        "show": show,
	        "limit": limit,
	    }
	    if max_ma is not None:
	        params["max_ma"] = max_ma
	    if min_ma is not None:
	        params["min_ma"] = min_ma

	    logger.info("Querying PBDB with base_name=%r", base_name)

	    # Transport and HTTP failures are logged here for context, then
	    # propagated unchanged so callers can decide how to surface them.
	    try:
	        resp = _safe_get(PBDB_OCCURRENCE_URL, params)
	    except requests.exceptions.Timeout:
	        logger.error("PBDB request timed out after %d s", REQUEST_TIMEOUT)
	        raise
	    except requests.exceptions.HTTPError as exc:
	        status = exc.response.status_code if exc.response is not None else "unknown"
	        logger.error("PBDB request failed (HTTP %s)", status)
	        raise
	    except requests.exceptions.RequestException as exc:
	        logger.error("PBDB request failed: %s", exc)
	        raise

	    try:
	        data = resp.json()
	    except ValueError:
	        logger.error("PBDB response body is not valid JSON")
	        raise

	    # Valid JSON can still have the wrong shape (a top-level list or scalar,
	    # or ``"records": null``). Honour the documented contract and return an
	    # empty list instead of failing with AttributeError / TypeError.
	    records = data.get("records", []) if isinstance(data, dict) else None
	    if not isinstance(records, list):
	        logger.warning(
	            "PBDB response has no usable 'records' list for base_name=%r",
	            base_name,
	        )
	        return []

	    if not records:
	        logger.warning("PBDB returned zero records for base_name=%r", base_name)

	    logger.info("PBDB returned %d records", len(records))
	    return records


	# ---------------------------------------------------------------------------
	# Wikipedia profile fetch (optional helper)
	# ---------------------------------------------------------------------------

	# Base URL of the Wikimedia REST "page summary" endpoint; the URL-encoded
	# article title is appended as the final path segment.
	WIKIPEDIA_SUMMARY_URL: str = "https://en.wikipedia.org/api/rest_v1/page/summary"


	def fetch_wikipedia_profile(taxon_name: str) -> Dict[str, Any]:
	    """Fetch a short summary and thumbnail for a taxon from Wikipedia.

	    Uses the Wikimedia REST API ``/page/summary/{title}`` endpoint.
	    Request, HTTP, and decoding failures are logged and reported through
	    a dict with an ``"error"`` key rather than raised, so callers never
	    crash.

	    Parameters
	    ----------
	    taxon_name : str
	        The Wikipedia article title, e.g. ``"Triceratops"`` or
	        ``"Tyrannosaurus"``. Surrounding whitespace is stripped and the
	        title is percent-encoded before being placed in the URL.

	    Returns
	    -------
	    dict
	        On success: ``{"extract": str, "image_url": str | None, "page_url": str}``.
	        On failure: ``{"error": str}``.
	    """
	    import urllib.parse

	    # safe="" also encodes "/" so the title stays a single path segment.
	    safe_title = urllib.parse.quote(taxon_name.strip(), safe="")
	    url = f"{WIKIPEDIA_SUMMARY_URL}/{safe_title}"

	    logger.info("Fetching Wikipedia summary for %r", taxon_name)

	    try:
	        resp = requests.get(
	            url,
	            timeout=REQUEST_TIMEOUT,
	            headers={"User-Agent": "PaleoDataExplorer/1.0 (educational tool; https://github.com/anomalyco/opencode)"},
	        )
	        # A missing article is an expected outcome, so it gets its own
	        # message instead of going through raise_for_status().
	        if resp.status_code == 404:
	            logger.warning("Wikipedia page not found for %r", taxon_name)
	            return {"error": f"No Wikipedia article found for '{taxon_name}'."}
	        resp.raise_for_status()
	    except requests.exceptions.Timeout:
	        logger.error("Wikipedia request timed out for %r", taxon_name)
	        return {"error": "Wikipedia request timed out."}
	    except requests.exceptions.RequestException as exc:
	        logger.error("Wikipedia request failed for %r: %s", taxon_name, exc)
	        return {"error": f"Wikipedia request failed: {exc}"}

	    try:
	        data = resp.json()
	    except ValueError:
	        logger.error("Wikipedia response is not valid JSON for %r", taxon_name)
	        return {"error": "Invalid response from Wikipedia."}

	    # Valid JSON that is not an object cannot carry the fields read below.
	    if not isinstance(data, dict):
	        logger.error("Wikipedia response has an unexpected shape for %r", taxon_name)
	        return {"error": "Invalid response from Wikipedia."}

	    # Every nested lookup is type-checked: a key that is present but null
	    # (or otherwise not an object) must not raise AttributeError.
	    thumbnail = data.get("thumbnail")
	    image_url = thumbnail.get("source") if isinstance(thumbnail, dict) else None

	    content_urls = data.get("content_urls")
	    desktop = content_urls.get("desktop") if isinstance(content_urls, dict) else None
	    page_url = desktop.get("page") if isinstance(desktop, dict) else None

	    return {
	        # Fall back to "" so both fields are always strings, as documented.
	        "extract": data.get("extract") or "",
	        "image_url": image_url,
	        "page_url": page_url or "",
	    }


	# ---------------------------------------------------------------------------
	# Macrostrat interval fetch (optional helper)
	# ---------------------------------------------------------------------------

	def fetch_macrostrat_intervals() -> List[Dict[str, Any]]:
	    """Fetch the Macrostrat interval definitions (geological periods).

	    Useful for mapping absolute Ma values to named periods. Returns an
	    empty list on failure so callers can fall back gracefully.

	    Returns
	    -------
	    list[dict]
	        The interval records. Each dict contains keys such as ``name``,
	        ``t_age``, ``b_age``, ``color``, etc. An empty list is returned
	        when the request fails, the body is not valid JSON, or the decoded
	        payload does not contain a list of records.
	    """
	    logger.info("Querying Macrostrat interval definitions")
	    try:
	        resp = _safe_get(MACROSTRAT_INTERVALS_URL, {"all": True, "format": "json"})
	        payload = resp.json()
	    except Exception:
	        # Deliberate best-effort boundary: this helper is optional, so any
	        # failure is logged with its traceback and turned into "no data".
	        logger.exception("Failed to fetch Macrostrat intervals")
	        return []

	    # Macrostrat v2 JSON responses are wrapped in an envelope of the form
	    # {"success": {"data": [...]}}; unwrap it so callers receive the list
	    # the signature promises. A bare list is passed through unchanged.
	    if isinstance(payload, dict):
	        envelope = payload.get("success")
	        payload = envelope.get("data") if isinstance(envelope, dict) else None

	    if not isinstance(payload, list):
	        logger.error("Macrostrat response has an unexpected shape")
	        return []
	    return payload