Spaces:
Running
Running
| """BacDive REST API client (v2, public). | |
| The BacDive v2 API is fully open as of February 2026 — no registration, no auth. | |
| Documentation: https://api.bacdive.dsmz.de/ | |
| We discover strain IDs by scanning the integer ID space in semicolon-batched fetches | |
| of up to 100 IDs per call. Missing IDs are silently dropped server-side, so a blind | |
| scan over [1, MAX_ID] yields every existing record in one pass. At ~150K live IDs | |
| (as of 2026-04), this takes ~30 minutes single-threaded. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import time | |
| from collections.abc import Iterator | |
| from pathlib import Path | |
| from typing import Any | |
| import requests | |
| from microbe_model import config | |
| BASE_URL = "https://api.bacdive.dsmz.de/v2" | |
| BATCH_SIZE = 100 # max IDs per /fetch/ call (server limit) | |
| DEFAULT_MAX_ID = 200_000 # conservative upper bound; live max is ~160K-180K as of 2026-04 | |
| class BacDiveClient: | |
| def __init__(self, *, request_timeout: int = 60, retry_sleep_s: float = 1.0) -> None: | |
| self._session = requests.Session() | |
| self.timeout = request_timeout | |
| self.retry_sleep_s = retry_sleep_s | |
| def _get(self, path: str, params: dict | None = None) -> dict[str, Any]: | |
| url = f"{BASE_URL}{path}" | |
| for attempt in range(3): | |
| resp = self._session.get(url, params=params, timeout=self.timeout) | |
| if resp.status_code == 429: | |
| time.sleep(self.retry_sleep_s * (attempt + 1)) | |
| continue | |
| resp.raise_for_status() | |
| return resp.json() | |
| resp.raise_for_status() | |
| return {} | |
| def fetch_batch(self, ids: list[int]) -> dict[int, dict[str, Any]]: | |
| """Fetch up to BATCH_SIZE strain records in a single call. | |
| Returns a {bacdive_id: record} mapping. Missing IDs are absent from the result. | |
| """ | |
| if not ids: | |
| return {} | |
| if len(ids) > BATCH_SIZE: | |
| raise ValueError(f"Batch size {len(ids)} exceeds server limit {BATCH_SIZE}") | |
| path = f"/fetch/{';'.join(str(i) for i in ids)}" | |
| body = self._get(path) | |
| results = body.get("results") | |
| if isinstance(results, dict): | |
| return {int(k): v for k, v in results.items()} | |
| return {} | |
| def iter_records( | |
| self, | |
| *, | |
| start: int = 1, | |
| end: int = DEFAULT_MAX_ID, | |
| batch_size: int = BATCH_SIZE, | |
| ) -> Iterator[tuple[int, dict[str, Any]]]: | |
| """Scan the BacDive ID range and yield (id, record) for every existing strain.""" | |
| for batch_start in range(start, end + 1, batch_size): | |
| batch_end = min(batch_start + batch_size - 1, end) | |
| ids = list(range(batch_start, batch_end + 1)) | |
| records = self.fetch_batch(ids) | |
| yield from sorted(records.items()) | |
| def cache_path(bacdive_id: int) -> Path: | |
| return config.BACDIVE_DIR / f"{bacdive_id}.json" | |
| def cache_record(bacdive_id: int, record: dict[str, Any]) -> Path: | |
| path = cache_path(bacdive_id) | |
| path.write_text(json.dumps(record)) | |
| return path | |
| def load_cached(bacdive_id: int) -> dict[str, Any] | None: | |
| path = cache_path(bacdive_id) | |
| if not path.exists(): | |
| return None | |
| return json.loads(path.read_text()) | |
| def extract_phenotypes(record: dict[str, Any]) -> dict[str, Any]: | |
| """Pull the v0 prediction targets out of a BacDive v2 record. | |
| Field locations (verified against live API on 2026-04-26): | |
| - General → BacDive-ID | |
| - Name and taxonomic classification → species, genus, family | |
| - Culture and growth conditions → culture temp[] (type ∈ {growth, optimum, range, no growth}) | |
| - Culture and growth conditions → culture pH[] (same shape) | |
| - Physiology and metabolism → oxygen tolerance[] | |
| - Physiology and metabolism → halophily[] | |
| - Sequence information → Genome sequences[].INSDC accession | |
| - Isolation, sampling and environmental information → isolation source categories[].Cat{1,2,3} | |
| """ | |
| general = record.get("General") or {} | |
| taxon = record.get("Name and taxonomic classification") or {} | |
| culture = record.get("Culture and growth conditions") or {} | |
| physio = record.get("Physiology and metabolism") or {} | |
| seq = record.get("Sequence information") or {} | |
| iso = record.get("Isolation, sampling and environmental information") or {} | |
| iso_cats = _collect_isolation_categories(iso.get("isolation source categories")) | |
| out: dict[str, Any] = { | |
| "bacdive_id": general.get("BacDive-ID"), | |
| "species": taxon.get("species"), | |
| "genus": taxon.get("genus"), | |
| "family": (taxon.get("LPSN") or {}).get("family") or taxon.get("family"), | |
| "ncbi_taxon_id": _first_ncbi_tax_id(general.get("NCBI tax id")), | |
| "optimal_temperature_c": _derive_optimum(_as_list(culture.get("culture temp")), "temperature"), | |
| "optimal_ph": _derive_optimum(_as_list(culture.get("culture pH")), "pH"), | |
| "oxygen_requirement": _first_value(_as_list(physio.get("oxygen tolerance")), "oxygen tolerance"), | |
| "salt_tolerance_pct": _derive_salt(physio.get("halophily")), | |
| "genome_accession": _first_genome_accession(seq.get("Genome sequences")), | |
| "isolation_cat1": iso_cats["cat1"], | |
| "isolation_cat2": iso_cats["cat2"], | |
| "isolation_cat3": iso_cats["cat3"], | |
| } | |
| return out | |
| def _collect_isolation_categories(raw: Any) -> dict[str, str | None]: | |
| """Flatten BacDive's `isolation source categories` into 3 pipe-joined string fields. | |
| A strain commonly has multiple parallel category descriptions (e.g., #Host=Human AND | |
| #Host Body Product=Blood). We collect *all* unique values per level into a sorted, | |
| pipe-joined string so downstream code can split & one-hot. The leading '#' is stripped. | |
| """ | |
| cats: dict[str, set[str]] = {"Cat1": set(), "Cat2": set(), "Cat3": set()} | |
| for entry in _as_list(raw): | |
| if not isinstance(entry, dict): | |
| continue | |
| for level in cats: | |
| value = entry.get(level) | |
| if isinstance(value, str) and value: | |
| cats[level].add(value.lstrip("#").strip()) | |
| return { | |
| "cat1": "|".join(sorted(cats["Cat1"])) or None, | |
| "cat2": "|".join(sorted(cats["Cat2"])) or None, | |
| "cat3": "|".join(sorted(cats["Cat3"])) or None, | |
| } | |
| def _as_list(x: Any) -> list: | |
| if x is None: | |
| return [] | |
| if isinstance(x, list): | |
| return x | |
| return [x] | |
| def _to_float(x: Any) -> float | None: | |
| if x is None: | |
| return None | |
| s = str(x).strip() | |
| if not s: | |
| return None | |
| if "-" in s and not s.startswith("-"): | |
| # e.g. "5-30" — return midpoint | |
| parts = s.split("-") | |
| try: | |
| lo, hi = float(parts[0]), float(parts[1]) | |
| return (lo + hi) / 2 | |
| except (ValueError, IndexError): | |
| return None | |
| try: | |
| return float(s.split()[0]) | |
| except (ValueError, AttributeError): | |
| return None | |
| def _derive_optimum(entries: list, value_key: str) -> float | None: | |
| """Find an optimum for a temperature- or pH-like list of {type, value} entries. | |
| Preference order: | |
| 1. type == "optimum" (exact) | |
| 2. median of "positive growth" entries | |
| 3. None | |
| """ | |
| optima = [] | |
| growth = [] | |
| for entry in entries: | |
| if not isinstance(entry, dict): | |
| continue | |
| etype = (entry.get("type") or "").lower() | |
| value = _to_float(entry.get(value_key)) | |
| if value is None: | |
| continue | |
| is_positive = (entry.get("growth") or "").lower() in {"positive", "yes", "+", "true"} | |
| if "optim" in etype: | |
| optima.append(value) | |
| elif etype == "growth" and is_positive: | |
| growth.append(value) | |
| if optima: | |
| return sum(optima) / len(optima) | |
| if growth: | |
| sorted_g = sorted(growth) | |
| n = len(sorted_g) | |
| return sorted_g[n // 2] if n % 2 else (sorted_g[n // 2 - 1] + sorted_g[n // 2]) / 2 | |
| return None | |
| def _first_value(entries: list, key: str) -> str | None: | |
| for entry in entries: | |
| if isinstance(entry, dict) and entry.get(key): | |
| return str(entry[key]) | |
| return None | |
| def _derive_salt(halophily: Any) -> float | None: | |
| """Derive optimal NaCl concentration (% w/v) from BacDive halophily entries. | |
| Each entry has shape: | |
| {salt: 'NaCl', growth: 'positive'|'no', tested relation: 'optimum'|'growth', concentration: '3 %'} | |
| Preference order (mirrors _derive_optimum): | |
| 1. tested relation == 'optimum' AND growth == 'positive' | |
| 2. median of positive-growth concentrations (the strain's tolerated range) | |
| 3. None | |
| The previous implementation returned the first parsable value, which often picked | |
| the lowest tested concentration or a no-growth entry — overstating salt sensitivity. | |
| """ | |
| positive_tokens = {"positive", "yes", "+", "true"} | |
| optima: list[float] = [] | |
| growth: list[float] = [] | |
| for entry in _as_list(halophily): | |
| if not isinstance(entry, dict): | |
| continue | |
| if (entry.get("salt") or "NaCl") != "NaCl": | |
| continue | |
| is_positive = (entry.get("growth") or "").lower() in positive_tokens | |
| relation = (entry.get("tested relation") or "").lower() | |
| value = _to_float(entry.get("concentration") or entry.get("salt concentration")) | |
| if value is None: | |
| continue | |
| if "optim" in relation and is_positive: | |
| optima.append(value) | |
| elif is_positive: | |
| growth.append(value) | |
| if optima: | |
| return sum(optima) / len(optima) | |
| if growth: | |
| sorted_g = sorted(growth) | |
| n = len(sorted_g) | |
| return sorted_g[n // 2] if n % 2 else (sorted_g[n // 2 - 1] + sorted_g[n // 2]) / 2 | |
| return None | |
| def _first_genome_accession(genome_entries: Any) -> str | None: | |
| for entry in _as_list(genome_entries): | |
| if isinstance(entry, dict): | |
| for key in ("INSDC accession", "NCBI accession", "accession"): | |
| value = entry.get(key) | |
| if value: | |
| return str(value) | |
| return None | |
| def _first_ncbi_tax_id(tax: Any) -> int | None: | |
| for entry in _as_list(tax): | |
| if isinstance(entry, dict): | |
| value = entry.get("NCBI tax id") | |
| if value is not None: | |
| try: | |
| return int(value) | |
| except (ValueError, TypeError): | |
| continue | |
| return None | |