"""Extract richer environment-of-origin features from raw BacDive JSON. Reads: data/bacdive/*.json — one file per BacDive ID Writes: data/isolation_metadata.parquet — one row per bacdive_id with cols: - iso_country, iso_continent (categorical → caller can one-hot) - iso_lat, iso_lon (float, NaN if missing) - iso_collection_year (int from sampling/isolation date, NaN if missing) - iso_host_species (string, NaN if missing) - iso_sample_text (free-text description, for downstream NLP if needed) - iso_continent_ binary one-hots (8 continents) - iso_country_ top-30 country one-hots - iso_host_kingdom (animal / plant / human / fungal / NaN — coarse map) Wires into baseline by genome's bacdive_id (one strain per row, not per genome). """ from __future__ import annotations import glob import json import re from collections import Counter from pathlib import Path import pandas as pd from tqdm import tqdm from microbe_model import config DATE_RE = re.compile(r"\b(19|20)\d{2}\b") HOST_KINGDOM_KEYWORDS = { "human": "human", "homo sapiens": "human", "patient": "human", "infant": "human", "mouse": "animal", "rat": "animal", "cow": "animal", "bovine": "animal", "pig": "animal", "swine": "animal", "chicken": "animal", "fish": "animal", "honey bee": "animal", "insect": "animal", "termite": "animal", "bird": "animal", "tick": "animal", "plant": "plant", "rice": "plant", "wheat": "plant", "soybean": "plant", "tomato": "plant", "leaf": "plant", "root": "plant", "rhizosphere": "plant", "fungus": "fungal", "yeast": "fungal", "mushroom": "fungal", } def coerce_float(v) -> float | None: if v is None: return None try: return float(v) except (TypeError, ValueError): return None def parse_year(s) -> int | None: if not s or not isinstance(s, str): return None m = DATE_RE.search(s) if not m: return None y = int(m.group(0)) return y if 1850 <= y <= 2100 else None def host_kingdom(host_str) -> str | None: if not host_str or not isinstance(host_str, str): return None s = host_str.lower() for k, v in HOST_KINGDOM_KEYWORDS.items(): if k in s: return v return "other" def extract_one(path: Path) -> dict | None: try: with open(path) as fh: d = json.load(fh) except Exception: return None bid_str = path.stem # filename is e.g. "12345.json" try: bid = int(bid_str) except ValueError: return None iso_section = d.get("Isolation, sampling and environmental information", {}) if not isinstance(iso_section, dict): return {"bacdive_id": bid} iso = iso_section.get("isolation", {}) if isinstance(iso, list): iso = iso[0] if iso else {} if not isinstance(iso, dict): iso = {} sample_type = iso.get("sample type") sample_text = sample_type if isinstance(sample_type, str) else None year = parse_year(iso.get("sampling date")) or parse_year(iso.get("isolation date")) host_species = iso.get("host species") if isinstance(iso.get("host species"), str) else None return { "bacdive_id": bid, "iso_country": iso.get("country") if isinstance(iso.get("country"), str) else None, "iso_continent": iso.get("continent") if isinstance(iso.get("continent"), str) else None, "iso_lat": coerce_float(iso.get("latitude")), "iso_lon": coerce_float(iso.get("longitude")), "iso_collection_year": year, "iso_host_species": host_species, "iso_host_kingdom": host_kingdom(host_species) or host_kingdom(sample_text), "iso_sample_text": sample_text, "iso_geographic_location": iso.get("geographic location") if isinstance(iso.get("geographic location"), str) else None, } def add_categorical_onehots(df: pd.DataFrame, top_n_countries: int = 30) -> pd.DataFrame: # continent one-hots (small, fixed set) continents = [c for c in df["iso_continent"].dropna().unique()] for c in continents: slug = re.sub(r"[^a-z0-9]+", "_", c.lower()).strip("_") if slug: df[f"iso_continent_{slug}"] = (df["iso_continent"] == c).astype(int) # top-N country one-hots (long tail; cap to keep feature count manageable) top_countries = df["iso_country"].value_counts().head(top_n_countries).index.tolist() for c in top_countries: slug = re.sub(r"[^a-z0-9]+", "_", c.lower()).strip("_") if slug: df[f"iso_country_{slug}"] = (df["iso_country"] == c).astype(int) # host-kingdom one-hots (very small fixed set) for k in ("human", "animal", "plant", "fungal", "other"): df[f"iso_host_kingdom_{k}"] = (df["iso_host_kingdom"] == k).astype(int) return df def main() -> None: bacdive_dir = config.DATA / "bacdive" if not bacdive_dir.exists(): raise SystemExit(f"Missing {bacdive_dir}") paths = list(bacdive_dir.glob("*.json")) print(f"Parsing {len(paths):,} BacDive JSON files...") rows: list[dict] = [] for p in tqdm(paths, unit="file"): r = extract_one(p) if r: rows.append(r) df = pd.DataFrame(rows) print(f"Parsed {len(df):,} rows") # Coverage report on the high-value fields for col in ["iso_country", "iso_continent", "iso_lat", "iso_lon", "iso_collection_year", "iso_host_species", "iso_host_kingdom"]: nn = df[col].notna().sum() if col in df.columns else 0 print(f" {col:30s} {nn:>6,} populated ({100*nn/len(df):.1f}%)") df = add_categorical_onehots(df) out = config.DATA / "isolation_metadata.parquet" df.to_parquet(out, index=False) print(f"\nWrote {out}: {len(df):,} rows × {df.shape[1]} cols") # Sample print("\nMost common countries (sanity check):") print(df["iso_country"].value_counts().head(10)) if __name__ == "__main__": main()