"""Extract richer environment-of-origin features from raw BacDive JSON.
Reads:
data/bacdive/*.json — one file per BacDive ID
Writes:
data/isolation_metadata.parquet — one row per bacdive_id with cols:
- iso_country, iso_continent (categorical → caller can one-hot)
- iso_lat, iso_lon (float, NaN if missing)
- iso_collection_year (int from sampling/isolation date, NaN if missing)
- iso_host_species (string, NaN if missing)
- iso_sample_text (free-text description, for downstream NLP if needed)
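- iso_continent_<X> binary one-hots (one per continent value present in the data)
- iso_country_<X> top-30 country one-hots
- iso_host_kingdom (human / animal / plant / fungal / other / NaN — coarse keyword map)
- iso_host_kingdom_<X> binary one-hots (human, animal, plant, fungal, other)
- iso_geographic_location (free-text location string, NaN if missing)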
Wires into baseline by genome's bacdive_id (one strain per row, not per genome).
"""
from __future__ import annotations
import glob
import json
import re
from collections import Counter
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from microbe_model import config
# Finds a standalone four-digit token beginning with 19 or 20.
DATE_RE = re.compile(r"\b(19|20)\d{2}\b")

# Keyword -> coarse host kingdom. Insertion order is significant: consumers
# walk the keys in order and stop at the first keyword that matches.
HOST_KINGDOM_KEYWORDS = {
    **dict.fromkeys(
        ("human", "homo sapiens", "patient", "infant"),
        "human",
    ),
    **dict.fromkeys(
        (
            "mouse", "rat", "cow", "bovine",
            "pig", "swine", "chicken", "fish",
            "honey bee", "insect", "termite",
            "bird", "tick",
        ),
        "animal",
    ),
    **dict.fromkeys(
        (
            "plant", "rice", "wheat", "soybean",
            "tomato", "leaf", "root", "rhizosphere",
        ),
        "plant",
    ),
    **dict.fromkeys(
        ("fungus", "yeast", "mushroom"),
        "fungal",
    ),
}
def coerce_float(v) -> float | None:
    """Convert *v* to ``float``, or give back ``None`` when that is impossible.

    ``None`` passes straight through as ``None``. Any value that ``float()``
    rejects with ``TypeError`` or ``ValueError`` also yields ``None``.
    """
    try:
        return None if v is None else float(v)
    except (TypeError, ValueError):
        return None
def parse_year(s) -> int | None:
    """Pull a plausible calendar year out of a free-text date string.

    Returns ``None`` when *s* is empty or not a string, when ``DATE_RE`` finds
    nothing, or when the first match falls outside 1850-2100. Only the first
    match in the string is examined.
    """
    if not s or not isinstance(s, str):
        return None

    found = DATE_RE.search(s)
    if found is None:
        return None

    year = int(found.group(0))
    if year < 1850 or year > 2100:
        return None
    return year
def host_kingdom(host_str) -> str | None:
    """Map a free-text host or sample description to a coarse kingdom label.

    Keywords from ``HOST_KINGDOM_KEYWORDS`` are tried in dictionary order and
    the first one found decides the label. A keyword has to appear as a whole
    word, optionally pluralised with ``s``/``es``, so that short keys such as
    "rat", "cow" or "tick" do not fire inside unrelated words like
    "laboratory", "Moscow" or "stick".

    Args:
        host_str: Description text; anything that is not a non-empty string
            is treated as missing.

    Returns:
        ``None`` for missing input, the mapped label on a keyword hit, and
        ``"other"`` when the text contains no known keyword.
    """
    if not host_str or not isinstance(host_str, str):
        return None
    text = host_str.lower()
    for keyword, kingdom in HOST_KINGDOM_KEYWORDS.items():
        # The ``re`` module caches compiled patterns, so building the pattern
        # string on each call stays cheap for this small keyword set.
        if re.search(rf"\b{re.escape(keyword)}(?:e?s)?\b", text):
            return kingdom
    return "other"
def _str_or_none(value):
    """Return *value* unchanged when it is a string, otherwise ``None``."""
    return value if isinstance(value, str) else None


def extract_one(path: Path) -> dict | None:
    """Extract isolation metadata from a single BacDive JSON file.

    The BacDive ID is taken from the file stem (e.g. ``12345.json`` -> 12345).

    Args:
        path: Location of the JSON file.

    Returns:
        ``None`` when the stem is not an integer, the file cannot be read or
        parsed, or the top-level JSON value is not an object.
        ``{"bacdive_id": ...}`` alone when the isolation section is not a
        mapping. Otherwise a dict with ``bacdive_id`` and the ``iso_*``
        fields; any field that is absent or has an unexpected type is
        ``None``.
    """
    try:
        bid = int(path.stem)  # filename is e.g. "12345.json"
    except ValueError:
        return None

    try:
        # JSON is UTF-8 by specification; relying on the locale default would
        # make valid files fail to decode (and be dropped) on some platforms.
        with open(path, encoding="utf-8") as fh:
            d = json.load(fh)
    except (OSError, ValueError, RecursionError):
        # Best-effort: unreadable, undecodable or malformed files are skipped.
        return None
    if not isinstance(d, dict):
        # A record whose payload is not a JSON object has nothing to look up.
        return None

    iso_section = d.get("Isolation, sampling and environmental information", {})
    if not isinstance(iso_section, dict):
        return {"bacdive_id": bid}

    iso = iso_section.get("isolation", {})
    if isinstance(iso, list):
        # When several isolation entries exist only the first one is used.
        iso = iso[0] if iso else {}
    if not isinstance(iso, dict):
        iso = {}

    sample_text = _str_or_none(iso.get("sample type"))
    host_species = _str_or_none(iso.get("host species"))
    # Prefer the sampling date; fall back to the isolation date.
    year = parse_year(iso.get("sampling date")) or parse_year(iso.get("isolation date"))

    return {
        "bacdive_id": bid,
        "iso_country": _str_or_none(iso.get("country")),
        "iso_continent": _str_or_none(iso.get("continent")),
        "iso_lat": coerce_float(iso.get("latitude")),
        "iso_lon": coerce_float(iso.get("longitude")),
        "iso_collection_year": year,
        "iso_host_species": host_species,
        # The sample text is consulted only when the host species yields no
        # label at all; an unrecognised host already maps to "other", which
        # is truthy and therefore wins.
        "iso_host_kingdom": host_kingdom(host_species) or host_kingdom(sample_text),
        "iso_sample_text": sample_text,
        "iso_geographic_location": _str_or_none(iso.get("geographic location")),
    }
def _onehot_slug(label: str) -> str:
    """Lower-case *label* and collapse every run of non ``[a-z0-9]`` into ``_``."""
    return re.sub(r"[^a-z0-9]+", "_", label.lower()).strip("_")


def _add_label_onehots(df: pd.DataFrame, column: str, labels) -> None:
    """Add one ``<column>_<slug>`` 0/1 indicator to *df* per entry of *labels*."""
    created: set[str] = set()
    for label in labels:
        slug = _onehot_slug(label)
        if not slug:
            # Nothing usable is left for a column name (e.g. non-ASCII-only label).
            continue
        name = f"{column}_{slug}"
        hits = (df[column] == label).astype(int)
        if name in created:
            # Two raw labels produced the same slug: a row counts if it matches
            # either of them, rather than the later label overwriting the first.
            hits = df[name] | hits
        df[name] = hits
        created.add(name)


def add_categorical_onehots(df: pd.DataFrame, top_n_countries: int = 30) -> pd.DataFrame:
    """Append binary indicator columns for continent, country and host kingdom.

    *df* is modified in place and also returned.

    Args:
        df: Frame that may contain ``iso_continent``, ``iso_country`` and
            ``iso_host_kingdom``. A missing source column is treated as
            "no values": no continent/country indicators are added, and the
            host-kingdom indicators are all zero.
        top_n_countries: Number of most frequent countries that get their own
            indicator column; the long tail is left unencoded.

    Returns:
        The same ``df`` object with the indicator columns added.
    """
    # Continent indicators: one per distinct non-null value.
    if "iso_continent" in df.columns:
        _add_label_onehots(df, "iso_continent", df["iso_continent"].dropna().unique())

    # Country indicators: capped to the most frequent values to keep the
    # feature count manageable.
    if "iso_country" in df.columns:
        top_countries = df["iso_country"].value_counts().head(top_n_countries).index
        _add_label_onehots(df, "iso_country", top_countries)

    # Host-kingdom indicators: fixed label set, so the columns always exist.
    kingdoms = df["iso_host_kingdom"] if "iso_host_kingdom" in df.columns else None
    for k in ("human", "animal", "plant", "fungal", "other"):
        df[f"iso_host_kingdom_{k}"] = 0 if kingdoms is None else (kingdoms == k).astype(int)
    return df
def main() -> None:
    """Parse every BacDive JSON file and write the isolation-metadata parquet.

    Reads ``*.json`` from ``config.DATA / "bacdive"``, prints a coverage
    report for the main fields, appends the one-hot columns and writes
    ``config.DATA / "isolation_metadata.parquet"``.

    Raises:
        SystemExit: If the input directory is missing or no file in it could
            be parsed into a record.
    """
    bacdive_dir = config.DATA / "bacdive"
    if not bacdive_dir.exists():
        raise SystemExit(f"Missing {bacdive_dir}")

    # Sorted so that row order, and the one-hot column order derived from it,
    # does not depend on the order the filesystem happens to list files in.
    paths = sorted(bacdive_dir.glob("*.json"))
    print(f"Parsing {len(paths):,} BacDive JSON files...")

    rows: list[dict] = []
    for p in tqdm(paths, unit="file"):
        r = extract_one(p)
        if r:
            rows.append(r)
    if not rows:
        # An empty frame would otherwise fail with a division by zero in the
        # coverage report below.
        raise SystemExit(f"No parseable BacDive records found in {bacdive_dir}")

    df = pd.DataFrame(rows)
    print(f"Parsed {len(df):,} rows")

    report_cols = ["iso_country", "iso_continent", "iso_lat", "iso_lon",
                   "iso_collection_year", "iso_host_species", "iso_host_kingdom"]
    # Records without an isolation section carry only the ID; if every record
    # is like that the field columns are absent, so create them as all-missing
    # to keep the steps below (and the output schema) working.
    for col in report_cols:
        if col not in df.columns:
            df[col] = None

    # Coverage report on the high-value fields
    for col in report_cols:
        nn = df[col].notna().sum()
        print(f" {col:30s} {nn:>6,} populated ({100*nn/len(df):.1f}%)")

    df = add_categorical_onehots(df)
    out = config.DATA / "isolation_metadata.parquet"
    df.to_parquet(out, index=False)
    print(f"\nWrote {out}: {len(df):,} rows × {df.shape[1]} cols")

    # Sample
    print("\nMost common countries (sanity check):")
    print(df["iso_country"].value_counts().head(10))
# Script entry point: run the extraction only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|