File size: 5,987 Bytes
0ed74db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
"""Extract richer environment-of-origin features from raw BacDive JSON.

Reads:
  data/bacdive/*.json  — one file per BacDive ID

Writes:
  data/isolation_metadata.parquet — one row per bacdive_id with cols:
    - iso_country, iso_continent (categorical → caller can one-hot)
    - iso_lat, iso_lon (float, NaN if missing)
    - iso_collection_year (int from sampling/isolation date, NaN if missing)
    - iso_host_species (string, NaN if missing)
    - iso_sample_text (free-text description, for downstream NLP if needed)
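    - iso_continent_<X> binary one-hots (one per continent value present in the data)
    - iso_country_<X> top-30 country one-hots
    - iso_host_kingdom (human / animal / plant / fungal / other; NaN if there is
      neither host nor sample text — coarse keyword map)
    - iso_host_kingdom_<X> binary one-hots (human / animal / plant / fungal / other)
    - iso_geographic_location (free-text locality, NaN if missing)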

Wires into baseline by genome's bacdive_id (one strain per row, not per genome).
"""
from __future__ import annotations

import glob
import json
import re
from collections import Counter
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from microbe_model import config

# Matches a standalone four-digit number beginning with 19 or 20.
DATE_RE = re.compile(r"\b(19|20)\d{2}\b")

# Keywords grouped by the coarse kingdom they indicate.  Group order and
# keyword order are kept as-is because lookups walk the resulting dict in
# insertion order and stop at the first keyword that matches.
_HOST_KEYWORDS_BY_KINGDOM = (
    ("human", ("human", "homo sapiens", "patient", "infant")),
    (
        "animal",
        (
            "mouse", "rat", "cow", "bovine",
            "pig", "swine", "chicken", "fish",
            "honey bee", "insect", "termite",
            "bird", "tick",
        ),
    ),
    (
        "plant",
        (
            "plant", "rice", "wheat", "soybean",
            "tomato", "leaf", "root", "rhizosphere",
        ),
    ),
    ("fungal", ("fungus", "yeast", "mushroom")),
)

# Flat keyword -> kingdom lookup table.
HOST_KINGDOM_KEYWORDS = {
    keyword: kingdom
    for kingdom, keywords in _HOST_KEYWORDS_BY_KINGDOM
    for keyword in keywords
}


def coerce_float(v) -> float | None:
    if v is None:
        return None
    try:
        return float(v)
    except (TypeError, ValueError):
        return None


def parse_year(s) -> int | None:
    """Pull a calendar year out of a date-like string.

    The first ``DATE_RE`` match in *s* is converted to ``int`` and returned
    if it lies in the inclusive range 1850-2100.  Falsy or non-string input,
    text without a match, and out-of-range values all give ``None``.
    """
    if s and isinstance(s, str):
        hit = DATE_RE.search(s)
        if hit is not None:
            year = int(hit.group(0))
            if 1850 <= year <= 2100:
                return year
    return None


def host_kingdom(host_str) -> str | None:
    """Map a free-text host / sample description to a coarse host kingdom.

    Keywords from ``HOST_KINGDOM_KEYWORDS`` are tried in insertion order and
    the first one found decides the result.  A keyword must appear as a whole
    word (optionally followed by a plural ``s`` / ``es``) rather than as a raw
    substring, so "laboratory" or "nitrate" do not count as "rat", "pigment"
    does not count as "pig", and "cowpea" does not count as "cow".

    Args:
        host_str: Host-species or sample-type text.  Anything that is falsy
            or not a ``str`` is treated as missing.

    Returns:
        The kingdom label of the first matching keyword, ``"other"`` when the
        text is non-empty but contains no known keyword, or ``None`` when
        there is no usable text.
    """
    if not host_str or not isinstance(host_str, str):
        return None
    text = host_str.lower()
    for keyword, kingdom in HOST_KINGDOM_KEYWORDS.items():
        # The `re` module caches recently compiled patterns, so building the
        # pattern string here stays cheap for this small keyword table.
        if re.search(rf"\b{re.escape(keyword)}(?:e?s)?\b", text):
            return kingdom
    return "other"


def _str_field(record: dict, key: str) -> str | None:
    """Return ``record[key]`` when it is a string, otherwise ``None``."""
    value = record.get(key)
    return value if isinstance(value, str) else None


def extract_one(path: Path) -> dict | None:
    """Parse one BacDive JSON file into a flat isolation-metadata row.

    Args:
        path: Path to a file named ``<bacdive_id>.json``.

    Returns:
        A dict with ``bacdive_id`` plus the ``iso_*`` fields, or ``None``
        when the filename stem is not an integer or the file cannot be read
        and parsed as a JSON object.  Every returned row carries the same set
        of keys; fields that are absent (including the case where the whole
        isolation section is missing or malformed) are ``None``.
    """
    # The ID comes from the filename (e.g. "12345.json"); check it first so
    # files with non-numeric names are skipped without being parsed.
    try:
        bid = int(path.stem)
    except ValueError:
        return None

    try:
        # Binary mode lets json detect the encoding itself instead of relying
        # on the platform's locale default.
        with open(path, "rb") as fh:
            d = json.load(fh)
    except (OSError, ValueError, RecursionError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return None
    if not isinstance(d, dict):
        # A top-level list/scalar has no sections to look up.
        return None

    iso_section = d.get("Isolation, sampling and environmental information", {})
    iso = iso_section.get("isolation", {}) if isinstance(iso_section, dict) else {}
    if isinstance(iso, list):
        # Several isolation entries may be listed; only the first is used.
        iso = iso[0] if iso else {}
    if not isinstance(iso, dict):
        iso = {}

    sample_text = _str_field(iso, "sample type")
    host_species = _str_field(iso, "host species")
    # Prefer the sampling date; fall back to the isolation date.
    year = parse_year(iso.get("sampling date")) or parse_year(iso.get("isolation date"))

    return {
        "bacdive_id": bid,
        "iso_country": _str_field(iso, "country"),
        "iso_continent": _str_field(iso, "continent"),
        "iso_lat": coerce_float(iso.get("latitude")),
        "iso_lon": coerce_float(iso.get("longitude")),
        "iso_collection_year": year,
        "iso_host_species": host_species,
        # Host text takes priority; sample text is consulted only when the
        # host lookup yields nothing.
        "iso_host_kingdom": host_kingdom(host_species) or host_kingdom(sample_text),
        "iso_sample_text": sample_text,
        "iso_geographic_location": _str_field(iso, "geographic location"),
    }


def add_categorical_onehots(df: pd.DataFrame, top_n_countries: int = 30) -> pd.DataFrame:
    """Append 0/1 indicator columns for continent, country and host kingdom.

    The frame is modified in place and also returned.

    Args:
        df: Frame that may contain ``iso_continent``, ``iso_country`` and
            ``iso_host_kingdom`` columns.  A missing continent or country
            column simply produces no indicators for it; a missing
            host-kingdom column produces all-zero indicators so the
            fixed set of host-kingdom columns is always present.
        top_n_countries: Number of most frequent countries that get their
            own indicator column.

    Returns:
        The same ``df`` object with ``iso_continent_<slug>``,
        ``iso_country_<slug>`` and ``iso_host_kingdom_<kingdom>`` columns
        added.
    """

    def add_indicators(source_col: str, values) -> None:
        """Add one ``<source_col>_<slug>`` indicator per entry of *values*."""
        created: set[str] = set()
        for value in values:
            slug = re.sub(r"[^a-z0-9]+", "_", str(value).lower()).strip("_")
            if not slug:
                continue
            name = f"{source_col}_{slug}"
            flags = (df[source_col] == value).astype(int)
            if name in created:
                # Two raw values reduced to the same slug: merge them rather
                # than letting the later one overwrite the earlier one.
                flags = df[name] | flags
            df[name] = flags
            created.add(name)

    # Continent indicators: one per distinct non-null value in the data.
    if "iso_continent" in df.columns:
        add_indicators("iso_continent", df["iso_continent"].dropna().unique().tolist())

    # Country indicators: long tail, so only the most frequent ones.
    if "iso_country" in df.columns:
        top_countries = df["iso_country"].value_counts().head(top_n_countries).index.tolist()
        add_indicators("iso_country", top_countries)

    # Host-kingdom indicators: small fixed label set, always emitted.
    has_kingdom = "iso_host_kingdom" in df.columns
    for k in ("human", "animal", "plant", "fungal", "other"):
        if has_kingdom:
            df[f"iso_host_kingdom_{k}"] = (df["iso_host_kingdom"] == k).astype(int)
        else:
            df[f"iso_host_kingdom_{k}"] = 0

    return df


def main() -> None:
    """Build ``isolation_metadata.parquet`` from the raw BacDive JSON dump.

    Reads every ``*.json`` file under ``config.DATA / "bacdive"``, extracts
    one row per file, prints a coverage report, adds the one-hot columns and
    writes the result to ``config.DATA / "isolation_metadata.parquet"``.

    Raises:
        SystemExit: If the input directory does not exist, or if no file in
            it yields a row.
    """
    bacdive_dir = config.DATA / "bacdive"
    if not bacdive_dir.exists():
        raise SystemExit(f"Missing {bacdive_dir}")

    # Sort so that row order, and the one-hot column order derived from it,
    # does not depend on filesystem listing order.
    paths = sorted(bacdive_dir.glob("*.json"))
    print(f"Parsing {len(paths):,} BacDive JSON files...")

    rows: list[dict] = []
    for p in tqdm(paths, unit="file"):
        r = extract_one(p)
        if r:
            rows.append(r)
    if not rows:
        # Without this guard the percentage computation below would divide
        # by zero.
        raise SystemExit(f"No parseable BacDive records found in {bacdive_dir}")

    df = pd.DataFrame(rows)
    print(f"Parsed {len(df):,} rows")
    skipped = len(paths) - len(rows)
    if skipped:
        print(f"Skipped {skipped:,} files that could not be parsed")

    # Coverage report on the high-value fields
    for col in ["iso_country", "iso_continent", "iso_lat", "iso_lon",
                "iso_collection_year", "iso_host_species", "iso_host_kingdom"]:
        nn = df[col].notna().sum() if col in df.columns else 0
        print(f"  {col:30s} {nn:>6,} populated  ({100*nn/len(df):.1f}%)")

    df = add_categorical_onehots(df)

    out = config.DATA / "isolation_metadata.parquet"
    df.to_parquet(out, index=False)
    print(f"\nWrote {out}: {len(df):,} rows × {df.shape[1]} cols")

    # Sample
    if "iso_country" in df.columns:
        print("\nMost common countries (sanity check):")
        print(df["iso_country"].value_counts().head(10))


if __name__ == "__main__":
    main()