"""Scan BacDive and write strain phenotype labels to data/bacdive_phenotypes.parquet. Uses the v2 public API (no auth). Discovers strain IDs by batch-scanning the integer ID range — missing IDs are silently dropped server-side, so the scan is complete in one pass over [start, end]. Usage: # Phase 1 smoke test — scan the first ~5K IDs (returns ~3-4K real records) uv run python scripts/01_fetch_bacdive.py --end 5000 # Full BacDive (~150K live records, ~30 min wall time) uv run python scripts/01_fetch_bacdive.py --end 200000 """ from __future__ import annotations import argparse import pandas as pd from tqdm import tqdm from microbe_model import config from microbe_model.data.bacdive import ( BATCH_SIZE, DEFAULT_MAX_ID, BacDiveClient, cache_record, extract_phenotypes, ) def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--start", type=int, default=1) parser.add_argument("--end", type=int, default=DEFAULT_MAX_ID) parser.add_argument("--no-cache", action="store_true", help="Skip writing per-strain JSON to disk (saves ~150K small files).") args = parser.parse_args() client = BacDiveClient() rows = [] n_batches = (args.end - args.start) // BATCH_SIZE + 1 with tqdm(total=n_batches, desc="BacDive batches", unit="batch") as bar: for bacdive_id, record in client.iter_records(start=args.start, end=args.end): if not args.no_cache: cache_record(bacdive_id, record) rows.append(extract_phenotypes(record)) # tqdm advances per batch — track via the integer ID if bacdive_id % BATCH_SIZE == 0: bar.update(1) bar.update(n_batches - bar.n) # finalize df = pd.DataFrame(rows) out = config.DATA / "bacdive_phenotypes.parquet" df.to_parquet(out, index=False) print(f"\nWrote {len(df)} strains to {out}") print("Coverage of prediction targets:") for col in ("optimal_temperature_c", "optimal_ph", "oxygen_requirement", "salt_tolerance_pct"): n = df[col].notna().sum() print(f" {col:30s} {n:>6d} / {len(df)} ({100 * n / max(1, len(df)):.1f}%)") n_genome = df["genome_accession"].notna().sum() print(f" genome_accession {n_genome:>6d} / {len(df)} ({100 * n_genome / max(1, len(df)):.1f}%)") n_both = df[df["genome_accession"].notna() & df["optimal_temperature_c"].notna()].shape[0] print(f"\n genome + T_opt (training-ready) {n_both:>4d} strains") if __name__ == "__main__": main()