# microbe-model / scripts / 08_extract_strain_media.py
# Miyu Horiuchi
# Phase E scaffolding: MediaDive integration + strain↔medium links
# 3d34be9
"""Build data/strain_media.parquet by walking the BacDive cache.
No network calls — pure local extraction from cached records.
Output: one row per (bacdive_id, medium_id) link.
"""
from __future__ import annotations
import pandas as pd
from tqdm import tqdm
from microbe_model import config
from microbe_model.data.mediadive import iter_bacdive_strain_media
def main() -> None:
    """Extract strain↔medium links from the BacDive cache into a parquet file.

    Collects every row yielded by ``iter_bacdive_strain_media()``, writes them
    to ``config.DATA / "strain_media.parquet"`` and prints a short summary:
    unique strains and media, counts per growth outcome, and the ten
    most-used media.

    Raises:
        SystemExit: if the cache yields no links. Nothing is written in that
            case, so an existing output file is not replaced by an empty,
            column-less one.
    """
    n_files = sum(1 for _ in config.BACDIVE_DIR.glob("*.json"))
    print(f"Walking {n_files:,} BacDive cached records...")

    # The bar counts links, not files, so the file count is not a valid
    # `total` for it (a record presumably contributes zero or several links).
    # Leaving `total` unset makes tqdm show a plain running count instead of
    # a misleading percentage/ETA.
    rows = list(tqdm(iter_bacdive_strain_media(), unit="link"))

    if not rows:
        # An empty DataFrame has no columns: the summary below would fail
        # with KeyError after a useless file had already been written.
        raise SystemExit(
            f"No strain-medium links found in {config.BACDIVE_DIR}; "
            "nothing written."
        )

    df = pd.DataFrame(rows)
    out = config.DATA / "strain_media.parquet"
    df.to_parquet(out, index=False)

    print(f"\nWrote {len(df):,} strain↔medium links to {out}")
    print(f" unique strains: {df['bacdive_id'].nunique():,}")
    print(f" unique media: {df['medium_id'].nunique():,}")
    for outcome in ("yes", "no", "weak"):
        print(f" growth={outcome}: {(df['growth'] == outcome).sum():,}")

    print("\nTop 10 most-used media:")
    print(df.groupby(['medium_id', 'medium_name']).size().sort_values(ascending=False).head(10))
# Script entry point: run the extraction only when this file is executed
# directly, so importing the module has no side effects.
if __name__ == "__main__":
    main()