File size: 1,308 Bytes
3d34be9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
"""Build data/strain_media.parquet by walking the BacDive cache.

No network calls — pure local extraction from cached records.
Output: one row per (bacdive_id, medium_id) link.
"""
from __future__ import annotations

import pandas as pd
from tqdm import tqdm

from microbe_model import config
from microbe_model.data.mediadive import iter_bacdive_strain_media


def main() -> None:
    """Build ``strain_media.parquet`` from the locally cached BacDive records.

    Collects every row yielded by ``iter_bacdive_strain_media()`` into a
    DataFrame, writes it to ``config.DATA / "strain_media.parquet"`` and prints
    a short summary: unique strains and media, counts per growth outcome, and
    the ten most frequently linked media.

    The rows are expected to provide the ``bacdive_id``, ``medium_id``,
    ``medium_name`` and ``growth`` columns used by the summary.

    Raises:
        SystemExit: if no rows were produced. Exiting before the write avoids
            replacing an existing output file with a column-less table and
            then failing with an opaque ``KeyError`` in the summary.
    """
    n_files = sum(1 for _ in config.BACDIVE_DIR.glob("*.json"))
    print(f"Walking {n_files:,} BacDive cached records...")

    # No ``total=`` here: the bar counts links, and the number of links is not
    # known up front. Using the file count as the total would give a
    # meaningless percentage/ETA.
    rows = list(tqdm(iter_bacdive_strain_media(), unit="link"))

    out = config.DATA / "strain_media.parquet"
    if not rows:
        raise SystemExit(
            f"No strain↔medium links extracted from the BacDive cache "
            f"({config.BACDIVE_DIR}); {out} was not written."
        )

    df = pd.DataFrame(rows)
    df.to_parquet(out, index=False)

    print(f"\nWrote {len(df):,} strain↔medium links to {out}")
    print(f"  unique strains:  {df['bacdive_id'].nunique():,}")
    print(f"  unique media:    {df['medium_id'].nunique():,}")

    # One pass over the column instead of one boolean mask per outcome.
    growth_counts = df["growth"].value_counts()
    for outcome in ("yes", "no", "weak"):
        label = f"growth={outcome}:"
        # Pad to 17 characters so values line up with the "unique ..." rows.
        print(f"  {label:<17}{growth_counts.get(outcome, 0):,}")

    print("\nTop 10 most-used media:")
    print(df.groupby(['medium_id', 'medium_name']).size().sort_values(ascending=False).head(10))


# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()