Spaces:
Running
Running
| """Build data/strain_media.parquet by walking the BacDive cache. | |
| No network calls — pure local extraction from cached records. | |
| Output: one row per (bacdive_id, medium_id) link. | |
| """ | |
| from __future__ import annotations | |
| import pandas as pd | |
| from tqdm import tqdm | |
| from microbe_model import config | |
| from microbe_model.data.mediadive import iter_bacdive_strain_media | |
def main() -> None:
    """Build ``strain_media.parquet`` from the locally cached BacDive records.

    Collects every row yielded by ``iter_bacdive_strain_media()`` into a
    DataFrame, writes it to ``config.DATA / "strain_media.parquet"`` and
    prints summary counts for the ``bacdive_id``, ``medium_id``,
    ``medium_name`` and ``growth`` columns.

    Raises:
        SystemExit: if no rows were produced. An empty DataFrame has no
            columns, so the summary below would fail with ``KeyError`` and
            the output file would carry no schema; nothing is written in
            that case.
    """
    n_files = sum(1 for _ in config.BACDIVE_DIR.glob("*.json"))
    print(f"Walking {n_files:,} BacDive cached records...")

    # No `total=` here: the iterator yields links, not files, so using the
    # file count as the total would make the percentage/ETA meaningless.
    rows = list(tqdm(iter_bacdive_strain_media(), unit="link"))
    if not rows:
        raise SystemExit(
            f"No strain↔medium links extracted from {n_files:,} cached "
            "records; not writing strain_media.parquet"
        )

    df = pd.DataFrame(rows)
    out = config.DATA / "strain_media.parquet"
    df.to_parquet(out, index=False)

    print(f"\nWrote {len(df):,} strain↔medium links to {out}")
    print(f" unique strains: {df['bacdive_id'].nunique():,}")
    print(f" unique media: {df['medium_id'].nunique():,}")
    for label in ("yes", "no", "weak"):
        print(f" growth={label}: {(df['growth'] == label).sum():,}")

    print("\nTop 10 most-used media:")
    usage = df.groupby(['medium_id', 'medium_name']).size()
    print(usage.sort_values(ascending=False).head(10))
# Run the build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()