Spaces:
Running
Running
| """Build data/embeddings.parquet from data/embeddings.jsonl. | |
| Use after Modal/local extraction finishes (or partway through, to checkpoint). | |
| """ | |
| from __future__ import annotations | |
| import json | |
| from pathlib import Path | |
| import pandas as pd | |
| from microbe_model import config | |
def main() -> None:
    """Convert ``embeddings.jsonl`` in ``config.DATA`` into ``embeddings.parquet``.

    Each non-blank input line is parsed as a JSON object with the keys
    ``bacdive_id``, ``genome_accession`` and ``embedding``. The embedding is
    flattened into one float column per component (``emb_0``, ``emb_1``, ...)
    and the resulting table is written without the DataFrame index.

    Blank lines are skipped. Because this script may be run while extraction
    is still appending to the JSONL file, a final record that fails to parse
    is treated as a partially written line and skipped with a notice; an
    unparseable record followed by further records is a hard error.

    Raises:
        SystemExit: if the input file is missing, or if a record other than
            the last one is not valid JSON.
    """
    src = config.DATA / "embeddings.jsonl"
    if not src.exists():
        raise SystemExit(f"Missing {src}")

    rows: list[dict] = []
    # A parse failure is held back until we know whether more records follow:
    # only a failure on the very last record is consistent with a writer that
    # was interrupted (or is still running) mid-line.
    pending: tuple[int, json.JSONDecodeError] | None = None
    with open(src, encoding="utf-8") as fh:
        for lineno, raw in enumerate(fh, start=1):
            line = raw.strip()
            if not line:
                continue
            if pending is not None:
                bad_lineno, err = pending
                raise SystemExit(
                    f"Malformed JSON on line {bad_lineno} of {src}: {err}"
                )
            try:
                r = json.loads(line)
            except json.JSONDecodeError as err:
                pending = (lineno, err)
                continue
            d = {
                "bacdive_id": r["bacdive_id"],
                "genome_accession": r["genome_accession"],
            }
            d.update({f"emb_{i}": float(v) for i, v in enumerate(r["embedding"])})
            rows.append(d)

    if pending is not None:
        print(f"Skipped incomplete final record (line {pending[0]}) in {src}")

    df = pd.DataFrame(rows)
    out = config.DATA / "embeddings.parquet"
    df.to_parquet(out, index=False)
    print(f"Wrote {len(df):,} embeddings × {df.shape[1]} cols to {out}")
# Script entry point: run the conversion only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()