File size: 939 Bytes
0ed74db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
"""Build data/embeddings.parquet from data/embeddings.jsonl.

Use after Modal/local extraction finishes (or partway through, to checkpoint).
"""
from __future__ import annotations

import json
from pathlib import Path

import pandas as pd

from microbe_model import config


def main() -> None:
    """Convert ``embeddings.jsonl`` into ``embeddings.parquet`` under ``config.DATA``.

    Every non-blank input line is parsed as a JSON object that must provide
    the keys ``bacdive_id``, ``genome_accession`` and ``embedding``. The
    embedding is spread over one float column per component (``emb_0``,
    ``emb_1``, ...), so the output has two identifier columns followed by the
    embedding columns. A one-line summary is printed when the file is written.

    Raises:
        SystemExit: If the input file does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the expected keys.
    """
    src = config.DATA / "embeddings.jsonl"
    if not src.exists():
        raise SystemExit(f"Missing {src}")
    rows: list[dict] = []
    # Decode as UTF-8 explicitly rather than relying on the platform's locale
    # default, so the same file parses identically on every machine.
    with open(src, encoding="utf-8") as fh:
        for line in fh:
            # Blank lines (e.g. an empty line at the end of the file) carry no
            # record, and json.loads would raise on them; skip them.
            if not line.strip():
                continue
            r = json.loads(line)
            d = {"bacdive_id": r["bacdive_id"], "genome_accession": r["genome_accession"]}
            # One column per embedding component, coerced to float.
            d.update({f"emb_{i}": float(v) for i, v in enumerate(r["embedding"])})
            rows.append(d)
    df = pd.DataFrame(rows)
    out = config.DATA / "embeddings.parquet"
    df.to_parquet(out, index=False)
    print(f"Wrote {len(df):,} embeddings × {df.shape[1]} cols to {out}")


# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()