ClinicianAssistant / scripts /prep_summaries.py
JDFPalladium
cleaning up organization of scripts and data and updating filepaths in app to processed data
389c5f0
import numpy as np
import pandas as pd
from llama_index.embeddings.openai import OpenAIEmbedding
import os
from dotenv import load_dotenv
# Pull settings from the local env file into the process environment.
load_dotenv("config.env")
# The original bare `os.environ.get(...)` discarded its result, so a missing key
# went unnoticed at this point. Fail fast with an actionable message instead.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to config.env or export it in the environment."
    )
# Load the guideline summaries; the "summary" column holds the text to embed.
df = pd.read_csv("data/raw/guidelines_summaries.csv")

# An empty table would otherwise surface as an obscure np.vstack error below.
if df.empty:
    raise ValueError("data/raw/guidelines_summaries.csv contains no rows to embed.")

# Blank cells are read by pandas as NaN (a float), which is not valid text to
# embed; report them up front rather than failing midway through the API calls.
missing_summaries = df["summary"].isna()
if missing_summaries.any():
    raise ValueError(
        f"{int(missing_summaries.sum())} row(s) in data/raw/guidelines_summaries.csv "
        "have no summary text; fill or drop them before embedding."
    )

# Embed summaries
embedding_model = OpenAIEmbedding()
# Use the batch call so the texts are sent in grouped requests instead of one
# request per row. Embeddings come back in input order, so row i of the matrix
# corresponds to row i of df.
summary_texts = df["summary"].tolist()
summary_embeddings = np.vstack(
    embedding_model.get_text_embedding_batch(summary_texts)
)
# Persist the embedding matrix alongside the table it was computed from, so the
# two files can be read back together.
output_dir = "data/processed/lp/summary_embeddings"
embeddings_path = "data/processed/lp/summary_embeddings/embeddings.npy"
index_path = "data/processed/lp/summary_embeddings/index.tsv"

os.makedirs(output_dir, exist_ok=True)
np.save(file=embeddings_path, arr=summary_embeddings)
# Tab-separated, without the pandas row index column.
df.to_csv(path_or_buf=index_path, sep="\t", index=False)
print("✅ Saved embeddings and index.")