soapbox / setup.py
Sadhanha Anand
SoapBox — AI Clinical Scribe Agent
268b40a
import os
import json
from datasets import load_dataset
def _note_metadata(row):
    """Build the ChromaDB metadata dict for one dataset row.

    ``row["summary"]`` may be a JSON string or an already-decoded mapping.
    Missing fields fall back to ``"Unknown"``; every value is stringified and
    the visit motivation is truncated to 200 characters.

    Raises:
        Whatever JSON decoding or the key/attribute lookups raise for a
        malformed row; the caller decides how to handle it.
    """
    summary = row["summary"]
    if isinstance(summary, str):
        summary = json.loads(summary)
    # `or {}` also covers an explicit null under "patient information",
    # which the default argument of .get() alone would not.
    patient_info = summary.get("patient information") or {}
    return {
        "idx": str(row["idx"]),
        "age": str(patient_info.get("age", "Unknown")),
        "sex": str(patient_info.get("sex", "Unknown")),
        "visit_motivation": str(summary.get("visit motivation", "Unknown"))[:200],
    }


def setup():
    """Seed the ChromaDB collection with clinical notes if it is empty.

    When the collection already holds documents, only a status line is
    printed. Otherwise the first 200 rows of the
    ``AGBonnet/augmented-clinical-notes`` train split are loaded and added
    one by one, using the row's ``idx`` as the document id and the full note
    text as the document.

    Loading is best-effort: a row that cannot be parsed or stored is skipped,
    but the skip is reported instead of being silently swallowed.
    """
    # Function-level import kept from the original; presumably it defers
    # vector-store initialisation until setup() actually runs -- confirm.
    from vector_store import collection

    existing = collection.count()
    if existing:
        print(f"✅ ChromaDB already has {existing} notes")
        return

    print("Loading dataset into ChromaDB...")
    dataset = load_dataset(
        "AGBonnet/augmented-clinical-notes",
        split="train[:200]",
    )
    df = dataset.to_pandas()

    loaded = 0
    skipped = 0
    for label, row in df.iterrows():
        try:
            metadata = _note_metadata(row)
            collection.add(
                documents=[row["full_note"]],
                metadatas=[metadata],
                ids=[metadata["idx"]],
            )
        except Exception as exc:
            # Deliberately broad: one bad row must not abort the whole load,
            # but the failure has to be visible to whoever runs the script.
            skipped += 1
            print(f"Skipping row {label}: {type(exc).__name__}: {exc}")
        else:
            loaded += 1

    print(f"✅ Loaded {loaded} notes into ChromaDB")
    if skipped:
        print(f"Skipped {skipped} rows that could not be parsed or stored")
# Run the one-off seeding step only when this file is executed as a script,
# not when it is imported by another module.
if __name__ == "__main__":
    setup()