"""Seed the vector store with a slice of the augmented clinical notes dataset."""

import os
import json

from datasets import load_dataset

# Hugging Face dataset identifier and the slice of it that gets indexed.
_DATASET_NAME = "AGBonnet/augmented-clinical-notes"
_DATASET_SPLIT = "train[:200]"

# Upper bound (in characters) on the stored ``visit_motivation`` metadata.
_MAX_MOTIVATION_CHARS = 200


def _build_metadata(row):
    """Return the metadata dict stored alongside one note.

    Args:
        row: A mapping-like dataset row providing ``"summary"`` and ``"idx"``.
            ``"summary"`` may be a JSON-encoded string or an already-decoded
            mapping.

    Returns:
        A dict with string values for ``idx``, ``age``, ``sex`` and
        ``visit_motivation``; missing fields fall back to ``"Unknown"`` and
        ``visit_motivation`` is truncated to ``_MAX_MOTIVATION_CHARS``.

    Raises:
        json.JSONDecodeError: If ``"summary"`` is a string that is not valid
            JSON.
        AttributeError: If the summary (or its ``"patient information"``
            entry) does not support ``.get``.
        KeyError: If the row lacks ``"summary"`` or ``"idx"``.
    """
    summary = row["summary"]
    if isinstance(summary, str):
        summary = json.loads(summary)

    patient_info = summary.get("patient information", {})
    visit_motivation = str(summary.get("visit motivation", "Unknown"))

    return {
        "idx": str(row["idx"]),
        "age": str(patient_info.get("age", "Unknown")),
        "sex": str(patient_info.get("sex", "Unknown")),
        "visit_motivation": visit_motivation[:_MAX_MOTIVATION_CHARS],
    }


def setup():
    """Populate the vector store collection if it is currently empty.

    When ``collection.count()`` is non-zero nothing is loaded and the current
    count is printed. Otherwise the configured dataset slice is downloaded,
    and each row's ``full_note`` is added as a document with metadata derived
    from its summary, using the row's ``idx`` as the document id.

    Loading is best-effort: a row that cannot be parsed or stored is skipped,
    but each skip is reported (with the exception that caused it) instead of
    being dropped silently, and a total is printed at the end.
    """
    # Imported inside the function, as in the original; presumably to defer
    # store initialisation until setup actually runs -- TODO confirm.
    from vector_store import collection

    existing = collection.count()
    if existing != 0:
        print(f"✅ ChromaDB already has {existing} notes")
        return

    print("Loading dataset into ChromaDB...")
    dataset = load_dataset(_DATASET_NAME, split=_DATASET_SPLIT)
    df = dataset.to_pandas()

    success = 0
    skipped = 0
    for position, row in df.iterrows():
        try:
            metadata = _build_metadata(row)
            collection.add(
                documents=[row["full_note"]],
                metadatas=[metadata],
                ids=[metadata["idx"]],
            )
        except Exception as exc:
            # Keep going on a bad row, but make the failure visible so a low
            # success count can be diagnosed.
            skipped += 1
            print(f"⚠️ Skipped row {position}: {type(exc).__name__}: {exc}")
        else:
            success += 1

    print(f"✅ Loaded {success} notes into ChromaDB")
    if skipped:
        print(f"⚠️ Skipped {skipped} notes that could not be parsed or stored")


if __name__ == "__main__":
    setup()